diff options
author | Georgios Pinitas <georgios.pinitas@arm.com> | 2021-08-20 21:39:25 +0100 |
---|---|---|
committer | Georgios Pinitas <georgios.pinitas@arm.com> | 2021-08-25 16:23:15 +0000 |
commit | 7891a73ef36f4ad7b71069b3c57694f85bb79454 (patch) | |
tree | 5b08692989e28ce63de2937d8d92ea5176589dbe /src/cpu/kernels | |
parent | a46c9c98c2b1d70acc7c6eee00e2cdc2a1e209a6 (diff) | |
download | ComputeLibrary-7891a73ef36f4ad7b71069b3c57694f85bb79454.tar.gz |
Move CPU/GPU files from Core/Runtime to the respective backend folders
Legacy structure contained two libraries core/runtime with two backends
in each.
We reduce the core/runtime libraries to a single library thus merging
the backend files
Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Change-Id: I69545765fe7a730368105cdbd067d3135ec7a174
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6155
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/cpu/kernels')
154 files changed, 37006 insertions, 0 deletions
diff --git a/src/cpu/kernels/CpuActivationKernel.cpp b/src/cpu/kernels/CpuActivationKernel.cpp new file mode 100644 index 0000000000..8fa7e9525e --- /dev/null +++ b/src/cpu/kernels/CpuActivationKernel.cpp @@ -0,0 +1,260 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuActivationKernel.h" + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include "src/core/common/Registrars.h" +#include "src/cpu/kernels/activation/list.h" + +#include <array> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +struct ActivationSelectorData +{ + DataType dt; + const CPUInfo &ci; +}; + +using ActivationSelectorPtr = std::add_pointer<bool(const ActivationSelectorData &data)>::type; +using ActivationKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const ActivationLayerInfo &, const Window &)>::type; + +struct ActivationKernel +{ + const char *name; + const ActivationSelectorPtr is_selected; + ActivationKernelPtr ukernel; +}; + +static const ActivationKernel available_kernels[] = +{ +#if defined(ARM_COMPUTE_ENABLE_SVE) + { + "sve_fp16_activation", + [](const ActivationSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_sve(); }, + REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_activation) + }, + { + "sve_fp32_activation", + [](const ActivationSelectorData & data) { return data.dt == DataType::F32 && data.ci.has_sve(); }, + REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_activation) + }, +#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ +#if defined(ARM_COMPUTE_ENABLE_NEON) + { + "neon_fp16_activation", + [](const ActivationSelectorData & data) { return data.dt == DataType::F16; }, + REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_activation) + }, + { + "neon_fp32_activation", + [](const ActivationSelectorData & data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_activation) + }, +#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */ +#if defined(ARM_COMPUTE_ENABLE_SVE2) + { + "sve_qu8_activation", + [](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8 && data.ci.has_sve2(); }, + REGISTER_QASYMM8_SVE(arm_compute::cpu::qasymm8_sve_activation) + }, + { + "sve_qs8_activation", + [](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED && data.ci.has_sve2(); }, + REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::qasymm8_signed_sve_activation) + }, + { + "sve_qs16_activation", + [](const ActivationSelectorData & data) { return data.dt == DataType::QSYMM16 && data.ci.has_sve2(); }, + REGISTER_QSYMM16_SVE(arm_compute::cpu::qsymm16_sve_activation) + }, +#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ + { + "neon_qu8_activation", + [](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8; }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::qasymm8_neon_activation) + }, + { + "neon_qs8_activation", + [](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::qasymm8_signed_neon_activation) + }, + { + "neon_qs16_activation", + [](const ActivationSelectorData & data) { return data.dt == DataType::QSYMM16; }, + REGISTER_QSYMM16_NEON(arm_compute::cpu::qsymm16_neon_activation) + }, +}; + +const ActivationKernel *get_implementation(const ActivationSelectorData &data) +{ + for(const auto &uk : available_kernels) + { + if(uk.is_selected(data)) + { + return &uk; + } + } + return nullptr; +} + +/* Supported activation in the 8-bit integer domain */ +static const std::array<ActivationLayerInfo::ActivationFunction, 7> qasymm8_activations = +{ + ActivationLayerInfo::ActivationFunction::RELU, + ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, + ActivationLayerInfo::ActivationFunction::LOGISTIC, + ActivationLayerInfo::ActivationFunction::TANH, + ActivationLayerInfo::ActivationFunction::HARD_SWISH, + ActivationLayerInfo::ActivationFunction::LEAKY_RELU, +}; +/* Supported activation in the 16-bit integer domain */ +static const std::array<ActivationLayerInfo::ActivationFunction, 3> qsymm16_activations = +{ + ActivationLayerInfo::ActivationFunction::LOGISTIC, + ActivationLayerInfo::ActivationFunction::TANH, + ActivationLayerInfo::ActivationFunction::HARD_SWISH +}; + +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &activation_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::QSYMM16, DataType::F16, DataType::F32); + + const auto *uk = get_implementation(ActivationSelectorData{ src->data_type(), CPUInfo::get() }); + ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); + + const DataType data_type = src->data_type(); + const QuantizationInfo &oq_info = (dst != nullptr) ? dst->quantization_info() : src->quantization_info(); + const ActivationLayerInfo::ActivationFunction f_act = activation_info.activation(); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized_asymmetric(data_type) && (std::find(std::begin(qasymm8_activations), std::end(qasymm8_activations), f_act) == std::end(qasymm8_activations)), + "For QASYMM8 only hard swish, leaky relu, tanh, logistic, relu and lower/upper bounded relu are supported"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized_symmetric(data_type) && (std::find(std::begin(qsymm16_activations), std::end(qsymm16_activations), f_act) == std::end(qsymm16_activations)), + "For QSYMM16 only tanh and logistic are supported"); + ARM_COMPUTE_RETURN_ERROR_ON((data_type == DataType::QASYMM8 || data_type == DataType::QASYMM16) && (f_act == ActivationLayerInfo::ActivationFunction::TANH) + && (oq_info != QuantizationInfo(1.f / 128.f, 128))); + ARM_COMPUTE_RETURN_ERROR_ON((data_type == DataType::QASYMM8 || data_type == DataType::QASYMM16) && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + && (oq_info != QuantizationInfo(1.f / 256.f, 0))); + + ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 128.f, 0))); + ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 256.f, -128))); + + ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 32768.f, 0))); + ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 32768.f, 0))); + + // Checks performed when dst is configured + if((dst != nullptr) && (dst->total_size() != 0)) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + } + + return Status{}; +} + +std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src, ITensorInfo *dst) +{ + // Configure kernel window + Window win = calculate_max_window(*src, Steps()); + + if(dst != nullptr) + { + // dst auto inizialitation if not yet initialized + auto_init_if_empty(*dst, *src->clone()); + } + + return std::make_pair(Status{}, win); +} +} // namespace + +void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, ActivationLayerInfo activation_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, activation_info)); + + const auto uk = get_implementation(ActivationSelectorData{ src->data_type(), CPUInfo::get() }); + ARM_COMPUTE_ERROR_ON_NULLPTR(uk); + + _act_info = activation_info; + _run_method = uk->ukernel; + _name = std::string("CpuActivationKernel").append("/").append(uk->name); + + // Configure kernel window + auto win_config = validate_and_configure_window(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICPPKernel::configure(win_config.second); +} + +Status CpuActivationKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_UNUSED(act_info); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, act_info)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), (dst != nullptr) ? dst->clone().get() : nullptr).first); + + return Status{}; +} + +void CpuActivationKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + // Early exit on disabled activation + if(!_act_info.enabled()) + { + return; + } + + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + ARM_COMPUTE_ERROR_ON(tensors.empty()); + ARM_COMPUTE_ERROR_ON(_run_method == nullptr); + + const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC); + ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); + + _run_method(src, dst, _act_info, window); +} + +const char *CpuActivationKernel::name() const +{ + return _name.c_str(); +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuActivationKernel.h b/src/cpu/kernels/CpuActivationKernel.h new file mode 100644 index 0000000000..43c266529f --- /dev/null +++ b/src/cpu/kernels/CpuActivationKernel.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_ACTIVATION_KERNEL_H +#define ARM_COMPUTE_CPU_ACTIVATION_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Interface for the activation kernel */ +class CpuActivationKernel : public ICpuKernel +{ +public: + CpuActivationKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuActivationKernel); + /** Configure kernel for a given list of arguments + * + * @note If the output tensor is a nullptr, the activation function will be performed in-place + * + * @param[in, out] src Source tensor info. In case of @p dst tensor = nullptr, this tensor will store the result + * of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32. + * @param[out] dst Destination tensor info. Data type supported: same as @p src + * @param[in] activation_info Activation layer information. + */ + void configure(const ITensorInfo *src, ITensorInfo *dst, ActivationLayerInfo activation_info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuActivationKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + using ActivationKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const ActivationLayerInfo &, const Window &)>::type; + +private: + ActivationLayerInfo _act_info{}; + ActivationKernelPtr _run_method{ nullptr }; + std::string _name{}; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_ACTIVATION_KERNEL_H */ diff --git a/src/cpu/kernels/CpuAddKernel.cpp b/src/cpu/kernels/CpuAddKernel.cpp new file mode 100644 index 0000000000..07c9a65e55 --- /dev/null +++ b/src/cpu/kernels/CpuAddKernel.cpp @@ -0,0 +1,296 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuAddKernel.h" + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "src/core/CPP/Validate.h" +#include "src/core/common/Registrars.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/cpu/kernels/add/neon/list.h" +#include "src/cpu/kernels/add/sve/list.h" + +#include <array> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +struct AddSelectorData +{ + DataType dt; + const CPUInfo &ci; +}; + +using AddSelectorPtr = std::add_pointer<bool(const AddSelectorData &data)>::type; +using AddKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, ITensor *, const ConvertPolicy &, const Window &)>::type; +struct AddKernel +{ + const char *name; + const AddSelectorPtr is_selected; + AddKernelPtr ukernel; +}; + +static const AddKernel available_kernels[] = +{ +#if defined(ARM_COMPUTE_ENABLE_SVE2) + { + "sve2_qu8_add", + [](const AddSelectorData & data) + { + return (data.dt == DataType::QASYMM8) && data.ci.has_sve(); + }, + REGISTER_QASYMM8_SVE(arm_compute::cpu::add_qasymm8_sve) + }, + { + "sve2_qs8_add", + [](const AddSelectorData & data) + { + return (data.dt == DataType::QASYMM8_SIGNED) && data.ci.has_sve(); + }, + REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::add_qasymm8_signed_sve) + }, + { + "sve2_qs16_add", + [](const AddSelectorData & data) + { + return (data.dt == DataType::QSYMM16) && data.ci.has_sve(); + }, + REGISTER_QSYMM16_SVE(arm_compute::cpu::add_qsymm16_sve) + }, +#endif /* !defined(ARM_COMPUTE_ENABLE_SVE2) */ +#if defined(ARM_COMPUTE_ENABLE_SVE) + { + "sve_fp32_add", + [](const AddSelectorData & data) + { + return (data.dt == DataType::F32) && data.ci.has_sve(); + }, + REGISTER_FP32_SVE(arm_compute::cpu::add_same_sve<float>) + }, + { + "sve_fp16_add", + [](const AddSelectorData & data) + { + return (data.dt == DataType::F16) && data.ci.has_sve(); + }, + REGISTER_FP16_SVE(arm_compute::cpu::add_same_sve<float16_t>) + }, + { + "sve_u8_add", + [](const AddSelectorData & data) + { + return (data.dt == DataType::U8) && data.ci.has_sve(); + }, + REGISTER_INTEGER_SVE(arm_compute::cpu::add_same_sve<uint8_t>) + }, + { + "sve_s16_add", + [](const AddSelectorData & data) + { + return (data.dt == DataType::S16) && data.ci.has_sve(); + }, + REGISTER_INTEGER_SVE(arm_compute::cpu::add_same_sve<int16_t>) + }, + { + "sve_s32_add", + [](const AddSelectorData & data) + { + return (data.dt == DataType::S32) && data.ci.has_sve(); + }, + REGISTER_INTEGER_SVE(arm_compute::cpu::add_same_sve<int32_t>) + }, +#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ +#if defined(ARM_COMPUTE_ENABLE_NEON) + { + "neon_fp32_add", + [](const AddSelectorData & data) { return (data.dt == DataType::F32); }, + REGISTER_FP32_NEON(arm_compute::cpu::add_same_neon<float>) + }, +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + { + "neon_fp16_add", + [](const AddSelectorData & data) + { + return (data.dt == DataType::F16) && data.ci.has_fp16(); + }, + REGISTER_FP16_NEON(arm_compute::cpu::add_same_neon<float16_t>) + }, +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ + { + "neon_u8_add", + [](const AddSelectorData & data) { return (data.dt == DataType::U8); }, + REGISTER_INTEGER_NEON(arm_compute::cpu::add_same_neon<uint8_t>) + }, + { + "neon_s16_add", + [](const AddSelectorData & data) { return (data.dt == DataType::S16); }, + REGISTER_INTEGER_NEON(arm_compute::cpu::add_same_neon<int16_t>) + }, + { + "neon_s32_add", + [](const AddSelectorData & data) { return (data.dt == DataType::S32); }, + REGISTER_INTEGER_NEON(arm_compute::cpu::add_same_neon<int32_t>) + }, +#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */ +#if defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE) + { + "neon_qu8_add", + [](const AddSelectorData & data) { return (data.dt == DataType::QASYMM8); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::add_qasymm8_neon) + }, + { + "neon_qs8_add", + [](const AddSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::add_qasymm8_signed_neon) + }, + { + "neon_qs16_add", + [](const AddSelectorData & data) { return (data.dt == DataType::QSYMM16); }, + REGISTER_QSYMM16_NEON(arm_compute::cpu::add_qsymm16_neon) + }, +#endif /* defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE) */ +}; + +/** Micro-kernel selector + * + * @param[in] data Selection data passed to help pick the appropriate micro-kernel + * + * @return A matching micro-kernel else nullptr + */ +const AddKernel *get_implementation(const CPUInfo &cpuinfo, DataType dt) +{ + for(const auto &uk : available_kernels) + { + if(uk.is_selected({ dt, cpuinfo })) + { + return &uk; + } + } + return nullptr; +} + +Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst, ConvertPolicy policy) +{ + ARM_COMPUTE_UNUSED(policy); + + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src0); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::S16, DataType::QSYMM16, DataType::F16, + DataType::S32, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &src1); + + const TensorShape out_shape = TensorShape::broadcast_shape(src0.tensor_shape(), src1.tensor_shape()); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((src0.tensor_shape().x() != src1.tensor_shape().x()) && ((src0.data_type() != src1.data_type()) || (src0.data_type() != dst.data_type()) + || (src1.data_type() != dst.data_type())), + "Broadcasting across width is supported on configurations where all tensors have the same data type"); + + // Validate in case of configured dst + if(dst.total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &dst); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0), + "Wrong shape for dst"); + } + + const auto *uk = get_implementation(CPUInfo::get(), src0.data_type()); + ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); + + return Status{}; +} + +std::pair<Status, Window> validate_and_configure_window(const ITensorInfo &src0, const ITensorInfo &src1, ITensorInfo &dst) +{ + const TensorShape &out_shape = TensorShape::broadcast_shape(src0.tensor_shape(), src1.tensor_shape()); + + // Auto initialize dst if not initialized + set_shape_if_empty(dst, out_shape); + set_data_type_if_unknown(dst, src0.data_type()); + + Window win = calculate_max_window(out_shape, Steps()); + + // CpuAddKernel doesn't need padding so update_window_and_padding() can be skipped + return std::make_pair(Status{}, win); +} +} // namespace + +void CpuAddKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst, policy)); + + const auto uk = get_implementation(CPUInfo::get(), src0->data_type()); + ARM_COMPUTE_ERROR_ON_NULLPTR(uk); + + _policy = policy; + _run_method = uk->ukernel; + _name = std::string("CpuAddKernel").append("/").append(uk->name); + + // Configure kernel window + auto win_config = validate_and_configure_window(*src0, *src1, *dst); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICpuKernel::configure(win_config.second); +} + +Status CpuAddKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); + + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst, policy)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(*src0->clone(), *src1->clone(), *dst->clone()).first); + + return Status{}; +} + +void CpuAddKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + ARM_COMPUTE_ERROR_ON(tensors.empty()); + ARM_COMPUTE_ERROR_ON(_run_method == nullptr); + + const ITensor *src0 = tensors.get_const_tensor(TensorType::ACL_SRC_0); + const ITensor *src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1); + ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); + + _run_method(src0, src1, dst, _policy, window); +} + +const char *CpuAddKernel::name() const +{ + return _name.c_str(); +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuAddKernel.h b/src/cpu/kernels/CpuAddKernel.h new file mode 100644 index 0000000000..11c0f67132 --- /dev/null +++ b/src/cpu/kernels/CpuAddKernel.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2016-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_ADD_KERNEL_H +#define ARM_COMPUTE_CPU_ADD_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Interface for the kernel to perform addition between two tensors */ +class CpuAddKernel : public ICpuKernel +{ +public: + CpuAddKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuAddKernel); + /** Initialise the kernel's input, dst and border mode. + * + * Valid configurations (src0,src1) -> dst : + * + * - (U8,U8) -> U8 + * - (S16,S16) -> S16 + * - (S32,S32) -> S32 + * - (F16,F16) -> F16 + * - (F32,F32) -> F32 + * - (QASYMM8,QASYMM8) -> QASYMM8 + * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED + * - (QSYMM16,QSYMM16) -> QSYMM16 + * + * @param[in] src0 First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32 + * @param[in] src1 Second input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32 + * @param[out] dst The dst tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32. + * @param[in] policy Overflow policy. + */ + void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuAddKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + using AddKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, ITensor *, const ConvertPolicy &, const Window &)>::type; + +private: + ConvertPolicy _policy{}; + AddKernelPtr _run_method{ nullptr }; + std::string _name{}; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_ADD_KERNEL_H */ diff --git a/src/cpu/kernels/CpuCastKernel.cpp b/src/cpu/kernels/CpuCastKernel.cpp new file mode 100644 index 0000000000..db76df9076 --- /dev/null +++ b/src/cpu/kernels/CpuCastKernel.cpp @@ -0,0 +1,1367 @@ +/* + * Copyright (c) 2016-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuCastKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/SaturateCast.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy) +{ + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(dst); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(dst); + ARM_COMPUTE_UNUSED(policy); + ARM_COMPUTE_RETURN_ERROR_ON(src == dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8, + DataType::S16, DataType::U16, DataType::BFLOAT16, DataType::F16, + DataType::F32, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8, + DataType::S16, DataType::U16, DataType::BFLOAT16, DataType::F16, + DataType::U32, DataType::S32, DataType::F32); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8_SIGNED && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::S32 + && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32), + "Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8 && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16 + && dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32), + "Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U8 && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16 + && dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32), + "Only data_types supported [in] U8 -> [out] U16, S16, S32, F16, F32"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U16 && (dst->data_type() != DataType::U8 && dst->data_type() != DataType::U32), + "Only data_types supported [in] U16 -> [out] U8, U32"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S16 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::U8 && dst->data_type() != DataType::S32), + "Only data_types supported [in] S16 -> [out] U8, S32"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::BFLOAT16 && dst->data_type() != DataType::F32, + "Only data_types supported [in] BFLOAT16 -> [out] F32"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F16 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8 + && dst->data_type() != DataType::U8 + && dst->data_type() != DataType::F32 && dst->data_type() != DataType::S32), + "Only data_types supported [in] F16 -> [out] QASYMM8, F32, S32, U8"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F32 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8 + && dst->data_type() != DataType::F16 && dst->data_type() != DataType::BFLOAT16 + && dst->data_type() != DataType::S32 && dst->data_type() != DataType::U8), + "Only data_types supported [in] F32 -> [out] QASYMM8, BFLOAT16, F16, S32, U8"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S32 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8 + && dst->data_type() != DataType::F16 + && dst->data_type() != DataType::F32 && dst->data_type() != DataType::U8), + "Only data_types supported [in] S32 -> [out] QASYMM8, F16, F32, U8"); + + // Validate in case of configured dst + if(dst->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); + } + + return Status{}; +} +} // namespace + +void CpuCastKernel::configure(const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + + // Auto initialize dst shape if not initialized (We can only auto-configure the shape, datatype must be given) + set_shape_if_empty(*dst, src->tensor_shape()); + + _policy = policy; + + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, policy)); + + // Configure kernel window + Window win = calculate_max_window(*src, Steps()); + + ICPPKernel::configure(win); +} + +Status CpuCastKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, policy)); + return Status{}; +} + +void CpuCastKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const int window_step_x = 16; + + const ITensor *_src = tensors.get_const_tensor(TensorType::ACL_SRC); + ITensor *_dst = tensors.get_tensor(TensorType::ACL_DST); + ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); + ARM_COMPUTE_ERROR_ON(_src == _dst); + + ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst); + + Window win{ window }; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator src(_src, win); + Iterator dst(_dst, win); + + switch(_src->info()->data_type()) + { + case DataType::QASYMM8_SIGNED: + { + switch(_dst->info()->data_type()) + { + case DataType::S16: + { + /* Up-conversion QASYMM8_SIGNED -> S16 */ + execute_window_loop(win, [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<int16_t *>(dst.ptr()); + int x = window_start_x; + + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int8x16_t texels_s8 = vld1q_s8(src_ptr + x); + + const int16x8x2_t texels = + { + { + vmovl_s8(vget_low_s8(texels_s8)), + vmovl_s8(vget_high_s8(texels_s8)) + } + }; + + vst1q_s16(dst_ptr + x, texels.val[0]); + vst1q_s16(dst_ptr + x + 8, texels.val[1]); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<int16_t>(*(src_ptr + x)); + } + }, + src, dst); + break; + } + case DataType::S32: + { + /* Up-conversion QASYMM8_SIGNED -> S32 */ + execute_window_loop(win, [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr()); + int x = window_start_x; + + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int8x16_t texels_s8 = vld1q_s8(src_ptr + x); + + const int16x8x2_t texels = + { + { + vmovl_s8(vget_low_s8(texels_s8)), + vmovl_s8(vget_high_s8(texels_s8)) + } + }; + + vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0]))); + vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0]))); + vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1]))); + vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1]))); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x)); + } + }, + src, dst); + break; + } + case DataType::F32: + { + /* Up-conversion QASYMM8_SIGNED -> F32 */ + execute_window_loop(win, [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<float *>(dst.ptr()); + + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int8x16_t texels_s8 = vld1q_s8(reinterpret_cast<int8_t *>(src.ptr())); + + const int16x8x2_t texels = + { + { + vmovl_s8(vget_low_s8(texels_s8)), + vmovl_s8(vget_high_s8(texels_s8)) + } + }; + vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0])))); + vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0])))); + vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1])))); + vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1])))); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<float>(*(src_ptr + x)); + } + }, + src, dst); + break; + } +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + { + /* Up-conversion QASYMM8_SIGNED -> F16 */ + execute_window_loop(win, [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr()); + int x = window_start_x; + + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int8x16_t texels_s8 = vld1q_s8(src_ptr + x); + + const int16x8x2_t texels = + { + { + vmovl_s8(vget_low_s8(texels_s8)), + vmovl_s8(vget_high_s8(texels_s8)) + } + }; + vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0])); + vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1])); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x)); + } + }, + src, dst); + break; + } +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + + default: + ARM_COMPUTE_ERROR("dst data type not supported"); + } + break; + } + + case DataType::QASYMM8: + case DataType::U8: + { + switch(_dst->info()->data_type()) + { + case DataType::S16: + { + /* Up-conversion U8 -> S16 */ + execute_window_loop(win, [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<int16_t *>(dst.ptr()); + + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); + + const int16x8x2_t texels = + { + { + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))) + } + }; + + vst1q_s16(dst_ptr + x, texels.val[0]); + vst1q_s16(dst_ptr + x + 8, texels.val[1]); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x)); + } + }, + src, dst); + break; + } + case DataType::S32: + { + /* Up-conversion U8 -> S32 */ + execute_window_loop(win, [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr()); + + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); + + const int16x8x2_t texels = + { + { + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))) + } + }; + + vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0]))); + vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0]))); + vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1]))); + vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1]))); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x)); + } + }, + src, dst); + break; + } + case DataType::F32: + { + /* Up-conversion U8 -> F32 */ + execute_window_loop(win, [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<float *>(dst.ptr()); + + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); + + const int16x8x2_t texels = + { + { + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))) + } + }; + vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0])))); + vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0])))); + vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1])))); + vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1])))); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x)); + } + }, + src, dst); + break; + } +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + { + /* Up-conversion U8 -> F16 */ + execute_window_loop(win, [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr()); + + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); + + const int16x8x2_t texels = + { + { + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))) + } + }; + vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0])); + vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1])); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x)); + } + }, + src, dst); + break; + } +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::U16: + { + /* Up-conversion U8 -> U16 */ + execute_window_loop(win, [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<uint16_t *>(dst.ptr()); + + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x); + + const uint16x8x2_t texels = + { + { + vmovl_u8(vget_low_u8(texels_u8)), + vmovl_u8(vget_high_u8(texels_u8)) + } + }; + + vst1q_u16(dst_ptr + x, texels.val[0]); + vst1q_u16(dst_ptr + x + 8, texels.val[1]); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<uint16_t>(*(src_ptr + x)); + } + }, + src, dst); + break; + } + default: + ARM_COMPUTE_ERROR("dst data type not supported"); + } + break; + } + case DataType::S16: + { + switch(_dst->info()->data_type()) + { + case DataType::QASYMM8_SIGNED: + { + /* Down-conversion S16 -> QASYMM8_SIGNED */ + if(ConvertPolicy::SATURATE == _policy) + { + execute_window_loop(win, [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr()); + + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int16x8x2_t texels = + { + { + vld1q_s16(src_ptr + x), + vld1q_s16(src_ptr + x + 8) + } + }; + + vst1q_s8(dst_ptr + x, vcombine_s8(vqmovn_s16(texels.val[0]), vqmovn_s16(texels.val[1]))); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x)); + } + }, + src, dst); + } + else + { + execute_window_loop(win, [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr()); + + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int16x8x2_t texels = + { + { + vld1q_s16(src_ptr + x), + vld1q_s16(src_ptr + x + 8) + } + }; + + vst1q_s8(dst_ptr + x, vcombine_s8(vmovn_s16(texels.val[0]), vmovn_s16(texels.val[1]))); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<int8_t>(*(src_ptr + x)); + } + }, + src, dst); + } + break; + } + case DataType::U8: + { + /* Down-conversion S16 -> U8 */ + if(ConvertPolicy::SATURATE == _policy) + { + execute_window_loop(win, [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); + + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int16x8x2_t texels = + { + { + vld1q_s16(src_ptr + x), + vld1q_s16(src_ptr + x + 8) + } + }; + + vst1q_u8(dst_ptr + x, vcombine_u8(vqmovun_s16(texels.val[0]), vqmovun_s16(texels.val[1]))); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x)); + } + }, + src, dst); + } + else + { + execute_window_loop(win, [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); + + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int16x8x2_t texels = + { + { + vld1q_s16(src_ptr + x), + vld1q_s16(src_ptr + x + 8) + } + }; + + vst1q_u8(dst_ptr + x, vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(texels.val[0])), + vmovn_u16(vreinterpretq_u16_s16(texels.val[1])))); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x)); + } + }, + src, dst); + } + break; + } + case DataType::S32: + { + /* Up-conversion S16 -> S32 */ + execute_window_loop(win, [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr()); + + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int16x8x2_t texels = + { + { + vld1q_s16(src_ptr + x), + vld1q_s16(src_ptr + x + 8) + } + }; + + const int32x4x4_t texels_s32 = + { + { + vmovl_s16(vget_low_s16(texels.val[0])), + vmovl_s16(vget_high_s16(texels.val[0])), + vmovl_s16(vget_low_s16(texels.val[1])), + vmovl_s16(vget_high_s16(texels.val[1])) + } + }; + + vst1q_s32(dst_ptr + x, texels_s32.val[0]); + vst1q_s32(dst_ptr + x + 4, texels_s32.val[1]); + vst1q_s32(dst_ptr + x + 8, texels_s32.val[2]); + vst1q_s32(dst_ptr + x + 12, texels_s32.val[3]); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x)); + } + }, + src, dst); + break; + } + default: + ARM_COMPUTE_ERROR("dst data type not supported"); + } + break; + } + case DataType::U16: + { + switch(_dst->info()->data_type()) + { + case DataType::U8: + { + /* Down-conversion U16 -> U8 */ + if(ConvertPolicy::SATURATE == _policy) + { + execute_window_loop(win, [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); + + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint16x8x2_t texels = + { + { + vld1q_u16(src_ptr + x), + vld1q_u16(src_ptr + x + 8) + } + }; + + vst1q_u8(dst_ptr + x, vcombine_u8(vqmovn_u16(texels.val[0]), vqmovn_u16(texels.val[1]))); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x)); + } + }, + src, dst); + } + else + { + execute_window_loop(win, [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); + + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint16x8x2_t texels = + { + { + vld1q_u16(src_ptr + x), + vld1q_u16(src_ptr + x + 8) + } + }; + + vst1q_u8(dst_ptr + x, vcombine_u8(vmovn_u16(texels.val[0]), vmovn_u16(texels.val[1]))); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x)); + } + + }, + src, dst); + } + break; + } + case DataType::U32: + { + /* Up-conversion U16 -> U32 */ + execute_window_loop(win, [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<uint32_t *>(dst.ptr()); + + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint16x8x2_t texels = + { + { + vld1q_u16(src_ptr + x), + vld1q_u16(src_ptr + x + 8) + } + }; + + vst1q_u32(dst_ptr + x, vmovl_u16(vget_low_u16(texels.val[0]))); + vst1q_u32(dst_ptr + x + 4, vmovl_u16(vget_high_u16(texels.val[0]))); + vst1q_u32(dst_ptr + x + 8, vmovl_u16(vget_low_u16(texels.val[1]))); + vst1q_u32(dst_ptr + x + 12, vmovl_u16(vget_high_u16(texels.val[1]))); + } + // Compute left-over elements + for(; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x)); + } + + }, + src, dst); + break; + } + default: + ARM_COMPUTE_ERROR("dst data type not supported"); + } + break; + } +#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) + case DataType::BFLOAT16: + switch(_dst->info()->data_type()) + { + case DataType::F32: + { + /* Up-conversion BFLOAT16 -> F32 */ + execute_window_loop(win, [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const bfloat16 *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<float *>(dst.ptr()); + + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint16x8x2_t texels = + { + { + vld1q_u16(reinterpret_cast<uint16_t *>(src.ptr())), + vld1q_u16(reinterpret_cast<uint16_t *>(src.ptr()) + 8) + } + }; + + vst1q_f32(reinterpret_cast<float *>(dst.ptr()), + vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_low_u16(texels.val[0])), 16))); + vst1q_f32(reinterpret_cast<float *>(dst.ptr()) + 4, + vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_high_u16(texels.val[0])), 16))); + vst1q_f32(reinterpret_cast<float *>(dst.ptr()) + 8, + vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_low_u16(texels.val[1])), 16))); + vst1q_f32(reinterpret_cast<float *>(dst.ptr()) + 12, + vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_high_u16(texels.val[1])), 16))); + } + + for(; x < window_end_x; ++x) + { + *(dst_ptr + x) = float(*(src_ptr + x)); + } + }, + src, dst); + break; + } + default: + ARM_COMPUTE_ERROR("dst data type unsupported"); + } + break; +#endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */ +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + switch(_dst->info()->data_type()) + { + case DataType::QASYMM8_SIGNED: + { + /* Down-conversion F16 -> QASYMM8_SIGNED (Always saturating) */ + execute_window_loop(win, [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr()); + + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float16x8x2_t texels = + { + { + vld1q_f16(src_ptr + x), + vld1q_f16(src_ptr + x + 8), + } + }; + + vst1q_s8(dst_ptr + x, vcombine_s8(vqmovn_s16(vcvtq_s16_f16(texels.val[0])), vqmovn_s16(vcvtq_s16_f16(texels.val[1])))); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x)); + } + }, + src, dst); + break; + } + case DataType::QASYMM8: + case DataType::U8: + { + /* Down-conversion F16 -> QASYMM8/U8 (Always saturating) */ + execute_window_loop(win, [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); + + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float16x8x2_t texels = + { + { + vld1q_f16(src_ptr + x), + vld1q_f16(src_ptr + x + 8), + } + }; + + vst1q_u8(dst_ptr + x, vcombine_u8(vqmovun_s16(vcvtq_s16_f16(texels.val[0])), vqmovun_s16(vcvtq_s16_f16(texels.val[1])))); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x)); + } + + }, + src, dst); + break; + } + case DataType::F32: + { + /* Up-conversion F16 -> F32 */ + execute_window_loop(win, [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<float *>(dst.ptr()); + + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float16x8x2_t texels = + { + { + vld1q_f16(src_ptr + x), + vld1q_f16(src_ptr + x + 8) + } + }; + vst1q_f32(dst_ptr + x, vcvt_f32_f16(vget_low_f16(texels.val[0]))); + vst1q_f32(dst_ptr + x + 4, vcvt_f32_f16(vget_high_f16(texels.val[0]))); + vst1q_f32(dst_ptr + x + 8, vcvt_f32_f16(vget_low_f16(texels.val[1]))); + vst1q_f32(dst_ptr + x + 12, vcvt_f32_f16(vget_high_f16(texels.val[1]))); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<float>(*(src_ptr + x)); + } + }, + src, dst); + break; + } + case DataType::S32: + { + /* Up-conversion F16 -> S32 */ + execute_window_loop(win, [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr()); + + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float16x8x2_t texels = + { + { + vld1q_f16(src_ptr + x), + vld1q_f16(src_ptr + x + 8) + } + }; + + vst1q_s32(dst_ptr + x, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[0])))); + vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[0])))); + vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[1])))); + vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[1])))); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x)); + } + }, + src, dst); + break; + } + default: + ARM_COMPUTE_ERROR("dst data type not supported"); + } + break; +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + case DataType::F32: + switch(_dst->info()->data_type()) + { +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + { + /* Down-conversion F32 -> F16 */ + execute_window_loop(win, [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const float *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr()); + + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float32x4x4_t texels = + { + { + vld1q_f32(src_ptr + x), + vld1q_f32(src_ptr + x + 4), + vld1q_f32(src_ptr + x + 8), + vld1q_f32(src_ptr + x + 12) + } + }; + + vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1]))); + vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3]))); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x)); + } + }, + src, dst); + break; + } +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ +#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) + case DataType::BFLOAT16: + { + /* Down-conversion F32 -> BFLOAT16 */ + execute_window_loop(win, [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const float *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<bfloat16 *>(dst.ptr()); + + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + wrapper::vcvt_bf16_f32(reinterpret_cast<float *>(src.ptr()), + reinterpret_cast<uint16_t *>(dst.ptr())); + wrapper::vcvt_bf16_f32(reinterpret_cast<float *>(src.ptr()) + 8, + reinterpret_cast<uint16_t *>(dst.ptr()) + 8); + } + + for(; x < window_end_x; ++x) + { + *(dst_ptr + x) = *(src_ptr + x); + } + }, + src, dst); + break; + } +#endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */ + case DataType::S32: + { + /* Conversion F32 -> S32 */ + execute_window_loop(win, [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const float *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr()); + + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float32x4x4_t texels = + { + { + vld1q_f32(src_ptr + x), + vld1q_f32(src_ptr + x + 4), + vld1q_f32(src_ptr + x + 8), + vld1q_f32(src_ptr + x + 12), + } + }; + + vst1q_s32(dst_ptr + x, vcvtq_s32_f32(texels.val[0])); + vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(texels.val[1])); + vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(texels.val[2])); + vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(texels.val[3])); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x)); + } + }, + src, dst); + break; + } + case DataType::QASYMM8: + case DataType::U8: + { + /* Down-conversion F32 -> U8 */ + execute_window_loop(win, [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const float *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); + + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float32x4x4_t texels = + { + { + vld1q_f32(src_ptr + x), + vld1q_f32(src_ptr + x + 4), + vld1q_f32(src_ptr + x + 8), + vld1q_f32(src_ptr + x + 12), + } + }; + + vst1_u8(dst_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[0])), vqmovun_s32(vcvtq_s32_f32(texels.val[1]))))); + vst1_u8(dst_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[2])), vqmovun_s32(vcvtq_s32_f32(texels.val[3]))))); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x)); + } + }, + src, dst); + break; + } + case DataType::QASYMM8_SIGNED: + { + /* Down-conversion F32 -> QASYMM8_SIGNED */ + execute_window_loop(win, [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const float *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr()); + + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float32x4x4_t texels = + { + { + vld1q_f32(src_ptr + x), + vld1q_f32(src_ptr + x + 4), + vld1q_f32(src_ptr + x + 8), + vld1q_f32(src_ptr + x + 12), + } + }; + + vst1_s8(dst_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[0])), vqmovn_s32(vcvtq_s32_f32(texels.val[1]))))); + vst1_s8(dst_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[2])), vqmovn_s32(vcvtq_s32_f32(texels.val[3]))))); + } + // Compute left-over elements + for(; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x)); + } + }, + src, dst); + break; + } + + default: + ARM_COMPUTE_ERROR("dst data type not supported"); + } + break; + + case DataType::S32: + switch(_dst->info()->data_type()) + { +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + { + /* Down-conversion S32 -> F16 */ + execute_window_loop(win, [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr()); + + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float32x4x4_t texels = + { + { + vcvtq_f32_s32(vld1q_s32(src_ptr + x)), + vcvtq_f32_s32(vld1q_s32(src_ptr + x + 4)), + vcvtq_f32_s32(vld1q_s32(src_ptr + x + 8)), + vcvtq_f32_s32(vld1q_s32(src_ptr + x + 12)) + } + }; + + vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1]))); + vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3]))); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x)); + } + }, + src, dst); + break; + } +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + case DataType::F32: + { + /* Conversion S32 -> F32 */ + execute_window_loop(win, [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<float *>(dst.ptr()); + + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int32x4x4_t texels = + { + { + vld1q_s32(src_ptr + x), + vld1q_s32(src_ptr + x + 4), + vld1q_s32(src_ptr + x + 8), + vld1q_s32(src_ptr + x + 12), + } + }; + + vst1q_f32(dst_ptr + x, vcvtq_f32_s32(texels.val[0])); + vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(texels.val[1])); + vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(texels.val[2])); + vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(texels.val[3])); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<float>(*(src_ptr + x)); + } + }, + src, dst); + break; + } + case DataType::QASYMM8_SIGNED: + { + /* Down-conversion S32 -> QASYMM8_SIGNED */ + if(ConvertPolicy::SATURATE == _policy) + { + execute_window_loop(win, [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr()); + + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int32x4x4_t texels = + { + { + vld1q_s32(src_ptr + x), + vld1q_s32(src_ptr + x + 4), + vld1q_s32(src_ptr + x + 8), + vld1q_s32(src_ptr + x + 12), + } + }; + vst1_s8(dst_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[0]), vqmovn_s32(texels.val[1])))); + vst1_s8(dst_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[2]), vqmovn_s32(texels.val[3])))); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x)); + } + }, + src, dst); + } + else + { + execute_window_loop(win, [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr()); + + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int32x4x4_t texels = + { + { + vld1q_s32(src_ptr + x), + vld1q_s32(src_ptr + x + 4), + vld1q_s32(src_ptr + x + 8), + vld1q_s32(src_ptr + x + 12) + } + }; + + vst1_s8(dst_ptr + x, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[0]), vmovn_s32(texels.val[1])))); + vst1_s8(dst_ptr + x + 8, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[2]), vmovn_s32(texels.val[3])))); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<int8_t>(*(src_ptr + x)); + } + }, + src, dst); + } + break; + } + case DataType::QASYMM8: + case DataType::U8: + { + /* Down-conversion S32 -> U8 */ + if(ConvertPolicy::SATURATE == _policy) + { + execute_window_loop(win, [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); + + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int32x4x4_t texels = + { + { + vld1q_s32(src_ptr + x), + vld1q_s32(src_ptr + x + 4), + vld1q_s32(src_ptr + x + 8), + vld1q_s32(src_ptr + x + 12) + } + }; + vst1_u8(dst_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[0]), vqmovun_s32(texels.val[1])))); + vst1_u8(dst_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[2]), vqmovun_s32(texels.val[3])))); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x)); + } + }, + src, dst); + } + else + { + execute_window_loop(win, [&](const Coordinates &) + { + const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr()); + const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); + + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int32x4x4_t texels = + { + { + vld1q_s32(src_ptr + x), + vld1q_s32(src_ptr + x + 4), + vld1q_s32(src_ptr + x + 8), + vld1q_s32(src_ptr + x + 12) + } + }; + + vst1_u8(dst_ptr + x, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[0])), vmovn_u32(vreinterpretq_u32_s32(texels.val[1]))))); + vst1_u8(dst_ptr + x + 8, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[2])), vmovn_u32(vreinterpretq_u32_s32(texels.val[3]))))); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x)); + } + }, + src, dst); + } + break; + } + default: + ARM_COMPUTE_ERROR("dst data type not supported"); + } + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + } +} + +const char *CpuCastKernel::name() const +{ + return "CpuCastKernel.cpp"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuCastKernel.h b/src/cpu/kernels/CpuCastKernel.h new file mode 100644 index 0000000000..a8ce97230e --- /dev/null +++ b/src/cpu/kernels/CpuCastKernel.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2016-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_CAST_KERNEL_H +#define ARM_COMPUTE_CPU_CAST_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Casts a given tensor to a new type + * + * @note When casting between quantized types the scale and zeroPoint are ignored + */ +class CpuCastKernel : public ICpuKernel +{ +public: + CpuCastKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuCastKernel); + /** Set the src and dst of the kernel + * + * Valid conversions src -> dst : + * + * - QASYMM8_SIGNED -> S16, S32, F32, F16 + * - QASYMM8 -> U16, S16, S32, F32, F16 + * - U8 -> U16, S16, S32, F32, F16 + * - U16 -> U8, U32 + * - S16 -> QASYMM8_SIGNED, U8, S32 + * - BFLOAT16 -> F32 + * - F16 -> QASYMM8_SIGNED, QASYMM8, F32, S32, U8 + * - S32 -> QASYMM8_SIGNED, QASYMM8, F16, F32, U8 + * - F32 -> QASYMM8_SIGNED, QASYMM8, BFLOAT16, F16, S32, U8 + * + * @param[in] src The src tensor to convert. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/BFLOAT16/F16/F32. + * @param[out] dst The dst tensor. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/U32/S32/BFLOAT16/F16/F32. + * @param[in] policy Conversion policy. + */ + void configure(const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuCastKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + ConvertPolicy _policy{ ConvertPolicy::SATURATE }; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_CAST_KERNEL_H */ diff --git a/src/cpu/kernels/CpuCol2ImKernel.cpp b/src/cpu/kernels/CpuCol2ImKernel.cpp new file mode 100644 index 0000000000..bf5a44d78b --- /dev/null +++ b/src/cpu/kernels/CpuCol2ImKernel.cpp @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuCol2ImKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Size2D.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +namespace arm_compute +{ +using namespace misc::shape_calculator; +namespace cpu +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &convolved_dims) +{ + //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions. + ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); + + // Validate configured output + if(dst->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), compute_col2im_shape(*src, convolved_dims, false)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); + } + + return Status{}; +} +} // namespace + +void CpuCol2ImKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const Size2D &convolved_dims) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, convolved_dims)); + + _convolved_dims = convolved_dims; + + // Configure kernel window + // Output auto inizialitation if not yet initialized + auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_col2im_shape(*src, convolved_dims, false))); + + // Configure kernel window + Window win = calculate_max_window(*src, Steps()); + + ICpuKernel::configure(win); +} + +Status CpuCol2ImKernel::validate(const ITensorInfo *src, const ITensorInfo *output, const Size2D &convolved_dims) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, output, convolved_dims)); + return Status{}; +} + +void CpuCol2ImKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + auto src = tensors.get_const_tensor(TensorType::ACL_SRC); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + const uint8_t el_size = src->info()->element_size(); + const int output_stride_x = dst->info()->strides_in_bytes().x(); + const int output_stride_y = dst->info()->strides_in_bytes().y(); + const int output_stride_z = dst->info()->strides_in_bytes().z(); + + Window window_out(window); + window_out.set(Window::DimX, Window::Dimension(0, 0, 0)); + window_out.set(Window::DimY, Window::Dimension(0, 0, 0)); + window_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + // Create iterators + Iterator in(src, window); + Iterator out(dst, window_out); + + execute_window_loop(window, [&](const Coordinates & id) + { + const int hidx = id.y(); + const int idx = id.x() * output_stride_z + (hidx / _convolved_dims.width) * output_stride_y + (hidx % _convolved_dims.width) * output_stride_x; + std::memcpy(out.ptr() + idx, in.ptr(), el_size); + }, + in, out); +} + +const char *CpuCol2ImKernel::name() const +{ + return "CpuCol2ImKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute
\ No newline at end of file diff --git a/src/cpu/kernels/CpuCol2ImKernel.h b/src/cpu/kernels/CpuCol2ImKernel.h new file mode 100644 index 0000000000..8e09a2b689 --- /dev/null +++ b/src/cpu/kernels/CpuCol2ImKernel.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_COL2IM_KERNEL_H +#define ARM_COMPUTE_CPU_COL2IM_KERNEL_H + +#include "arm_compute/core/Size2D.h" +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Kernel to perform col2im reshaping. + * + * Rearranges each matrix column into image blocks. It's the inverse operation of @ref CpuIm2ColKernel. + * + * For example, a vector of 9 elements can be reshaped to a block(image) of 3x3: + * + * @f[ + * \left( \begin{array}{ccccccccc} + * a0 & a1 & a2 & a3 & a4 & a5 & a6 & a7 & a8 \\ + * \end{array} \right) + * \rightarrow + * \left( \begin{array}{ccc} + * a0 & a1 & a2 \\ + * a3 & a4 & a5 \\ + * a6 & a7 & a8 \\ + * \end{array} \right) + * @f] + */ +class CpuCol2ImKernel : public ICpuKernel +{ +public: + /** Default constructor */ + CpuCol2ImKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuCol2ImKernel); + /** Set the input and output of the kernel. + * + * @param[in] src The input tensor info to convert. Data types supported: All + * @param[out] dst The output tensor info. 3 lower dimensions represent a single output [width, height, OFM], + * while the rest represent batch of outputs. Data types supported: Same as @p input + * @param[in] convolved_dims Output convolved dimensions. + */ + void configure(const ITensorInfo *src, ITensorInfo *dst, const Size2D &convolved_dims); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuCol2ImKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &convolved_dims); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + Size2D _convolved_dims{}; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /*ARM_COMPUTE_CPU_COL2IM_KERNEL_H */ diff --git a/src/cpu/kernels/CpuConcatenateBatchKernel.cpp b/src/cpu/kernels/CpuConcatenateBatchKernel.cpp new file mode 100644 index 0000000000..29d40f0e52 --- /dev/null +++ b/src/cpu/kernels/CpuConcatenateBatchKernel.cpp @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuConcatenateBatchKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +template <typename T> +void batch_concat(const ITensor *src, ITensor *dst, unsigned int batch_offset, const Window &window) +{ + // Offset src + uint8_t *src_ptr = src->buffer() + src->info()->offset_first_element_in_bytes(); + + // Offset dst + uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + batch_offset * dst->info()->strides_in_bytes()[3]; + + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const int window_step_x = 16 / dst->info()->element_size(); + + Window win{ window }; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + win.set(3, Window::Dimension(0, src->info()->tensor_shape()[3], 1)); + + Iterator src_it(src, win); + Iterator dst_it(dst, win); + + const DataType dt = src->info()->data_type(); + const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo dst_qinfo = dst->info()->quantization_info().uniform(); + if(dt == DataType::QASYMM8 && src_qinfo != dst_qinfo) + { + execute_window_loop(win, [&](const Coordinates &) + { + const auto in_ptr = reinterpret_cast<const uint8_t *>(src_ptr + src_it.offset()); + const auto out_ptr = reinterpret_cast<uint8_t *>(dst_ptr + dst_it.offset()); + + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + wrapper::vstore(out_ptr, vquantize(vdequantize(wrapper::vloadq(in_ptr), src_qinfo), dst_qinfo)); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + *(out_ptr + x) = quantize_qasymm8(dequantize_qasymm8(*(in_ptr + x), src_qinfo), dst_qinfo); + } + }, + src_it, dst_it); + } + else if(dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo) + { + execute_window_loop(win, [&](const Coordinates &) + { + const auto in_ptr = reinterpret_cast<const int8_t *>(src_ptr + src_it.offset()); + const auto out_ptr = reinterpret_cast<int8_t *>(dst_ptr + dst_it.offset()); + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + wrapper::vstore(out_ptr, vquantize_signed(vdequantize(wrapper::vloadq(in_ptr), src_qinfo), dst_qinfo)); + } + // Compute left-over elements + for(; x < window_end_x; ++x) + { + *(out_ptr + x) = quantize_qasymm8_signed(dequantize_qasymm8_signed(*(in_ptr + x), src_qinfo), dst_qinfo); + } + }, + src_it, dst_it); + } + else + { + execute_window_loop(win, [&](const Coordinates &) + { + const auto in_ptr = reinterpret_cast<const T *>(src_ptr + src_it.offset()); + const auto out_ptr = reinterpret_cast<T *>(dst_ptr + dst_it.offset()); + + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x)); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + *(out_ptr + x) = *(in_ptr + x); + } + }, + src_it, dst_it); + } +} + +Status validate_arguments(const ITensorInfo *src, unsigned int batch_offset, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions. + ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimX) != dst->dimension(Window::DimX)); + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimY) != dst->dimension(Window::DimY)); + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimZ) != dst->dimension(Window::DimZ)); + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(3) + batch_offset > dst->dimension(3)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(4, src, dst); + + return Status{}; +} +} // namespace + +void CpuConcatenateBatchKernel::configure(const ITensorInfo *src, unsigned int batch_offset, ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, batch_offset, dst)); + + _func = nullptr; + _batch_offset = batch_offset; + + switch(src->data_type()) + { + case DataType::S8: + case DataType::U8: + case DataType::QASYMM8: + case DataType::QASYMM8_SIGNED: + _func = &batch_concat<uint8_t>; + break; + case DataType::S16: + case DataType::U16: + case DataType::F16: + _func = &batch_concat<uint16_t>; + break; + case DataType::S32: + case DataType::U32: + case DataType::F32: + _func = &batch_concat<uint32_t>; + break; + default: + ARM_COMPUTE_ERROR("Unsupported data type."); + } + + // Configure kernel window + Window win = calculate_max_window(*dst, Steps()); + ICpuKernel::configure(win); +} + +Status CpuConcatenateBatchKernel::validate(const arm_compute::ITensorInfo *src, + unsigned int batch_offset, + const arm_compute::ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, batch_offset, dst)); + return Status{}; +} + +void CpuConcatenateBatchKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + + (*_func)(tensors.get_const_tensor(TensorType::ACL_SRC), + tensors.get_tensor(TensorType::ACL_DST), + _batch_offset, + window); +} + +const char *CpuConcatenateBatchKernel::name() const +{ + return "CpuConcatenateBatchKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuConcatenateBatchKernel.h b/src/cpu/kernels/CpuConcatenateBatchKernel.h new file mode 100644 index 0000000000..91f2808f81 --- /dev/null +++ b/src/cpu/kernels/CpuConcatenateBatchKernel.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_CONCATENATE_BATCH_KERNEL_H +#define ARM_COMPUTE_CPU_CONCATENATE_BATCH_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Interface for the batch concatenate kernel. + * The input tensor will be concatenated into the output tensor. + */ +class CpuConcatenateBatchKernel : public ICpuKernel +{ +public: + CpuConcatenateBatchKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuConcatenateBatchKernel); + /** Configure kernel for a given list of arguments + * + * @param[in] src Source tensor info. Data types supported: All. + * @param[in] batch_offset The offset on axis # 3. + * @param[in,out] dst Destination tensor info. Data types supported: Same as @p src. + */ + void configure(const ITensorInfo *src, unsigned int batch_offset, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuConcatenateBatchKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, unsigned int batch_offset, const ITensorInfo *dst); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + using BatchConcatFunction = void(const ITensor *, ITensor *, unsigned int, const Window &); + +private: + BatchConcatFunction *_func{ nullptr }; + unsigned int _batch_offset{ 0 }; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_CONCATENATE_BATCH_KERNEL_H */ diff --git a/src/cpu/kernels/CpuConcatenateDepthKernel.cpp b/src/cpu/kernels/CpuConcatenateDepthKernel.cpp new file mode 100644 index 0000000000..ebc5322aee --- /dev/null +++ b/src/cpu/kernels/CpuConcatenateDepthKernel.cpp @@ -0,0 +1,207 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuConcatenateDepthKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include <cstdint> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +template <typename T> +void depth_concat(const ITensor *src, ITensor *dst, unsigned int depth_offset, const Window &window) +{ + // Offset source + uint8_t *src_ptr = src->buffer() + src->info()->offset_first_element_in_bytes(); + + // Offset destination + uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + depth_offset * dst->info()->strides_in_bytes()[2]; + + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const int window_step_x = 16 / dst->info()->element_size(); + + Window win{ window }; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + win.set(Window::DimZ, Window::Dimension(0, src->info()->tensor_shape().z(), 1)); + + Iterator src_it(src, win); + Iterator dst_it(dst, win); + + const DataType dt = src->info()->data_type(); + const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo dst_qinfo = dst->info()->quantization_info().uniform(); + if(dt == DataType::QASYMM8 && src_qinfo != dst_qinfo) + { + execute_window_loop(win, [&](const Coordinates &) + { + const auto in_ptr = reinterpret_cast<const uint8_t *>(src_ptr + src_it.offset()); + const auto out_ptr = reinterpret_cast<uint8_t *>(dst_ptr + dst_it.offset()); + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + wrapper::vstore(out_ptr + x, vquantize(vdequantize(wrapper::vloadq(in_ptr + x), src_qinfo), dst_qinfo)); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + *(out_ptr + x) = quantize_qasymm8(dequantize_qasymm8(*(in_ptr + x), src_qinfo), dst_qinfo); + } + }, + src_it, dst_it); + } + else if(dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo) + { + execute_window_loop(win, [&](const Coordinates &) + { + const auto in_ptr = reinterpret_cast<const int8_t *>(src_ptr + src_it.offset()); + const auto out_ptr = reinterpret_cast<int8_t *>(dst_ptr + dst_it.offset()); + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + wrapper::vstore(out_ptr + x, vquantize_signed(vdequantize(wrapper::vloadq(in_ptr + x), src_qinfo), dst_qinfo)); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + *(out_ptr + x) = quantize_qasymm8_signed(dequantize_qasymm8_signed(*(in_ptr + x), src_qinfo), dst_qinfo); + } + }, + src_it, dst_it); + } + else + { + execute_window_loop(win, [&](const Coordinates &) + { + const auto in_ptr = reinterpret_cast<const T *>(src_ptr + src_it.offset()); + const auto out_ptr = reinterpret_cast<T *>(dst_ptr + dst_it.offset()); + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x)); + } + // Compute left-over elements + for(; x < window_end_x; ++x) + { + *(out_ptr + x) = *(in_ptr + x); + } + }, + src_it, dst_it); + } +} + +Status validate_arguments(const ITensorInfo *input, unsigned int depth_offset, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions. + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimX) != output->dimension(Window::DimX)); + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimY) != output->dimension(Window::DimY)); + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) + depth_offset > output->dimension(2)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(3, input, output); + + return Status{}; +} +} // namespace + +void CpuConcatenateDepthKernel::configure(const ITensorInfo *src, unsigned int depth_offset, ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, depth_offset, dst)); + + _func = nullptr; + _depth_offset = depth_offset; + + switch(src->data_type()) + { + case DataType::QASYMM8: + _func = &depth_concat<uint8_t>; + break; + case DataType::QASYMM8_SIGNED: + _func = &depth_concat<int8_t>; + break; + case DataType::F16: + _func = &depth_concat<uint16_t>; + break; + case DataType::F32: + _func = &depth_concat<uint32_t>; + break; + default: + ARM_COMPUTE_ERROR("Unsupported data type."); + } + + // Configure kernel window + Window win = calculate_max_window(*dst, Steps()); + ICpuKernel::configure(win); +} + +Status CpuConcatenateDepthKernel::validate(const arm_compute::ITensorInfo *src, + unsigned int depth_offset, + const arm_compute::ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, depth_offset, dst)); + return Status{}; +} + +void CpuConcatenateDepthKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + + (*_func)(tensors.get_const_tensor(TensorType::ACL_SRC), + tensors.get_tensor(TensorType::ACL_DST), + _depth_offset, + window); +} + +const char *CpuConcatenateDepthKernel::name() const +{ + return "CpuConcatenateDepthKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuConcatenateDepthKernel.h b/src/cpu/kernels/CpuConcatenateDepthKernel.h new file mode 100644 index 0000000000..063118b33b --- /dev/null +++ b/src/cpu/kernels/CpuConcatenateDepthKernel.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef ARM_COMPUTE_CPU_CONCATENATE_DEPTH_KERNEL_H +#define ARM_COMPUTE_CPU_CONCATENATE_DEPTH_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +// Forward declarations +class ITensor; + +namespace cpu +{ +namespace kernels +{ +/** Interface for the depth concatenate kernel. + * The input tensor will be concatenated into the output tensor. + */ +class CpuConcatenateDepthKernel : public ICpuKernel +{ +public: + CpuConcatenateDepthKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuConcatenateDepthKernel); + /** Configure kernel for a given list of arguments + * + * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] depth_offset The offset on the Z axis. + * @param[in,out] dst Destination tensor info. Data types supported: Same as @p src. + * + * @note: The output tensor's low two dimensions can't be smaller than the input one's. + * @note: The gaps between the two lowest dimensions of input and output need to be divisible by 2. + * + */ + void configure(const ITensorInfo *src, unsigned int depth_offset, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuConcatenateDepthKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, unsigned int depth_offset, const ITensorInfo *dst); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + using DepthConcatFunction = void(const ITensor *, ITensor *, unsigned int, const Window &); + +private: + DepthConcatFunction *_func{ nullptr }; + unsigned int _depth_offset{ 0 }; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_CONCATENATE_DEPTH_KERNEL_H */ diff --git a/src/cpu/kernels/CpuConcatenateHeightKernel.cpp b/src/cpu/kernels/CpuConcatenateHeightKernel.cpp new file mode 100644 index 0000000000..47a2b44443 --- /dev/null +++ b/src/cpu/kernels/CpuConcatenateHeightKernel.cpp @@ -0,0 +1,178 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuConcatenateHeightKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include <cstdint> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src, unsigned int height_offset, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions. + ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimX) != dst->dimension(Window::DimX)); + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimY) + height_offset > dst->dimension(Window::DimY)); + for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i) + { + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i)); + } + + return Status{}; +} +} // namespace + +void CpuConcatenateHeightKernel::configure(const ITensorInfo *src, unsigned int height_offset, ITensorInfo *dst) +{ + ARM_COMPUTE_UNUSED(src); + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, height_offset, dst)); + + _height_offset = height_offset; + + // Configure kernel window + Window win = calculate_max_window(*dst, Steps()); + ICpuKernel::configure(win); +} + +Status CpuConcatenateHeightKernel::validate(const ITensorInfo *src, unsigned int height_offset, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, height_offset, dst)); + return Status{}; +} + +void CpuConcatenateHeightKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + // Offset destination pointer to the correct position + uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + _height_offset * dst->info()->strides_in_bytes()[Window::DimY]; + + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()) * static_cast<int>(dst->info()->element_size()); + const int window_step_x = 16; + + Window win{ window }; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + win.set(Window::DimY, Window::Dimension(0, src->info()->tensor_shape().y(), 1)); + + // Create iterators + Iterator src_it(src, win); + Iterator dst_it(dst, win); + + const DataType dt = src->info()->data_type(); + const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo &dst_qinfo = dst->info()->quantization_info().uniform(); + if(dt == DataType::QASYMM8 && src_qinfo != dst_qinfo) + { + execute_window_loop(win, [&](const Coordinates &) + { + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + vst1q_u8(dst_ptr + dst_it.offset() + x, vquantize(vdequantize(vld1q_u8(src_it.ptr() + x), src_qinfo), dst_qinfo)); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + *(dst_ptr + dst_it.offset() + x) = quantize_qasymm8(dequantize_qasymm8(*(src_it.ptr() + x), src_qinfo), dst_qinfo); + } + + }, + src_it, dst_it); + } + else if(dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo) + { + execute_window_loop(win, [&](const Coordinates &) + { + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + vst1q_s8(reinterpret_cast<int8_t *>(dst_ptr + dst_it.offset() + x), + vquantize_signed(vdequantize(vld1q_s8(reinterpret_cast<int8_t *>(src_it.ptr()) + x), src_qinfo), dst_qinfo)); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + *(dst_ptr + dst_it.offset() + x) = quantize_qasymm8_signed(dequantize_qasymm8_signed(*(src_it.ptr() + x), src_qinfo), dst_qinfo); + } + }, + src_it, dst_it); + } + else + { + execute_window_loop(win, [&](const Coordinates &) + { + const auto in_ptr = src_it.ptr(); + const auto out_ptr = dst_ptr + dst_it.offset(); + + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x)); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + *(out_ptr + x) = *(in_ptr + x); + } + }, + src_it, dst_it); + } +} + +const char *CpuConcatenateHeightKernel::name() const +{ + return "CpuConcatenateHeightKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuConcatenateHeightKernel.h b/src/cpu/kernels/CpuConcatenateHeightKernel.h new file mode 100644 index 0000000000..883c59a206 --- /dev/null +++ b/src/cpu/kernels/CpuConcatenateHeightKernel.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_CONCATENATE_HEIGHT_KERNEL_H +#define ARM_COMPUTE_CPU_CONCATENATE_HEIGHT_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Interface for the height concatenate kernel. + * The source tensor will be concatenated into the destination tensor. + */ +class CpuConcatenateHeightKernel : public ICpuKernel +{ +public: + CpuConcatenateHeightKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuConcatenateHeightKernel); + /** Configure kernel for a given list of arguments + * + * @param[in] src Source tensor info. Data types supported: All + * @param[in] height_offset The starting offset on the Y axis for the output tensor. + * @param[in,out] dst Destination tensor info. Data types supported: Same as @p src. + * + */ + void configure(const ITensorInfo *src, unsigned int height_offset, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuConcatenateHeightKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, unsigned int height_offset, const ITensorInfo *dst); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + unsigned int _height_offset{ 0 }; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_CONCATENATE_HEIGHT_KERNEL_H */ diff --git a/src/cpu/kernels/CpuConcatenateWidthKernel.cpp b/src/cpu/kernels/CpuConcatenateWidthKernel.cpp new file mode 100644 index 0000000000..90813ff7b4 --- /dev/null +++ b/src/cpu/kernels/CpuConcatenateWidthKernel.cpp @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuConcatenateWidthKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include <cstdint> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src, unsigned int width_offset, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions. + ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) + width_offset > dst->dimension(0)); + + for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i) + { + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i)); + } + + return Status{}; +} +} // namespace + +void CpuConcatenateWidthKernel::configure(const ITensorInfo *src, unsigned int width_offset, ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, width_offset, dst)); + ARM_COMPUTE_UNUSED(dst); + + _width_offset = width_offset; + + // Configure kernel window + Window win = calculate_max_window(*src, Steps()); + + ICpuKernel::configure(win); +} + +Status CpuConcatenateWidthKernel::validate(const ITensorInfo *src, unsigned int width_offset, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, width_offset, dst)); + return Status{}; +} + +void CpuConcatenateWidthKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + // Offset output pointer to the correct position + uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + _width_offset * dst->info()->strides_in_bytes()[0]; + + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()) * static_cast<int>(dst->info()->element_size()); + constexpr int window_step_x = 16; + + Window win{ window }; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + // Create iterators + Iterator src_it(src, win); + Iterator dst_it(dst, win); + const DataType dt = src->info()->data_type(); + const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo &dst_qinfo = dst->info()->quantization_info().uniform(); + if(dt == DataType::QASYMM8 && src_qinfo != dst_qinfo) + { + execute_window_loop(win, [&](const Coordinates &) + { + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + vst1q_u8(dst_ptr + dst_it.offset() + x, vquantize(vdequantize(vld1q_u8(src_it.ptr() + x), src_qinfo), dst_qinfo)); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + *(dst_ptr + dst_it.offset() + x) = quantize_qasymm8(dequantize_qasymm8(*(src_it.ptr() + x), src_qinfo), dst_qinfo); + } + }, + src_it, dst_it); + } + else if(dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo) + { + execute_window_loop(win, [&](const Coordinates &) + { + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + vst1q_s8(reinterpret_cast<int8_t *>(dst_ptr + dst_it.offset() + x), + vquantize_signed(vdequantize(vld1q_s8(reinterpret_cast<int8_t *>(src_it.ptr() + x)), src_qinfo), dst_qinfo)); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + *(dst_ptr + dst_it.offset() + x) = quantize_qasymm8_signed(dequantize_qasymm8_signed(*(src_it.ptr() + x), src_qinfo), dst_qinfo); + } + }, + src_it, dst_it); + } + else + { + execute_window_loop(win, [&](const Coordinates &) + { + const auto in_ptr = src_it.ptr(); + const auto out_ptr = dst_ptr + dst_it.offset(); + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x)); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + *(out_ptr + x) = *(in_ptr + x); + } + }, + src_it, dst_it); + } +} + +const char *CpuConcatenateWidthKernel::name() const +{ + return "CpuConcatenateWidthKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuConcatenateWidthKernel.h b/src/cpu/kernels/CpuConcatenateWidthKernel.h new file mode 100644 index 0000000000..3b4612ab0d --- /dev/null +++ b/src/cpu/kernels/CpuConcatenateWidthKernel.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef ARM_COMPUTE_CPU_CONCATENATE_WIDTH_KERNEL_H +#define ARM_COMPUTE_CPU_CONCATENATE_WIDTH_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Interface for the width concatenate kernel. + * The source tensor will be concatenated into the destination tensor. + */ +class CpuConcatenateWidthKernel : public ICPPKernel +{ +public: + CpuConcatenateWidthKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuConcatenateWidthKernel); + /** Configure kernel for a given list of arguments + * + * @param[in] src Source tensor info. Data types supported: All + * @param[in] width_offset The offset on the X axis. + * @param[in,out] dst Destination tensor info. Data types supported: Same as @p src. + */ + void configure(const ITensorInfo *src, unsigned int width_offset, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuConcatenateWidthKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, unsigned int width_offset, const ITensorInfo *dst); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + unsigned int _width_offset{ 0 }; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_CONCATENATE_WIDTH_KERNEL_H */ diff --git a/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp b/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp new file mode 100644 index 0000000000..08b39deef2 --- /dev/null +++ b/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +void CpuConvertFullyConnectedWeightsKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_input_shape, + DataLayout data_layout) + +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + + // Output tensor auto initialisation if not yet initialized + auto_init_if_empty(*dst, *src->clone()); + + ARM_COMPUTE_ERROR_THROW_ON(CpuConvertFullyConnectedWeightsKernel::validate(src, dst, original_input_shape, data_layout)); + + const DataLayout input_data_layout = (data_layout == DataLayout::NCHW) ? DataLayout::NHWC : DataLayout::NCHW; + + const int width_idx = get_data_layout_dimension_index(input_data_layout, DataLayoutDimension::WIDTH); + const int height_idx = get_data_layout_dimension_index(input_data_layout, DataLayoutDimension::HEIGHT); + const int channel_idx = get_data_layout_dimension_index(input_data_layout, DataLayoutDimension::CHANNEL); + + const unsigned int num_elems_per_input_plane = original_input_shape[width_idx] * original_input_shape[height_idx]; + const unsigned int num_channels = original_input_shape[channel_idx]; + + _factor1 = (data_layout == DataLayout::NCHW) ? num_elems_per_input_plane : num_channels; + _factor2 = (data_layout == DataLayout::NCHW) ? num_channels : num_elems_per_input_plane; + + // Configure kernel window + Window win = calculate_max_window(*src, Steps()); + ICpuKernel::configure(win); +} + +Status CpuConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_input_shape, + DataLayout data_layout) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); + ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); + ARM_COMPUTE_RETURN_ERROR_ON(src->num_dimensions() != 2); + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(1) != original_input_shape.total_size_lower(3)); + ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::UNKNOWN); + + // Checks performed when dst is configured + if((dst != nullptr) && (dst->total_size() != 0)) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); + } + + return Status{}; +} + +void CpuConvertFullyConnectedWeightsKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + const unsigned int dst_stride_x = dst->info()->strides_in_bytes().x(); + const unsigned int dst_stride_y = dst->info()->strides_in_bytes().y(); + const unsigned int element_size = src->info()->element_size(); + + Iterator input(src, window); + Iterator output(dst, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + memcpy(output.ptr() + id.x() * dst_stride_x + (id.y() % _factor1 * _factor2 + id.y() / _factor1) * dst_stride_y, input.ptr(), element_size); + }, + input); +} + +const char *CpuConvertFullyConnectedWeightsKernel::name() const +{ + return "CpuConvertFullyConnectedWeightsKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h b/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h new file mode 100644 index 0000000000..70f0a742f9 --- /dev/null +++ b/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_CONVERT_FULLYCONNECTED_WEIGHTS_KERNEL_H +#define ARM_COMPUTE_CPU_CONVERT_FULLYCONNECTED_WEIGHTS_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Interface to convert the 2D Fully Connected weights from NCHW to NHWC or vice versa. + * + * @note This function can be applied to the 2D weights used by a Fully Connected layer if: + * - It follows a Convolution layer + * - The data layout used by the network does not match the one the model has been trained in. + * + * @note This function assumes the weights are already reshaped (transposed) + */ +class CpuConvertFullyConnectedWeightsKernel : public ICpuKernel +{ +public: + CpuConvertFullyConnectedWeightsKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuConvertFullyConnectedWeightsKernel); + /** Set the src and dst tensor. + * + * @param[in] src Source weights tensor info to convert. Must be 2 dimensional. Data types supported: All. + * @param[in] dst The converted weights tensor info. Shape and Data Type: Same as @p src. + * @param[in] original_input_shape Shape of the original src tensor (the one entering fully connected layer). + * @param[in] data_layout The data layout the weights have been trained in. + */ + void configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_input_shape, DataLayout data_layout); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuConvertFullyConnectedWeightsKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_input_shape, DataLayout data_layout); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + unsigned int _factor1{ 0 }; /* equals to the number of elements per original src plane if @p data_layout == NCHW; its number of channels otherwise */ + unsigned int _factor2{ 0 }; /* equals to the number of elements per original src plane if @p data_layout == NHWC; its number of channels otherwise */ +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_CONVERT_FULLYCONNECTED_WEIGHTS_KERNEL_H */
\ No newline at end of file diff --git a/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp b/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp new file mode 100644 index 0000000000..1005d001ab --- /dev/null +++ b/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + + // Validate output if initialized + if(dst->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(src->tensor_shape(), dst->tensor_shape()); + } + + return Status{}; +} + +std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src, ITensorInfo *dst) +{ + // Output auto inizialitation if not yet initialized + { + const bool is_input_signed = src->data_type() == DataType::QASYMM8_SIGNED; + const DataType dt = is_input_signed ? DataType::QASYMM8 : DataType::QASYMM8_SIGNED; + const UniformQuantizationInfo qinfo = src->quantization_info().uniform(); + const int offset_correction = is_input_signed ? -128 : 128; + const QuantizationInfo corrected_qinfo = QuantizationInfo(qinfo.scale, qinfo.offset + offset_correction); + + auto_init_if_empty(*dst, src->clone()->set_data_type(dt).set_quantization_info(corrected_qinfo)); + } + + return std::make_pair(Status{}, calculate_max_window(*dst)); +} +} // namespace + +void CpuConvertQuantizedSignednessKernel::configure(const ITensorInfo *src, ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); + + std::pair<Status, Window> win_config = validate_and_configure_window(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICpuKernel::configure(win_config.second); +} + +Status CpuConvertQuantizedSignednessKernel::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst)); + return Status{}; +} + +void CpuConvertQuantizedSignednessKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + auto src = tensors.get_const_tensor(TensorType::ACL_SRC); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + const uint8_t mask = 128; + const auto vmask = wrapper::vdup_n(mask, wrapper::traits::vector_128_tag{}); + + execute_window_loop(win_collapsed, [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin = wrapper::vloadq(input_ptr + x); + wrapper::vstore(output_ptr + x, wrapper::veor(vin, vmask)); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const uint8_t in = *(reinterpret_cast<const uint8_t *>(input_ptr + x)); + *(output_ptr + x) = in ^ mask; + } + }, + input, output); +} + +const char *CpuConvertQuantizedSignednessKernel::name() const +{ + return "CpuConvertQuantizedSignednessKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h b/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h new file mode 100644 index 0000000000..8cce1eaf1d --- /dev/null +++ b/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_CONVERTQUANTIZEDSIGNEDNESS_KERNEL_H +#define ARM_COMPUTE_CPU_CONVERTQUANTIZEDSIGNEDNESS_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Kernel to convert asymmetric signed to asymmetric signed and vice-versa */ +class CpuConvertQuantizedSignednessKernel : public ICpuKernel +{ +public: + CpuConvertQuantizedSignednessKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuConvertQuantizedSignednessKernel); + /** Initialize the kernel input and output info. + * + * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED. + * @param[out] dst Destination tensor info. Data types supported: opposite of @p src. + */ + void configure(const ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuConvertQuantizedSignednessKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /*ARM_COMPUTE_CPU_CONVERTQUANTIZEDSIGNEDNESS_KERNEL_H */ diff --git a/src/cpu/kernels/CpuCopyKernel.cpp b/src/cpu/kernels/CpuCopyKernel.cpp new file mode 100644 index 0000000000..3f0f3fe422 --- /dev/null +++ b/src/cpu/kernels/CpuCopyKernel.cpp @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuCopyKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PaddingList &padding = PaddingList()) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); + ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > 4); + + // Validate destination if initialized + if(dst->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(misc::shape_calculator::compute_padded_shape(src->tensor_shape(), padding), dst->tensor_shape()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + } + + return Status{}; +} + +std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src, ITensorInfo *dst) +{ + // Destination auto inizialitation if not yet initialized + auto_init_if_empty(*dst, *src); + return std::make_pair(Status{}, calculate_max_window(*dst)); +} + +std::pair<Status, Window> validate_and_configure_window_with_padding(const ITensorInfo *src, ITensorInfo *dst, const PaddingList &padding) +{ + const TensorShape src_shape = src->tensor_shape(); + const TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(src_shape, padding); + auto_init_if_empty(*dst, src->clone()->set_tensor_shape(padded_shape)); + // Configure window + const Window win = calculate_max_window(*dst, dst->dimension(0)); + return std::make_pair(Status{}, win); +} + +} // namespace + +void CpuCopyKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const PaddingList &padding) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, padding)); + + _padding = padding; + + std::pair<Status, Window> win_config; + if(padding.empty()) + { + win_config = validate_and_configure_window(src, dst); + } + else + { + win_config = validate_and_configure_window_with_padding(src, dst, padding); + } + + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICpuKernel::configure(win_config.second); +} + +Status CpuCopyKernel::validate(const arm_compute::ITensorInfo *src, const arm_compute::ITensorInfo *dst, const PaddingList &padding) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, padding)); + + if(padding.empty()) + { + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get()).first); + } + else + { + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_with_padding(src->clone().get(), dst->clone().get(), padding).first); + } + + return Status{}; +} + +void CpuCopyKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + if(_padding.empty()) + { + Window dst_window{ window }; + dst_window.set(Window::DimX, Window::Dimension(dst_window.x().start(), dst_window.x().end(), src->info()->dimension(0))); + Window out_slice = dst_window.first_slice_window_1D(); + do + { + Iterator src_it(src, out_slice); + Iterator dst_it(dst, out_slice); + + execute_window_loop(out_slice, [&](const Coordinates &) + { + memcpy(dst_it.ptr(), src_it.ptr(), dst->info()->dimension(0) * dst->info()->element_size()); + }, + src_it, dst_it); + } + while(dst_window.slide_window_slice_1D(out_slice)); + } + else + { + Window src_window{ window }; + src_window.set(Window::DimX, Window::Dimension(0, window.x().end() - _padding[0].first, src->info()->dimension(0))); + + Iterator src_it(src, src_window); + Iterator dst_it(dst, window); + const size_t row_size_in_bytes = src->info()->dimension(0) * src->info()->element_size(); + execute_window_loop(window, [&](const Coordinates &) + { + auto dst_ptr = dst_it.ptr() + _padding[0].first * dst->info()->element_size(); + std::memcpy(dst_ptr, src_it.ptr(), row_size_in_bytes); + }, + src_it, dst_it); + } +} + +const char *CpuCopyKernel::name() const +{ + return "CpuCopyKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuCopyKernel.h b/src/cpu/kernels/CpuCopyKernel.h new file mode 100644 index 0000000000..193f38078b --- /dev/null +++ b/src/cpu/kernels/CpuCopyKernel.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_COPY_KERNEL_H +#define ARM_COMPUTE_CPU_COPY_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Kernel to perform a copy between two tensors */ +class CpuCopyKernel : public ICpuKernel +{ +public: + CpuCopyKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuCopyKernel); + /** Configure kernel for a given list of arguments + * + * @param[in] src Source tensor. Data types supported: All + * @param[out] dst Destination tensor. Data types supported: same as @p src. + * @param[in] padding (Optional) Padding to be applied to the input tensor + */ + void configure(const ITensorInfo *src, ITensorInfo *dst, const PaddingList &padding = PaddingList()); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuCopyKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PaddingList &padding = PaddingList()); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + PaddingList _padding{}; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_COPY_KERNEL_H */ diff --git a/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp b/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp new file mode 100644 index 0000000000..d79fe87d1b --- /dev/null +++ b/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp @@ -0,0 +1,950 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h" + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/ITensorInfo.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/wrapper/traits.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/ToolchainSupport.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +constexpr auto data_layout = DataLayout::NHWC; +const size_t width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); +const size_t height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); +const size_t channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + +constexpr auto dim_manual_loop = Window::Dimension(0, 0, 0); +constexpr auto dim_single_unit_step = Window::Dimension(0, 1, 1); +constexpr size_t vector_size = 8; + +struct DepthwiseConvolutionRunInfo +{ + const size_t num_read_elements_per_iteration; + const uint32_t x_start; + const uint32_t x_end; + const uint32_t x_step; + const uint32_t x_leftover_start; + const size_t input_stride_y; + const size_t input_stride_z; + const size_t input_max_offset; + const size_t weights_width; + const size_t weights_height; + const size_t weights_stride_y; + const size_t weights_stride_z; + const size_t conv_stride_x; + const size_t conv_stride_y; + const size_t conv_pad_left; + const size_t conv_pad_top; + const size_t input_height; + const size_t input_width; + const size_t input_depth; + + DepthwiseConvolutionRunInfo(const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &conv_info, const Window &w, uint32_t depth_multiplier = 1) // NOLINT + : num_read_elements_per_iteration((depth_multiplier == 1 ? (vector_size / element_size_from_data_type(input.data_type())) : 1)), + x_start(w.x().start()), + x_end(w.x().end()), + x_step(static_cast<uint32_t>(num_read_elements_per_iteration * depth_multiplier)), + x_leftover_start(std::max(static_cast<int32_t>(w.x().end()) - static_cast<int32_t>(x_step) + 1, int32_t(0))), + input_stride_y(input.strides_in_bytes().y()), + input_stride_z(input.strides_in_bytes().z()), + input_max_offset(input.strides_in_bytes().z() * input.dimension(height_idx) - (input.padding().bottom + input.padding().top) * input.strides_in_bytes().y()), + weights_width(weights.dimension(width_idx)), + weights_height(weights.dimension(height_idx)), + weights_stride_y(weights.strides_in_bytes().y()), + weights_stride_z(weights.strides_in_bytes().z()), + conv_stride_x(conv_info.stride().first), + conv_stride_y(conv_info.stride().second), + conv_pad_left(conv_info.pad_left()), + conv_pad_top(conv_info.pad_top()), + input_height(input.dimension(height_idx)), + input_width(input.dimension(width_idx)), + input_depth(input.dimension(channel_idx)) + { + } +}; + +inline int32x4_t saturating_doubling_high_mul(const int32x4_t &a, const int32_t &b) +{ + return vqrdmulhq_n_s32(a, b); +} + +inline int32_t saturating_doubling_high_mul(const int32_t &a, const int32_t &b) +{ + return vget_lane_s32(vqrdmulh_n_s32(vdup_n_s32(a), b), 0); +} + +inline int32x4_t rounding_divide_by_exp2(const int32x4_t &x, const int exponent) +{ + const int32x4_t shift = vdupq_n_s32(-exponent); + const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift), 31); + const int32x4_t fixed = vqaddq_s32(x, fixup); + return vrshlq_s32(fixed, shift); +} + +inline int32x2_t rounding_divide_by_exp2(const int32x2_t &x, const int exponent) +{ + const int32x2_t shift = vdup_n_s32(-exponent); + const int32x2_t fixup = vshr_n_s32(vand_s32(x, shift), 31); + const int32x2_t fixed = vqadd_s32(x, fixup); + return vrshl_s32(fixed, shift); +} + +inline int32_t rounding_divide_by_exp2(const int32_t &x, const int exponent) +{ + const int32x2_t xs = vdup_n_s32(x); + return vget_lane_s32(rounding_divide_by_exp2(xs, exponent), 0); +} + +inline bool is_valid_input_region(int32_t base_w, uint32_t base_h, uint32_t w, uint32_t h, const DepthwiseConvolutionRunInfo &run_info, const Size2D &dilation) +{ + const int32_t current_h = base_h + h * dilation.y(); + const bool is_valid_h = current_h >= 0 && current_h < static_cast<int32_t>(run_info.input_height); + + const int32_t current_w = base_w + w * dilation.x(); + const bool is_valid_w = current_w >= 0 && current_w < static_cast<int32_t>(run_info.input_width); + + return is_valid_h && is_valid_w; +} + +template <typename T> +void depthwise_loop_multiplier1_fp(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info, + const Size2D &dilation, const Window &window, bool has_biases) +{ + constexpr auto element_per_vector = vector_size / sizeof(T); + using VectorType = typename wrapper::traits::neon_vector<T, element_per_vector>::type; + using TagType = typename wrapper::traits::neon_vector<T, element_per_vector>::tag_type; + + const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window); + + const VectorType zero_vector = wrapper::vdup_n(static_cast<T>(0), TagType{}); + + Window execution_window = window; + execution_window.set(Window::DimX, dim_single_unit_step); + + Window win_input = window; + win_input.set(Window::DimX, dim_manual_loop); + win_input.set(Window::DimY, dim_manual_loop); + win_input.set(Window::DimZ, dim_manual_loop); + + Window win_weights = win_input; + win_weights.set(Window::DimW, dim_manual_loop); + + Window win_output = window; + win_output.set(Window::DimX, dim_manual_loop); + + Iterator input_it(src, win_input); + Iterator weights_it(weights, win_weights); + Iterator output_it(dst, win_output); + Iterator biases_it{}; + + if(has_biases) + { + biases_it = Iterator(biases, win_weights); + } + + execute_window_loop(execution_window, [&](const Coordinates & id) + { + const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; + const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; + const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; + + auto const base_weights_ptr = weights_it.ptr(); + uint32_t x = run_info.x_start; + + for(; x < run_info.x_leftover_start; x += run_info.x_step) + { + VectorType acc = zero_vector; + auto weights_ptr = base_weights_ptr; + int64_t input_offset = base_input_offset; + + for(uint32_t h = 0; h < run_info.weights_height; ++h) + { + int64_t offs = input_offset + x * sizeof(T); + for(uint32_t w = 0; w < run_info.weights_width; ++w) + { + const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); + const auto input_vals = is_valid_region ? + wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) : + zero_vector; + const auto weights_vals = wrapper::vload(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x); + acc = wrapper::vmla(acc, weights_vals, input_vals); + + offs += dilation.x() * run_info.input_stride_y; + } + + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; + } + + if(has_biases) + { + const auto biases_vals = wrapper::vload(reinterpret_cast<T *>(biases_it.ptr()) + x); + acc = wrapper::vadd(acc, biases_vals); + } + + wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()) + x, acc); + } + + for(; x < run_info.x_end; ++x) + { + auto acc_scalar = T{ 0 }; + auto weights_ptr = base_weights_ptr; + int64_t input_offset = base_input_offset; + + for(size_t h = 0; h < run_info.weights_height; ++h) + { + int64_t offs = input_offset + x * sizeof(T); + for(size_t w = 0; w < run_info.weights_width; ++w) + { + const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); + const auto input_vals = is_valid_region ? *reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset)) : 0; + const auto weights_vals = *(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x); + + acc_scalar += (input_vals * weights_vals); + + offs += dilation.x() * run_info.input_stride_y; + } + + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; + } + + if(has_biases) + { + const auto biases_vals = *(reinterpret_cast<T *>(biases_it.ptr()) + x); + acc_scalar += biases_vals; + } + *(reinterpret_cast<T *>(output_it.ptr()) + x) = acc_scalar; + } + }, + input_it, weights_it, biases_it, output_it); +} + +template <typename T> +void depthwise_loop_generic_fp(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info, + const Size2D &dilation, unsigned int depth_multiplier, const Window &window, bool has_biases) +{ + const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier); + + Window execution_window = window; + execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1)); + + Window win_input = execution_window; + win_input.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1)); + win_input.set(Window::DimY, dim_manual_loop); + win_input.set(Window::DimZ, dim_manual_loop); + + Window win_weights = window; + win_weights.set_dimension_step(Window::DimX, run_info.x_step); + win_weights.set(Window::DimY, dim_manual_loop); + win_weights.set(Window::DimZ, dim_manual_loop); + win_weights.set(Window::DimW, dim_manual_loop); + + Window win_output = window; + win_output.set_dimension_step(Window::DimX, run_info.x_step); + + Iterator input_it(src, win_input); + Iterator weights_it(weights, win_weights); + Iterator output_it(dst, win_output); + Iterator biases_it{}; + + if(has_biases) + { + biases_it = Iterator(biases, win_weights); + } + + execute_window_loop(execution_window, [&](const Coordinates & id) + { + std::vector<T> acc(depth_multiplier, static_cast<T>(0)); + + const int input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; + const int input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; + int input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; + + auto weights_ptr = weights_it.ptr(); + for(size_t h = 0; h < run_info.weights_height; ++h) + { + int offs = input_offset; + for(size_t w = 0; w < run_info.weights_width; ++w) + { + const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); + const auto input_val = is_valid_region ? *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) : T(0); + + for(size_t m = 0; m < depth_multiplier; ++m) + { + const auto weights_val = *(reinterpret_cast<T *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y)); + acc.at(m) = support::cpp11::fma(weights_val, input_val, acc.at(m)); + } + + offs += dilation.x() * run_info.input_stride_y; + } + + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; + } + + if(has_biases) + { + for(size_t m = 0; m < depth_multiplier; ++m) + { + const auto biases_val = *(reinterpret_cast<T *>(biases_it.ptr() + m * sizeof(T))); + *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m) + biases_val; + } + } + else + { + for(size_t m = 0; m < depth_multiplier; ++m) + { + *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m); + } + } + }, + input_it, weights_it, biases_it, output_it); +} + +template <typename T, typename TW> +void depthwise_loop_multiplier1_quantized(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info, + const Size2D &dilation, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases) // NOLINT +{ + ARM_COMPUTE_UNUSED(output_multiplier, output_shift); + constexpr auto element_per_vector = vector_size / sizeof(T); + using VectorType = typename wrapper::traits::neon_vector<T, element_per_vector>::type; + using TagType = typename wrapper::traits::neon_vector<T, element_per_vector>::tag_type; + using AccType = int32_t; + using AccArrayType = std::array<AccType, element_per_vector>; + + const auto out_of_bound_value = PixelValue(static_cast<uint64_t>(0), src->info()->data_type(), src->info()->quantization_info()).get<T>(); + const auto out_of_bound_vector = wrapper::vdup_n(static_cast<T>(out_of_bound_value), TagType{}); + + const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window); + + const int32_t input_qoffset = src->info()->quantization_info().uniform().offset; + const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset; + const int32_t output_qoffset = dst->info()->quantization_info().uniform().offset; + const int32_t k_offset = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset; + + Window execution_window = window; + execution_window.set(Window::DimX, dim_single_unit_step); + + Window win_input = window; + win_input.set(Window::DimX, dim_manual_loop); + win_input.set(Window::DimY, dim_manual_loop); + win_input.set(Window::DimZ, dim_manual_loop); + + Window win_weights = win_input; + win_weights.set(Window::DimW, dim_manual_loop); + + Window win_output = window; + win_output.set(Window::DimX, dim_manual_loop); + + Iterator input_it(src, win_input); + Iterator weights_it(weights, win_weights); + Iterator output_it(dst, win_output); + Iterator biases_it{}; + + if(has_biases) + { + biases_it = Iterator(biases, win_weights); + } + + execute_window_loop(execution_window, [&](const Coordinates & id) + { + const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; + const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; + const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; + auto const base_weights_ptr = weights_it.ptr(); + size_t x = run_info.x_start; + + for(; x < run_info.x_leftover_start; x += run_info.x_step) + { + AccArrayType acc{}; + AccArrayType in_sum{}; + AccArrayType we_sum{}; + + auto weights_ptr = base_weights_ptr; + auto input_offset = base_input_offset; + + for(size_t h = 0; h < run_info.weights_height; ++h) + { + int64_t offs = input_offset + x * sizeof(T); + for(size_t w = 0; w < run_info.weights_width; ++w) + { + const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); + const auto input_vals = is_valid_region ? + wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) : + out_of_bound_vector; + const auto weights_vals = wrapper::vload(reinterpret_cast<TW *>(weights_ptr + w * run_info.weights_stride_y) + x); + + for(size_t i = 0; i < element_per_vector; ++i) + { + acc.at(i) += input_vals[i] * weights_vals[i]; + in_sum.at(i) += input_vals[i]; + we_sum.at(i) += weights_vals[i]; + } + + offs += dilation.x() * run_info.input_stride_y; + } + + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; + } + + VectorType out_vals = wrapper::vdup_n(static_cast<T>(0), TagType{}); + for(size_t i = 0; i < element_per_vector; ++i) + { + acc.at(i) -= in_sum.at(i) * weights_qoffset; + acc.at(i) -= we_sum.at(i) * input_qoffset; + acc.at(i) += k_offset; + + if(has_biases) + { + acc.at(i) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + i * sizeof(int32_t)) + x); + } + + const int32_t out_mul = output_multiplier.at(x + i); + const int32_t out_shift = output_shift.at(x + i); + if(out_shift < 0) + { + acc.at(i) = saturating_doubling_high_mul(acc.at(i) * (1 << (-out_shift)), out_mul) + output_qoffset; + } + else + { + acc.at(i) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(i), out_mul), out_shift) + output_qoffset; + } + out_vals[i] = static_cast<T>(utility::clamp<AccType, T>(acc.at(i))); + } + + wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()) + x, out_vals); + } + + // left-over + for(; x < run_info.x_end; ++x) + { + AccType acc = 0; + AccType in_sum = 0; + AccType we_sum = 0; + + auto weights_ptr = base_weights_ptr; + auto input_offset = base_input_offset; + + for(size_t h = 0; h < run_info.weights_height; ++h) + { + int64_t offs = input_offset + x * sizeof(T); + for(size_t w = 0; w < run_info.weights_width; ++w) + { + const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); + const auto input_val = is_valid_region ? + *reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset)) : + out_of_bound_value; + const auto weights_val = *(reinterpret_cast<TW *>(weights_ptr + w * run_info.weights_stride_y) + x); + + acc += input_val * weights_val; + in_sum += input_val; + we_sum += weights_val; + + offs += dilation.x() * run_info.input_stride_y; + } + + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; + } + + T out_vals{ 0 }; + + acc -= in_sum * weights_qoffset; + acc -= we_sum * input_qoffset; + acc += k_offset; + + if(has_biases) + { + acc += *(reinterpret_cast<int32_t *>(biases_it.ptr()) + x); + } + + const int32_t out_mul = output_multiplier.at(x); + const int32_t out_shift = output_shift.at(x); + + if(out_shift < 0) + { + acc = saturating_doubling_high_mul(acc * (1 << (-out_shift)), out_mul) + output_qoffset; + } + else + { + acc = rounding_divide_by_exp2(saturating_doubling_high_mul(acc, out_mul), out_shift) + output_qoffset; + } + + out_vals = static_cast<T>(utility::clamp<AccType, T>(acc)); + *(reinterpret_cast<T *>(output_it.ptr()) + x) = out_vals; + } + }, + input_it, weights_it, biases_it, output_it); +} + +template <typename T, typename TW> +void depthwise_loop_generic_quantized(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info, + const Size2D &dilation, unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases) // NOLINT +{ + using AccType = int32_t; + + const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier); + + const auto out_of_bound_value = PixelValue(static_cast<uint64_t>(0), src->info()->data_type(), src->info()->quantization_info()).get<T>(); + + const int32_t input_qoffset = src->info()->quantization_info().uniform().offset; + const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset; + const int32_t output_qoffset = dst->info()->quantization_info().uniform().offset; + const int32_t k_offset = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset; + + Window execution_window = window; + execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1)); + + Window win_input = execution_window; + win_input.set(Window::DimY, dim_manual_loop); + win_input.set(Window::DimZ, dim_manual_loop); + + Window win_weights = window; + win_weights.set_dimension_step(Window::DimX, run_info.x_step); + win_weights.set(Window::DimY, dim_manual_loop); + win_weights.set(Window::DimZ, dim_manual_loop); + win_weights.set(Window::DimW, dim_manual_loop); + + Window win_output = window; + win_output.set_dimension_step(Window::DimX, run_info.x_step); + + Iterator input_it(src, win_input); + Iterator weights_it(weights, win_weights); + Iterator output_it(dst, win_output); + Iterator biases_it{}; + + if(has_biases) + { + biases_it = Iterator(biases, win_weights); + } + + execute_window_loop(execution_window, [&](const Coordinates & id) + { + std::vector<AccType> acc(depth_multiplier, 0); + std::vector<AccType> we_sum(depth_multiplier, 0); + AccType in_sum = 0; + + const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; + const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; + int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; + + auto weights_ptr = weights_it.ptr(); + for(size_t h = 0; h < run_info.weights_height; ++h) + { + int offs = input_offset; + for(size_t w = 0; w < run_info.weights_width; ++w) + { + const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation); + const auto input_val = is_valid_region ? *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) : out_of_bound_value; + + for(size_t m = 0; m < depth_multiplier; ++m) + { + const auto weights_val = *(reinterpret_cast<TW *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y)); + acc.at(m) += input_val * weights_val; + + we_sum.at(m) += weights_val; + } + + offs += dilation.x() * run_info.input_stride_y; + in_sum += input_val; + } + + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; + } + + for(size_t m = 0; m < depth_multiplier; ++m) + { + acc.at(m) -= in_sum * weights_qoffset; + acc.at(m) -= we_sum.at(m) * input_qoffset; + acc.at(m) += k_offset; + + if(has_biases) + { + acc.at(m) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t))); + } + + const int32_t out_mul = output_multiplier.at(id.x() * depth_multiplier + m); + const int32_t out_shift = output_shift.at(id.x() * depth_multiplier + m); + if(out_shift < 0) + { + acc.at(m) = saturating_doubling_high_mul(acc.at(m) * (1 << (-out_shift)), out_mul) + output_qoffset; + } + else + { + acc.at(m) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(m), out_mul), out_shift) + output_qoffset; + } + *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = static_cast<T>(utility::clamp<AccType, T>(acc.at(m))); + } + }, + input_it, weights_it, biases_it, output_it); +} + +template <typename T, typename TW> +void depthwise_loop_pow2_quantized_per_tensor(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info, + const Size2D &dilation, unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases) // NOLINT +{ + constexpr int half_vec = vector_size / 2; + + using AccType = int32_t; + using AccVectorType = typename wrapper::traits::neon_vector<AccType, half_vec>::type; + using AccVectorTagType = typename wrapper::traits::neon_vector<AccType, half_vec>::tag_type; + using TagType = typename wrapper::traits::neon_vector<T, vector_size>::tag_type; + + const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier); + + const auto input_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast<T>(src->info()->quantization_info().uniform().offset), TagType{}))); + const auto weights_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast<TW>(weights->info()->quantization_info().uniform().offset), TagType{}))); + const auto output_qoffset_vec = wrapper::vdup_n(dst->info()->quantization_info().uniform().offset, arm_compute::wrapper::traits::vector_128_tag{}); + + const auto lower = wrapper::vdup_n(static_cast<AccType>(std::numeric_limits<T>::lowest()), AccVectorTagType{}); + const auto upper = wrapper::vdup_n(static_cast<AccType>(std::numeric_limits<T>::max()), AccVectorTagType{}); + const auto zero = wrapper::vdup_n(static_cast<AccType>(0), AccVectorTagType{}); + + const auto out_mul = output_multiplier.at(0); + const auto out_shift = output_shift.at(0); + + Window execution_window = window; + execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1)); + + Window win_input = execution_window; + win_input.set(Window::DimY, dim_manual_loop); + win_input.set(Window::DimZ, dim_manual_loop); + + Window win_weights = window; + win_weights.set_dimension_step(Window::DimX, run_info.x_step); + win_weights.set(Window::DimY, dim_manual_loop); + win_weights.set(Window::DimZ, dim_manual_loop); + win_weights.set(Window::DimW, dim_manual_loop); + + Window win_output = window; + win_output.set_dimension_step(Window::DimX, run_info.x_step); + + Iterator input_it(src, win_input); + Iterator weights_it(weights, win_weights); + Iterator output_it(dst, win_output); + Iterator biases_it{}; + + if(has_biases) + { + biases_it = Iterator(biases, win_weights); + } + + std::vector<AccVectorType> acc0(depth_multiplier / vector_size); + std::vector<AccVectorType> acc1(depth_multiplier / vector_size); + + execute_window_loop(execution_window, [&](const Coordinates & id) + { + std::fill(begin(acc0), end(acc0), zero); + std::fill(begin(acc1), end(acc1), zero); + + const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left; + const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top; + int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z; + + auto weights_ptr = weights_it.ptr(); + for(size_t h = 0; h < run_info.weights_height; ++h) + { + const int32_t current_h = input_z + h * dilation.y(); + if(current_h >= 0 && current_h < static_cast<int32_t>(run_info.input_height)) + { + int offs = input_offset; + for(size_t w = 0; w < run_info.weights_width; ++w) + { + const int32_t current_w = input_y + w * dilation.x(); + if(current_w >= 0 && current_w < static_cast<int32_t>(run_info.input_width)) + { + const auto input_8x8 = wrapper::vdup_n(*(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))), TagType{}); + const auto input_s16x8 = wrapper::vreinterpret(wrapper::vmovl(input_8x8)); + const auto input_no_offs = wrapper::vsub(input_s16x8, input_qoffset_vec); + + for(size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i) + { + const auto weights_8x8 = wrapper::vload(reinterpret_cast<TW *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y)); + const auto weights_s16x8 = wrapper::vreinterpret(wrapper::vmovl(weights_8x8)); + const auto weights_no_offs = wrapper::vsub(weights_s16x8, weights_qoffset_vec); + + acc0.at(i) = wrapper::vmlal(acc0.at(i), wrapper::vgetlow(input_no_offs), wrapper::vgetlow(weights_no_offs)); + acc1.at(i) = wrapper::vmlal(acc1.at(i), wrapper::vgethigh(input_no_offs), wrapper::vgethigh(weights_no_offs)); + } + } + + offs += dilation.x() * run_info.input_stride_y; + } + } + + weights_ptr += run_info.weights_stride_z; + input_offset += dilation.y() * run_info.input_stride_z; + } + + for(size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i) + { + if(has_biases) + { + const auto bias_val0 = wrapper::vloadq(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t))); + const auto bias_val1 = wrapper::vloadq(reinterpret_cast<int32_t *>(biases_it.ptr() + (m + half_vec) * sizeof(int32_t))); + + acc0.at(i) = wrapper::vadd(acc0.at(i), bias_val0); + acc1.at(i) = wrapper::vadd(acc1.at(i), bias_val1); + } + + if(out_shift < 0) + { + acc0.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc0.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec); + acc1.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc1.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec); + } + else + { + acc0.at(i) = wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc0.at(i), out_mul), out_shift), output_qoffset_vec); + acc1.at(i) = wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc1.at(i), out_mul), out_shift), output_qoffset_vec); + } + + acc0.at(i) = wrapper::vmin(wrapper::vmax(acc0.at(i), lower), upper); + acc1.at(i) = wrapper::vmin(wrapper::vmax(acc1.at(i), lower), upper); + + const auto out_val = wrapper::vcombine(wrapper::vmovn(acc0.at(i)), + wrapper::vmovn(acc1.at(i))); + + if(std::is_same<T, uint8_t>::value) + { + wrapper::vstore(reinterpret_cast<uint8_t *>(output_it.ptr() + m * sizeof(uint8_t)), wrapper::vqmovn(vreinterpretq_u16_s16(out_val))); + } + else + { + wrapper::vstore(reinterpret_cast<int8_t *>(output_it.ptr() + m * sizeof(int8_t)), wrapper::vqmovn(out_val)); + } + } + }, + input_it, weights_it, biases_it, output_it); +} + +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(info.depth_multiplier == 0); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(1) + (weights->dimension(1) - 1) * (info.dilation.x() - 1) > src->dimension(1) + info.pad_stride_info.pad_left() + info.pad_stride_info.pad_right()); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) + (weights->dimension(2) - 1) * (info.dilation.y() - 1) > src->dimension(2) + info.pad_stride_info.pad_top() + info.pad_stride_info.pad_bottom()); + ARM_COMPUTE_RETURN_ERROR_ON((src->dimension(0) * info.depth_multiplier) != weights->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON((info.dilation.x() < 1) || (info.dilation.y() < 1)); + ARM_COMPUTE_RETURN_ERROR_ON((info.pad_stride_info.stride().first < 1) || (info.pad_stride_info.stride().second < 1)); + + if(is_data_type_quantized_per_channel(weights->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->quantization_info().scale().size()); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); + } + + if(biases != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(0)); + + if(is_data_type_quantized_asymmetric(src->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases); + } + } + + if(dst->total_size() != 0) + { + const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), output_shape); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + } + + return Status{}; +} +} // namespace + +void CpuDepthwiseConv2dNativeKernel::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, (biases != nullptr) ? biases : nullptr, dst, info)); + + _conv_info = info.pad_stride_info; + _depth_multiplier = info.depth_multiplier; + _dilation = info.dilation; + _has_biases = (biases != nullptr); + + if(is_data_type_quantized(src->data_type())) + { + const auto input_scale = src->quantization_info().uniform().scale; + const auto output_scale = dst->quantization_info().uniform().scale; + + auto weights_scale = weights->quantization_info().scale(); + if(!is_data_type_quantized_per_channel(weights->data_type())) + { + for(size_t i = 1; i < weights->dimension(channel_idx); ++i) + { + weights_scale.push_back(weights_scale.front()); + } + } + + for(const auto &s : weights_scale) + { + int32_t out_mult = 0; + int32_t out_shift = 0; + const float multiplier = input_scale * s / output_scale; + arm_compute::quantization::calculate_quantized_multiplier(multiplier, &out_mult, &out_shift); + + _output_multiplier.push_back(out_mult); + _output_shift.push_back(out_shift); + } + } + + switch(weights->data_type()) + { + case DataType::QASYMM8: + _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<uint8_t, uint8_t>; + break; + case DataType::QASYMM8_SIGNED: + _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<int8_t, int8_t>; + break; + case DataType::QSYMM8_PER_CHANNEL: + if(src->data_type() == DataType::QASYMM8) + { + _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<uint8_t, int8_t>; + } + else + { + _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<int8_t, int8_t>; + } + break; +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<float16_t, float16_t>; + break; +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F32: + _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<float, float>; + break; + default: + ARM_COMPUTE_ERROR("Data type not supported"); + break; + } + + const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info); + auto_init_if_empty(*dst, src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_quantization_info(dst->quantization_info())); + + Window win = calculate_max_window(*dst, Steps()); + ICpuKernel::configure(win); +} + +Status CpuDepthwiseConv2dNativeKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, info)); + return Status{}; +} + +template <typename T, typename TW, CpuDepthwiseConv2dNativeKernel::FloatEnalber<T>> +void CpuDepthwiseConv2dNativeKernel::run_depthwise(const ITensor *src, const ITensor *weights, const ITensor *biases, + ITensor *dst, const Window &window, bool has_biases) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + if(_depth_multiplier == 1) + { + depthwise_loop_multiplier1_fp<T>(src, weights, biases, dst, _conv_info, _dilation, window, has_biases); + } + else + { + depthwise_loop_generic_fp<T>(src, weights, biases, dst, _conv_info, _dilation, _depth_multiplier, window, has_biases); + } +} + +template <typename T, typename TW, CpuDepthwiseConv2dNativeKernel::Quantized8bitEnalber<T>> +void CpuDepthwiseConv2dNativeKernel::run_depthwise(const ITensor *src, const ITensor *weights, const ITensor *biases, + ITensor *dst, const Window &window, bool has_biases) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + if(_depth_multiplier == 1) + { + depthwise_loop_multiplier1_quantized<T, TW>(src, weights, biases, dst, _conv_info, _dilation, _output_multiplier, _output_shift, window, has_biases); + } + else + { + const bool is_pow2 = ((_depth_multiplier & (_depth_multiplier - 1)) == 0); + const bool is_quantized_per_tensor = !(is_data_type_quantized_per_channel(weights->info()->data_type())); + + if(is_pow2 && is_quantized_per_tensor && _depth_multiplier >= 8) + { + depthwise_loop_pow2_quantized_per_tensor<T, TW>(src, weights, biases, dst, _conv_info, _dilation, _depth_multiplier, _output_multiplier, _output_shift, window, has_biases); + } + else + { + depthwise_loop_generic_quantized<T, TW>(src, weights, biases, dst, _conv_info, _dilation, _depth_multiplier, _output_multiplier, _output_shift, window, has_biases); + } + } +} + +void CpuDepthwiseConv2dNativeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + + const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); + const auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); + const auto biases = tensors.get_const_tensor(TensorType::ACL_SRC_2); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + (this->*_func)(src, weights, biases, dst, window, _has_biases); +} + +const char *CpuDepthwiseConv2dNativeKernel::name() const +{ + return "CpuDepthwiseConv2dNativeKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h b/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h new file mode 100644 index 0000000000..1afb6bed4c --- /dev/null +++ b/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_NATIVE_KERNEL_H +#define ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_NATIVE_KERNEL_H + +#include "arm_compute/core/utils/misc/Traits.h" +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" +#include "support/Requires.h" + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#include <arm_neon.h> +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Interface for the kernel to run a depthwise convolution native on a tensor. */ +class CpuDepthwiseConv2dNativeKernel : public ICpuKernel +{ +public: + CpuDepthwiseConv2dNativeKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDepthwiseConv2dNativeKernel); + + /** Initialize the function's source, destination and parameters. + * + * @note Supported data layouts: NHWC + * + * @param[in] src Source tensor. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] weights Weights tensor. This is a 3D tensor with dimensions [IFM, W, H]. + * Data type supported: Same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED. + * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed. + * Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED. + * @param[out] dst Destination tensor. Data type supported: Same as @p src. + * @param[in] info Depthwise convolution meta-data. + * + */ + void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuDepthwiseConv2dNativeKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + template <typename T> + using FloatEnalber = typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, int>::type; + + template <typename T, typename TW, FloatEnalber<T> = 0> + void run_depthwise(const ITensor *src, const ITensor *weights, const ITensor *bias, ITensor *dst, const Window &window, bool has_biases); + + template <typename T> + using Quantized8bitEnalber = typename std::enable_if < std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value, int >::type; + + template <typename T, typename TW, Quantized8bitEnalber<T> = 0> + void run_depthwise(const ITensor *src, const ITensor *weights, const ITensor *bias, ITensor *dst, const Window &window, bool has_biases); + + /** Common signature for all the specialised depthwise convolution native functions + * + * @param[in] window Region on which to execute the kernel. + */ + using DepthwiseFunctionPtr = void (CpuDepthwiseConv2dNativeKernel::*)(const ITensor *src, const ITensor *weights, const ITensor *bias, ITensor *dst, const Window &window, bool has_biases); + + DepthwiseFunctionPtr _func{ nullptr }; + PadStrideInfo _conv_info{}; + unsigned int _depth_multiplier{ 1 }; + Size2D _dilation{}; + std::vector<int> _output_multiplier{}; + std::vector<int> _output_shift{}; + bool _has_biases{ false }; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_NATIVE_KERNEL_H */ diff --git a/src/cpu/kernels/CpuDequantizeKernel.cpp b/src/cpu/kernels/CpuDequantizeKernel.cpp new file mode 100644 index 0000000000..a2d24f9243 --- /dev/null +++ b/src/cpu/kernels/CpuDequantizeKernel.cpp @@ -0,0 +1,400 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuDequantizeKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/NESymm.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8, DataType::QSYMM16); + + if(dst->tensor_shape().total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); + } + + return Status{}; +} + +template <typename T> +inline void store_result(T *ptr, const float32x4x4_t &v) +{ + ARM_COMPUTE_UNUSED(ptr, v); +} + +template <> +inline void store_result<float>(float *ptr, const float32x4x4_t &v) +{ + wrapper::vstore(ptr, v.val[0]); + wrapper::vstore(ptr + 4, v.val[1]); + wrapper::vstore(ptr + 8, v.val[2]); + wrapper::vstore(ptr + 12, v.val[3]); +} + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +template <> +inline void store_result<float16_t>(float16_t *ptr, const float32x4x4_t &v) +{ + wrapper::vstore(ptr, vcombine_f16(vcvt_f16_f32(v.val[0]), vcvt_f16_f32(v.val[1]))); + wrapper::vstore(ptr + 8, vcombine_f16(vcvt_f16_f32(v.val[2]), vcvt_f16_f32(v.val[3]))); +} +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + +template <typename T> +inline void store_result(T *ptr, const float32x4x2_t &v) +{ + ARM_COMPUTE_UNUSED(ptr, v); +} + +template <> +inline void store_result<float>(float *ptr, const float32x4x2_t &v) +{ + wrapper::vstore(ptr, v.val[0]); + wrapper::vstore(ptr + 4, v.val[1]); +} + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +template <> +inline void store_result<float16_t>(float16_t *ptr, const float32x4x2_t &v) +{ + wrapper::vstore(ptr, vcombine_f16(vcvt_f16_f32(v.val[0]), vcvt_f16_f32(v.val[1]))); +} +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + +template <typename TOut, typename TIn> +void run_dequantization_qasymm8(const ITensor *input, ITensor *output, const Window &window) +{ + const UniformQuantizationInfo &qinfo = input->info()->quantization_info().uniform(); + const float scale = qinfo.scale; + const int32_t offset = qinfo.offset; + + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + // Collapse window and reset first dimension to handle tail calculations manually + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + // Create iterators + Iterator in(input, win_collapsed); + Iterator out(output, win_collapsed); + + execute_window_loop(win_collapsed, [&](const Coordinates &) + { + const auto in_ptr = reinterpret_cast<const TIn *>(in.ptr()); + const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()); + + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin = wrapper::vloadq(in_ptr + x); + const auto vdeq = vdequantize(vin, scale, offset); + + store_result(reinterpret_cast<TOut *>(out_ptr + x), vdeq); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + auto val = *(in_ptr + x); + *(out_ptr + x) = static_cast<TOut>(Qasymm8QuantizationHelper<TIn>::dequantize(val, qinfo)); + } + }, + in, out); +} + +template <typename T> +void run_dequantization_qsymm8_per_channel_nchw(const ITensor *input, ITensor *output, const Window &window) +{ + const auto scale = input->info()->quantization_info().scale(); + + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + // Reset first dimension to handle tail calculations manually + Window win(window); + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + // Create iterators + Iterator in(input, win); + Iterator out(output, win); + + execute_window_loop(win, [&](const Coordinates & id) + { + const auto in_ptr = reinterpret_cast<const int8_t *>(in.ptr()); + const auto out_ptr = reinterpret_cast<T *>(out.ptr()); + + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin = wrapper::vloadq(in_ptr + x); + const auto vdeq = vdequantize(vin, scale[id.z()]); + + store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + int8_t val = *(in_ptr + x); + *(out_ptr + x) = static_cast<T>(dequantize(val, scale[id.z()])); + } + }, + in, out); +} + +template <typename T> +void run_dequantization_qsymm8_per_channel_nhwc(const ITensor *input, ITensor *output, const Window &window) +{ + const auto scale = input->info()->quantization_info().scale(); + + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + // Reset first dimension to handle tail calculations manually + Window win(window); + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + // Create iterators + Iterator in(input, win); + Iterator out(output, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto in_ptr = reinterpret_cast<const int8_t *>(in.ptr()); + const auto out_ptr = reinterpret_cast<T *>(out.ptr()); + + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float32x4x4_t vscale = + { + { + scale[x + 0], scale[x + 1], scale[x + 2], scale[x + 3], + scale[x + 4], scale[x + 5], scale[x + 6], scale[x + 7], + scale[x + 8], scale[x + 9], scale[x + 10], scale[x + 11], + scale[x + 12], scale[x + 13], scale[x + 14], scale[x + 15] + } + }; + const auto vin = wrapper::vloadq(in_ptr + x); + const auto vdeq = vdequantize(vin, vscale); + + store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + int8_t val = *(in_ptr + x); + *(out_ptr + x) = static_cast<T>(dequantize(val, scale[x])); + } + }, + in, out); +} + +template <typename T> +void run_dequantization_qsymm8(const ITensor *input, ITensor *output, const Window &window) +{ + const UniformQuantizationInfo &qinfo = input->info()->quantization_info().uniform(); + const float scale = qinfo.scale; + + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + // Collapse window and reset first dimension to handle tail calculations manually + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + // Create iterators + Iterator in(input, win_collapsed); + Iterator out(output, win_collapsed); + + execute_window_loop(win_collapsed, [&](const Coordinates &) + { + const auto in_ptr = reinterpret_cast<const int8_t *>(in.ptr()); + const auto out_ptr = reinterpret_cast<T *>(out.ptr()); + + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin = wrapper::vloadq(in_ptr + x); + const auto vdeq = vdequantize(vin, scale); + + store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + int8_t val = *(in_ptr + x); + *(out_ptr + x) = static_cast<T>(dequantize(val, scale)); + } + }, + in, out); +} + +template <typename T> +void run_dequantization_qsymm16(const ITensor *input, ITensor *output, const Window &window) +{ + const UniformQuantizationInfo &qinfo = input->info()->quantization_info().uniform(); + const float scale = qinfo.scale; + + const int window_step_x = 8; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + // Collapse window and reset first dimension to handle tail calculations manually + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + // Create iterators + Iterator in(input, win_collapsed); + Iterator out(output, win_collapsed); + + execute_window_loop(win_collapsed, [&](const Coordinates &) + { + const auto in_ptr = reinterpret_cast<const int16_t *>(in.ptr()); + const auto out_ptr = reinterpret_cast<T *>(out.ptr()); + + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin = wrapper::vloadq(in_ptr + x); + const auto vdeq = vdequantize_int16(vin, scale); + + store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + int16_t val = *(in_ptr + x); + *(out_ptr + x) = static_cast<T>(dequantize_qsymm16(val, scale)); + } + }, + in, out); +} + +template <typename T> +void run_dequantization_core(const ITensor *input, ITensor *output, const Window &window) +{ + switch(input->info()->data_type()) + { + case DataType::QASYMM8: + run_dequantization_qasymm8<T, uint8_t>(input, output, window); + break; + case DataType::QASYMM8_SIGNED: + run_dequantization_qasymm8<T, int8_t>(input, output, window); + break; + case DataType::QSYMM8_PER_CHANNEL: + input->info()->data_layout() == DataLayout::NHWC ? run_dequantization_qsymm8_per_channel_nhwc<T>(input, output, window) : run_dequantization_qsymm8_per_channel_nchw<T>(input, output, window); + break; + case DataType::QSYMM8: + run_dequantization_qsymm8<T>(input, output, window); + break; + case DataType::QSYMM16: + run_dequantization_qsymm16<T>(input, output, window); + break; + default: + ARM_COMPUTE_ERROR("Unsupported data type."); + } +} +} // namespace + +void CpuDequantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); + + // Configure kernel window + Window win = calculate_max_window(*src, Steps()); + + // Output tensor auto initialization if not yet initialized + auto_init_if_empty(*dst, src->tensor_shape(), 1, DataType::F32); + + ICpuKernel::configure(win); +} + +Status CpuDequantizeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst)); + return Status{}; +} + +void CpuDequantizeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + switch(dst->info()->data_type()) + { + case DataType::F32: + run_dequantization_core<float>(src, dst, window); + break; +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + run_dequantization_core<float16_t>(src, dst, window); + break; +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + default: + ARM_COMPUTE_ERROR("Unsupported data type."); + } +} +const char *CpuDequantizeKernel::name() const +{ + return "CpuDequantizeKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuDequantizeKernel.h b/src/cpu/kernels/CpuDequantizeKernel.h new file mode 100644 index 0000000000..f515cd36f9 --- /dev/null +++ b/src/cpu/kernels/CpuDequantizeKernel.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_DEQUANTIZE_KERNEL_H +#define ARM_COMPUTE_CPU_DEQUANTIZE_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Interface for the dequantization layer kernel. */ +class CpuDequantizeKernel : public ICpuKernel +{ +public: + CpuDequantizeKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDequantizeKernel); + /** Set input, output tensors. + * + * @param[in] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16. + * @param[out] dst Destination tensor info with the same dimensions of input. Data type supported: F16/F32. + */ + void configure(const ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuDequantizeKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_DEQUANTIZE_KERNEL_H */ diff --git a/src/cpu/kernels/CpuDirectConv2dKernel.cpp b/src/cpu/kernels/CpuDirectConv2dKernel.cpp new file mode 100644 index 0000000000..db1b5f3c54 --- /dev/null +++ b/src/cpu/kernels/CpuDirectConv2dKernel.cpp @@ -0,0 +1,1385 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuDirectConv2dKernel.h" + +#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h" +#include "src/core/NEON/wrapper/wrapper.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include <algorithm> + +using namespace arm_compute::detail; + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +template <unsigned int stridex> +float16x8_t internal_vld1q(const float16_t *in); + +template <> +float16x8_t internal_vld1q<1>(const float16_t *in) +{ + return vld1q_f16(in); +} + +template <> +float16x8_t internal_vld1q<2>(const float16_t *in) +{ + const float16x8x2_t tmp = vld2q_f16(in); + return tmp.val[0]; +} + +template <> +float16x8_t internal_vld1q<3>(const float16_t *in) +{ + const float16x8x3_t tmp = vld3q_f16(in); + return tmp.val[0]; +} + +inline float16x8_t internal_vdupq_n(float16_t v) +{ + return vdupq_n_f16(v); +} + +inline void internal_vst1q(float16_t *p, const float16x8_t &v) +{ + vst1q_f16(p, v); +} + +float16x8_t internal_vmull(const float16x8_t &x, const float16x8_t &y) +{ + return vmulq_f16(x, y); +} + +inline float16x8_t internal_vmlal(const float16x8_t &x, const float16x8_t &y, const float16x8_t &z) +{ + return vaddq_f16(x, vmulq_f16(y, z)); +} +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + +template <unsigned int stridex> +float32x4_t internal_vld1q(const float *in); + +template <> +float32x4_t internal_vld1q<1>(const float *in) +{ + return vld1q_f32(in); +} + +template <> +float32x4_t internal_vld1q<2>(const float *in) +{ + const float32x4x2_t tmp = vld2q_f32(in); + return tmp.val[0]; +} + +template <> +float32x4_t internal_vld1q<3>(const float *in) +{ + const float32x4x3_t tmp = vld3q_f32(in); + return tmp.val[0]; +} + +inline float32x4_t internal_vdupq_n(float v) +{ + return vdupq_n_f32(v); +} + +inline void internal_vst1q(float *p, const float32x4_t &v) +{ + vst1q_f32(p, v); +} + +float32x4_t internal_vmull(const float32x4_t &x, const float32x4_t &y) +{ + return vmulq_f32(x, y); +} + +inline float32x4_t internal_vmlal(const float32x4_t &x, const float32x4_t &y, const float32x4_t &z) +{ + return vmlaq_f32(x, y, z); +} + +constexpr int small_tensor_size_optim = 8; +inline bool run_optim_small_tensor_info(const ITensorInfo *t) +{ + return t->dimension(Window::DimX) <= small_tensor_size_optim && t->dimension(Window::DimY) <= small_tensor_size_optim; +} + +inline bool run_optim_small_tensor(const ITensor *t) +{ + return run_optim_small_tensor_info(t->info()); +} + +// Optimized convolver for 1x1 kernels used only where input width and height are both <= 8 +// For big Z as in Input=7x7x832, this implementation is faster than the general code becuase it doesn't need to +// store intermidiate results in memory. Temporary results are stored in SIMD registers directly and then written to the output buffer. +template <unsigned int stridex> +class convolver_w1x1_i8x8_f32 +{ +public: + static void convolve(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) + { + ARM_COMPUTE_ERROR_ON(src->info()->dimension(Window::DimX) > small_tensor_size_optim); + ARM_COMPUTE_ERROR_ON(src->info()->dimension(Window::DimY) > small_tensor_size_optim); + + const int input_stride_x = src->info()->strides_in_bytes().x(); + const int input_stride_y = src->info()->strides_in_bytes().y(); + const int input_stride_z = src->info()->strides_in_bytes().z(); + const int output_stride_y = dst->info()->strides_in_bytes().y(); + const int output_stride_z = dst->info()->strides_in_bytes().z(); + const int kernel_stride_z = weights->info()->strides_in_bytes().z(); + const int kernel_stride_w = weights->info()->strides_in_bytes()[3]; + const int output_h = dst->info()->dimension(1); + const int range_z = window.z().end() - window.z().start(); + const int kernel_depth = weights->info()->dimension(Window::DimZ); + const unsigned int conv_stride_y = std::get<1>(conv_info.stride()); + const unsigned int conv_pad_left = conv_info.pad_left(); + const unsigned int conv_pad_top = conv_info.pad_top(); + + // setup output window for the iterator + Window window_out = window; + window_out.set(Window::DimX, Window::Dimension(0, dst->info()->dimension(Window::DimX), dst->info()->dimension(Window::DimX))); + window_out.set(Window::DimY, Window::Dimension(0, dst->info()->dimension(Window::DimY), dst->info()->dimension(Window::DimY))); + window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), range_z)); + + // setup input window for the iterator + Window window_in = window; + // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0 + window_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + window_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + window_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + Window window_k = calculate_max_window(*weights->info(), Steps(1u)); + Iterator out(dst, window_out); + Iterator in(src, window_in); + Iterator k(weights, window_k); + + const uint8_t *k_ptr = k.ptr(); + + execute_window_loop(window_out, [&](const Coordinates & id) + { + const uint8_t *input_ptr = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top * input_stride_y; + uint8_t *out_ptr = out.ptr(); + int ih = 0; + int oh = 0; + std::array<float32x4_t, 8> accum0 = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) }; + std::array<float32x4_t, 8> accum1 = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) }; + for(int oz = 0; oz < range_z; ++oz) + { + accum0[0] = accum0[1] = accum0[2] = accum0[3] = accum0[4] = accum0[5] = accum0[6] = accum0[7] = vdupq_n_f32(0.f); + accum1[0] = accum1[1] = accum1[2] = accum1[3] = accum1[4] = accum1[5] = accum1[6] = accum1[7] = vdupq_n_f32(0.f); + auto p_out_base = out_ptr + oz * output_stride_z; + for(int p = 0; p < kernel_depth; ++p) + { + const auto k_val = reinterpret_cast<const float *>(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w); + const auto vk0 = internal_vdupq_n(*k_val); + for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y) + { + const int offset_xy = ih * input_stride_y; + auto in_val = reinterpret_cast<const float *>(input_ptr + p * input_stride_z + offset_xy); + auto v_in0 = internal_vld1q<stridex>(in_val); + auto v_in1 = internal_vld1q<stridex>(in_val + 4); + accum0[oh] = vmlaq_f32(accum0[oh], vk0, v_in0); + accum1[oh] = vmlaq_f32(accum1[oh], vk0, v_in1); + } + } + for(oh = 0; oh < output_h; ++oh) + { + auto p_out = reinterpret_cast<float *>(p_out_base + oh * output_stride_y); + vst1q_f32(p_out, accum0[oh]); + vst1q_f32(p_out + 4, accum1[oh]); + } + } + }, + in, out); + } +}; + +template <typename T1, typename T2, unsigned int stridex> +class convolver_1x1 +{ +public: + static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration, + const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) + { + const int input_stride_x = src->info()->strides_in_bytes().x(); + const int input_stride_y = src->info()->strides_in_bytes().y(); + const int input_stride_z = src->info()->strides_in_bytes().z(); + const int output_stride_y = dst->info()->strides_in_bytes().y(); + const int output_stride_z = dst->info()->strides_in_bytes().z(); + const int kernel_stride_z = weights->info()->strides_in_bytes().z(); + const int kernel_stride_w = weights->info()->strides_in_bytes()[3]; + const int output_w = dst->info()->dimension(0); + const int output_h = dst->info()->dimension(1); + const int range_z = window.z().end() - window.z().start(); + const int kernel_depth = weights->info()->dimension(Window::DimZ); + const unsigned int conv_stride_y = std::get<1>(conv_info.stride()); + const unsigned int conv_pad_left = conv_info.pad_left(); + const unsigned int conv_pad_top = conv_info.pad_top(); + + // setup output window for the iterator + Window window_out = window; + window_out.set(Window::DimX, Window::Dimension(0, dst->info()->dimension(Window::DimX), dst->info()->dimension(Window::DimX))); + window_out.set(Window::DimY, Window::Dimension(0, dst->info()->dimension(Window::DimY), dst->info()->dimension(Window::DimY))); + window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), range_z)); + + // setup input window for the iterator + Window window_in = window; + // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0 + window_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + window_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + window_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + Window window_k = calculate_max_window(*weights->info(), Steps(1u)); + Iterator out(dst, window_out); + Iterator in(src, window_in); + Iterator k(weights, window_k); + + const uint8_t *k_ptr = k.ptr(); + + execute_window_loop(window_out, [&](const Coordinates & id) + { + /* + For a detailed explanation on how the algorithm works refer to template <> class convolver_3x3<1> + */ + const uint8_t *input_ptr = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top * input_stride_y; + uint8_t *out_ptr = out.ptr(); + int ih = 0; + int oh = 0; + for(int oz = 0; oz < range_z; ++oz) + { + auto p_out_base = out_ptr + oz * output_stride_z; + // Step 1 + { + const auto k_val = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + (id.z() + oz) * kernel_stride_w); + const auto vk = internal_vdupq_n(*k_val); + for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y) + { + const int offset_xy = ih * input_stride_y; + auto in_val = reinterpret_cast<const T1 *>(input_ptr + (0 * input_stride_z + offset_xy)); + auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y); + for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_val += num_elems_read_per_iteration, p_out += num_elems_written_per_iteration) + { + internal_vst1q(p_out, internal_vmull(vk, internal_vld1q<stridex>(in_val))); + } + } + } + + // Step 2 + for(int p = 1; p < kernel_depth; ++p) + { + const auto k_val = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w); + const auto vk = internal_vdupq_n(*k_val); + for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y) + { + const int offset_xy = ih * input_stride_y; + auto in_val = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + offset_xy); + auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y); + for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_val += num_elems_read_per_iteration, p_out += num_elems_written_per_iteration) + { + internal_vst1q(p_out, internal_vmlal(internal_vld1q<1>(p_out), vk, internal_vld1q<stridex>(in_val))); + } + } + } + } + }, + in, out); + } +}; + +template <unsigned int stridex> +float32x4x2_t convolve_5x5(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4, + const float *m0, const float *m1, const float *m2, const float *m3, const float *m4); + +inline float32x4x3_t load_matrix_hi(const float *const m0, const float *const m1, const float *const m2) +{ + const float32x4x3_t m00 = + { + { + vld1q_dup_f32(m0), + vld1q_dup_f32(m1), + vld1q_dup_f32(m2) + } + }; + return m00; +} + +inline float32x4x2_t load_matrix_lo(const float *const m3, const float *const m4) +{ + const float32x4x2_t m00 = + { + { + vld1q_dup_f32(m3), + vld1q_dup_f32(m4) + } + }; + return m00; +} + +inline float32x4x3_t load_input(const float *const in) +{ + const float32x4x3_t vin = + { + { + vld1q_f32(in), + vld1q_f32(in + 4), + vld1q_f32(in + 8) + } + }; + return vin; +} + +template <> +inline float32x4x2_t convolve_5x5<1>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4, + const float *m0, const float *m1, const float *m2, const float *m3, const float *m4) +{ + const float32x4x3_t vin0 = load_input(in_0); + const float32x4x3_t vin1 = load_input(in_1); + const float32x4x3_t vin2 = load_input(in_2); + const float32x4x3_t vin3 = load_input(in_3); + const float32x4x3_t vin4 = load_input(in_4); + const float32x4x3_t m00 = load_matrix_hi(m0, 1 + m0, 2 + m0); + const float32x4x2_t m01 = load_matrix_lo(3 + m0, 4 + m0); + const float32x4x3_t m10 = load_matrix_hi(m1, 1 + m1, 2 + m1); + const float32x4x2_t m11 = load_matrix_lo(3 + m1, 4 + m1); + const float32x4x3_t m20 = load_matrix_hi(m2, 1 + m2, 2 + m2); + const float32x4x2_t m21 = load_matrix_lo(3 + m2, 4 + m2); + const float32x4x3_t m30 = load_matrix_hi(m3, 1 + m3, 2 + m3); + const float32x4x2_t m31 = load_matrix_lo(3 + m3, 4 + m3); + const float32x4x3_t m40 = load_matrix_hi(m4, 1 + m4, 2 + m4); + const float32x4x2_t m41 = load_matrix_lo(3 + m4, 4 + m4); + + float32x4x2_t out = + { + { + vmulq_f32(vin0.val[0], m00.val[0]), + vmulq_f32(vin0.val[1], m00.val[0]) + } + }; + + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin0.val[0], vin0.val[1], 1), m00.val[1]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin0.val[0], vin0.val[1], 2), m00.val[2]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin0.val[0], vin0.val[1], 3), m01.val[0]); + out.val[0] = vmlaq_f32(out.val[0], vin0.val[1], m01.val[1]); + + out.val[0] = vmlaq_f32(out.val[0], vin1.val[0], m10.val[0]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin1.val[0], vin1.val[1], 1), m10.val[1]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin1.val[0], vin1.val[1], 2), m10.val[2]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin1.val[0], vin1.val[1], 3), m11.val[0]); + out.val[0] = vmlaq_f32(out.val[0], vin1.val[1], m11.val[1]); + + out.val[0] = vmlaq_f32(out.val[0], vin2.val[0], m20.val[0]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin2.val[0], vin2.val[1], 1), m20.val[1]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin2.val[0], vin2.val[1], 2), m20.val[2]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin2.val[0], vin2.val[1], 3), m21.val[0]); + out.val[0] = vmlaq_f32(out.val[0], vin2.val[1], m21.val[1]); + + out.val[0] = vmlaq_f32(out.val[0], vin3.val[0], m30.val[0]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin3.val[0], vin3.val[1], 1), m30.val[1]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin3.val[0], vin3.val[1], 2), m30.val[2]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin3.val[0], vin3.val[1], 3), m31.val[0]); + out.val[0] = vmlaq_f32(out.val[0], vin3.val[1], m31.val[1]); + + out.val[0] = vmlaq_f32(out.val[0], vin4.val[0], m40.val[0]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin4.val[0], vin4.val[1], 1), m40.val[1]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin4.val[0], vin4.val[1], 2), m40.val[2]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin4.val[0], vin4.val[1], 3), m41.val[0]); + out.val[0] = vmlaq_f32(out.val[0], vin4.val[1], m41.val[1]); + + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin0.val[1], vin0.val[2], 1), m00.val[1]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin0.val[1], vin0.val[2], 2), m00.val[2]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin0.val[1], vin0.val[2], 3), m01.val[0]); + out.val[1] = vmlaq_f32(out.val[1], vin0.val[2], m01.val[1]); + + out.val[1] = vmlaq_f32(out.val[1], vin1.val[1], m10.val[0]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin1.val[1], vin1.val[2], 1), m10.val[1]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin1.val[1], vin1.val[2], 2), m10.val[2]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin1.val[1], vin1.val[2], 3), m11.val[0]); + out.val[1] = vmlaq_f32(out.val[1], vin1.val[2], m11.val[1]); + + out.val[1] = vmlaq_f32(out.val[1], vin2.val[1], m20.val[0]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin2.val[1], vin2.val[2], 1), m20.val[1]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin2.val[1], vin2.val[2], 2), m20.val[2]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin2.val[1], vin2.val[2], 3), m21.val[0]); + out.val[1] = vmlaq_f32(out.val[1], vin2.val[2], m21.val[1]); + + out.val[1] = vmlaq_f32(out.val[1], vin3.val[1], m30.val[0]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin3.val[1], vin3.val[2], 1), m30.val[1]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin3.val[1], vin3.val[2], 2), m30.val[2]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin3.val[1], vin3.val[2], 3), m31.val[0]); + out.val[1] = vmlaq_f32(out.val[1], vin3.val[2], m31.val[1]); + + out.val[1] = vmlaq_f32(out.val[1], vin4.val[1], m40.val[0]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin4.val[1], vin4.val[2], 1), m40.val[1]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin4.val[1], vin4.val[2], 2), m40.val[2]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin4.val[1], vin4.val[2], 3), m41.val[0]); + out.val[1] = vmlaq_f32(out.val[1], vin4.val[2], m41.val[1]); + + return out; +} + +template <> +inline float32x4x2_t convolve_5x5<2>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4, + const float *m0, const float *m1, const float *m2, const float *m3, const float *m4) +{ + float32x4x2_t out = convolve_5x5<1>(in_0, in_1, in_2, in_3, in_4, m0, m1, m2, m3, m4); + out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1); + out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2); + out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3); + return out; +} + +template <> +inline float32x4x2_t convolve_5x5<3>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4, + const float *m0, const float *m1, const float *m2, const float *m3, const float *m4) +{ + float32x4x2_t out = convolve_5x5<1>(in_0, in_1, in_2, in_3, in_4, m0, m1, m2, m3, m4); + out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1); + return out; +} + +template <typename T1, typename T2, unsigned int stridex> +class convolver_3x3 +{ +public: + static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration, + const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) + { + ARM_COMPUTE_UNUSED(num_elems_read_per_iteration); + const int input_stride_x = src->info()->strides_in_bytes().x(); + const int input_stride_y = src->info()->strides_in_bytes().y(); + const int input_stride_z = src->info()->strides_in_bytes().z(); + const int output_stride_y = dst->info()->strides_in_bytes().y(); + const int output_stride_z = dst->info()->strides_in_bytes().z(); + const int kernel_stride_x = weights->info()->strides_in_bytes().x(); + const int kernel_stride_y = weights->info()->strides_in_bytes().y(); + const int kernel_stride_z = weights->info()->strides_in_bytes().z(); + const int kernel_stride_w = weights->info()->strides_in_bytes()[3]; + const int output_w = dst->info()->dimension(0); + const int output_h = dst->info()->dimension(1); + const int num_planes_z = window.z().end() - window.z().start(); + const int delta_input = get_input_num_elems_processed(num_elems_written_per_iteration, stridex); + const int kernel_depth = weights->info()->dimension(Window::DimZ); + const unsigned int conv_stride_y = std::get<1>(conv_info.stride()); + const unsigned int conv_pad_left = conv_info.pad_left(); + const unsigned int conv_pad_top = conv_info.pad_top(); + + // setup output window for the iterator + Window window_out = window; + window_out.set(Window::DimX, Window::Dimension(0, dst->info()->dimension(Window::DimX), dst->info()->dimension(Window::DimX))); + window_out.set(Window::DimY, Window::Dimension(0, dst->info()->dimension(Window::DimY), dst->info()->dimension(Window::DimY))); + window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), num_planes_z)); + + // setup input window for the iterator + Window window_in = window; + // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0 + window_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + window_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + window_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + Window window_k = calculate_max_window(*weights->info(), Steps(1u)); + + Iterator out(dst, window_out); + Iterator in(src, window_in); + Iterator k(weights, window_k); + + const uint8_t *k_ptr = k.ptr(); + + execute_window_loop(window_out, [&](const Coordinates & id) + { + const uint8_t *input_ptr = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top * input_stride_y; + uint8_t *out_ptr = out.ptr(); + int ih = 0; + int oh = 0; + /* + Each thread executing this kernel computes one or more output's volume planes. + + Let's say the 3rd dimension of the output volume is 32, the first thread will compute the output for Z = [0,7], the second thread will compute the output for Z = [8,15], + the third thread [16,24] and the fourth thread [25,31]. + + The algorithm outer loop iterates over Z, P, Y, X where P is the depth/3rd dimension of each kernel. This order is not arbitrary, the main benefit of this + is that we setup the neon registers containing the kernel's values only once and then compute each XY using the preloaded registers as opposed as doing this for every XY value. + + The algorithm does not require allocating any additional memory amd computes the results directly in-place in two stages: + 1) Convolve plane 0 with kernel 0 and initialize the corresponding output plane with these values. + 2) Convolve the remaining planes and accumulate the results in the output's plane which has been initialized in step 1. + */ + for(int oz = 0; oz < num_planes_z; ++oz) + { + const int zoffset = id.z() + oz; + uint8_t *p_out_base = out_ptr + oz * output_stride_z; + // Step 1 + { + const auto ptr_k_r0 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x); + const auto ptr_k_r1 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x); + const auto ptr_k_r2 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x); + const auto vk_r0 = load_matrix_row(ptr_k_r0); + const auto vk_r1 = load_matrix_row(ptr_k_r1); + const auto vk_r2 = load_matrix_row(ptr_k_r2); + for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y) + { + auto in_top = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 0) * input_stride_y); + auto in_mid = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 1) * input_stride_y); + auto in_low = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 2) * input_stride_y); + auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y); + for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, + in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration) + { + convolve_3x3<false>(in_top, in_mid, in_low, p_out, vk_r0, vk_r1, vk_r2, stridex); + } + } + } + // Step 2 + for(int p = 1; p < kernel_depth; ++p) + { + const uint8_t *ptr_k_base = k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w; + const uint8_t *input_base = input_ptr + p * input_stride_z; + const auto ptr_k_r0 = reinterpret_cast<const T1 *>(ptr_k_base); + const auto ptr_k_r1 = reinterpret_cast<const T1 *>(ptr_k_base + kernel_stride_y); + const auto ptr_k_r2 = reinterpret_cast<const T1 *>(ptr_k_base + kernel_stride_y * 2); + const auto vk_r0 = load_matrix_row(ptr_k_r0); + const auto vk_r1 = load_matrix_row(ptr_k_r1); + const auto vk_r2 = load_matrix_row(ptr_k_r2); + for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y) + { + auto in_top = reinterpret_cast<const T1 *>(input_base + (ih + 0) * input_stride_y); + auto in_mid = reinterpret_cast<const T1 *>(input_base + (ih + 1) * input_stride_y); + auto in_low = reinterpret_cast<const T1 *>(input_base + (ih + 2) * input_stride_y); + auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y); + for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, + in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration) + { + convolve_3x3<true>(in_top, in_mid, in_low, p_out, vk_r0, vk_r1, vk_r2, stridex); + } + } + } + } + }, + in, out); + } +}; + +template <typename T1, typename T2, unsigned int stridex> +class convolver_5x5 +{ +public: + static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration, + const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) + { + ARM_COMPUTE_UNUSED(num_elems_read_per_iteration); + const int input_stride_x = src->info()->strides_in_bytes().x(); + const int input_stride_y = src->info()->strides_in_bytes().y(); + const int input_stride_z = src->info()->strides_in_bytes().z(); + const int output_stride_y = dst->info()->strides_in_bytes().y(); + const int output_stride_z = dst->info()->strides_in_bytes().z(); + const int kernel_stride_x = weights->info()->strides_in_bytes().x(); + const int kernel_stride_y = weights->info()->strides_in_bytes().y(); + const int kernel_stride_z = weights->info()->strides_in_bytes().z(); + const int kernel_stride_w = weights->info()->strides_in_bytes()[3]; + const int output_w = dst->info()->dimension(0); + const int output_h = dst->info()->dimension(1); + const int num_planes_z = window.z().end() - window.z().start(); + const int delta_input = get_input_num_elems_processed(num_elems_written_per_iteration, stridex); + const int kernel_depth = weights->info()->dimension(Window::DimZ); + const unsigned int conv_stride_y = std::get<1>(conv_info.stride()); + const unsigned int conv_pad_left = conv_info.pad_left(); + const unsigned int conv_pad_top = conv_info.pad_top(); + + // setup output window for the iterator + Window window_out = window; + window_out.set(Window::DimX, Window::Dimension(0, dst->info()->dimension(Window::DimX), dst->info()->dimension(Window::DimX))); + window_out.set(Window::DimY, Window::Dimension(0, dst->info()->dimension(Window::DimY), dst->info()->dimension(Window::DimY))); + window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), num_planes_z)); + + // setup input window for the iterator + Window window_in = window; + // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0 + window_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + window_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + window_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + Window window_k = calculate_max_window(*weights->info(), Steps(1u)); + + Iterator out(dst, window_out); + Iterator in(src, window_in); + Iterator k(weights, window_k); + + const uint8_t *k_ptr = k.ptr(); + + execute_window_loop(window_out, [&](const Coordinates & id) + { + const uint8_t *input_ptr = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top * input_stride_y; + uint8_t *out_ptr = out.ptr(); + int ih = 0; + int oh = 0; + for(int oz = 0; oz < num_planes_z; ++oz) + { + const int zoffset = id.z() + oz; + uint8_t *p_out_base = out_ptr + oz * output_stride_z; + // Step 1 + { + const auto ptr_k_r0 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x); + const auto ptr_k_r1 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x); + const auto ptr_k_r2 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x); + const auto ptr_k_r3 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 3 * kernel_stride_y + 0 * kernel_stride_x); + const auto ptr_k_r4 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 4 * kernel_stride_y + 0 * kernel_stride_x); + for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y) + { + auto in_0 = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 0) * input_stride_y); + auto in_1 = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 1) * input_stride_y); + auto in_2 = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 2) * input_stride_y); + auto in_3 = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 3) * input_stride_y); + auto in_4 = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 4) * input_stride_y); + auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y); + for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, + in_0 += delta_input, in_1 += delta_input, in_2 += delta_input, in_3 += delta_input, in_4 += delta_input, p_out += num_elems_written_per_iteration) + { + auto vres = convolve_5x5<stridex>(in_0, in_1, in_2, in_3, in_4, ptr_k_r0, ptr_k_r1, ptr_k_r2, ptr_k_r3, ptr_k_r4); + store_results<stridex>(p_out, vres); + } + } + } + // Step 2 + for(int p = 1; p < kernel_depth; ++p) + { + const auto ptr_k_r0 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x); + const auto ptr_k_r1 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x); + const auto ptr_k_r2 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x); + const auto ptr_k_r3 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 3 * kernel_stride_y + 0 * kernel_stride_x); + const auto ptr_k_r4 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 4 * kernel_stride_y + 0 * kernel_stride_x); + + for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y) + { + auto in_0 = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 0) * input_stride_y); + auto in_1 = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 1) * input_stride_y); + auto in_2 = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 2) * input_stride_y); + auto in_3 = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 3) * input_stride_y); + auto in_4 = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 4) * input_stride_y); + auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y); + for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, + in_0 += delta_input, in_1 += delta_input, in_2 += delta_input, in_3 += delta_input, in_4 += delta_input, p_out += num_elems_written_per_iteration) + { + auto vres = convolve_5x5<stridex>(in_0, in_1, in_2, in_3, in_4, ptr_k_r0, ptr_k_r1, ptr_k_r2, ptr_k_r3, ptr_k_r4); + accumulate_results<stridex>(p_out, vres); + } + } + } + } + }, + in, out); + } +}; + +float vreduce(const float32x4_t &v) +{ + auto v0 = wrapper::vgethigh(v); + auto v1 = wrapper::vgetlow(v); + auto v_out = wrapper::vadd(v0, v1); + + float a = wrapper::vgetlane(v_out, 0); + float b = wrapper::vgetlane(v_out, 1); + return a + b; +} + +template <typename T1, typename T2> +inline void convolve_1x1(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration, + const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) +{ + const unsigned int conv_stride_x = std::get<0>(conv_info.stride()); + switch(conv_stride_x) + { + case 1: + convolver_1x1<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); + break; + case 2: + convolver_1x1<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); + break; + case 3: + convolver_1x1<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); + break; + default: + ARM_COMPUTE_ERROR("Not implemented"); + } +} + +template <> +inline void convolve_1x1<float, float>(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration, + const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) +{ + const unsigned int conv_stride_x = std::get<0>(conv_info.stride()); + if(run_optim_small_tensor(src)) + { + switch(conv_stride_x) + { + case 1: + convolver_w1x1_i8x8_f32<1>::convolve(window, src, weights, dst, conv_info); + break; + case 2: + convolver_w1x1_i8x8_f32<2>::convolve(window, src, weights, dst, conv_info); + break; + case 3: + convolver_w1x1_i8x8_f32<3>::convolve(window, src, weights, dst, conv_info); + break; + default: + ARM_COMPUTE_ERROR("Not implemented"); + } + } + else + { + switch(conv_stride_x) + { + case 1: + convolver_1x1<float, float, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); + break; + case 2: + convolver_1x1<float, float, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); + break; + case 3: + convolver_1x1<float, float, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); + break; + default: + ARM_COMPUTE_ERROR("Not implemented"); + } + } +} + +template <typename T1, typename T2> +inline void convolve_3x3(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration, + const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) +{ + const unsigned int conv_stride_x = std::get<0>(conv_info.stride()); + switch(conv_stride_x) + { + case 1: + convolver_3x3<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); + break; + case 2: + convolver_3x3<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); + break; + case 3: + convolver_3x3<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); + break; + default: + ARM_COMPUTE_ERROR("Not implemented"); + } +} + +template <typename T1, typename T2> +inline void convolve_5x5(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration, + const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) +{ + const unsigned int conv_stride_x = std::get<0>(conv_info.stride()); + switch(conv_stride_x) + { + case 1: + convolver_5x5<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); + break; + case 2: + convolver_5x5<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); + break; + case 3: + convolver_5x5<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info); + break; + default: + ARM_COMPUTE_ERROR("Not implemented"); + } +} + +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); + ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); + + const DataLayout data_layout = src->data_layout(); + const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported."); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(channel_idx) != src->dimension(channel_idx)); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != weights->dimension(height_idx)); + ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); + ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::NHWC && src->data_type() != DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(width_idx) > 3) && (src->data_type() == DataType::F16)); + + // Checks performed when output is configured + if(dst->total_size() != 0) + { + TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info); + + DataType data_type = src->data_type(); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), output_shape); + ARM_COMPUTE_RETURN_ERROR_ON(dst->data_type() != data_type); + } + + return Status{}; +} + +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info, unsigned int &num_weight_elems_read_per_row, + unsigned int &num_elems_read_per_iteration, unsigned int &num_elems_written_per_iteration, BorderSize &border_size) +{ + ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); + + const DataLayout data_layout = src->data_layout(); + const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + + // Calculate right and bottom border + unsigned int kernel_size = weights->dimension(width_idx); + const int conv_stride_x = std::get<0>(conv_info.stride()); + const int conv_stride_y = std::get<1>(conv_info.stride()); + const int input_width = src->dimension(width_idx); + + Window win{}; + bool window_changed = false; + + if(data_layout == DataLayout::NCHW) + { + switch(kernel_size) + { + case 1: + { + switch(src->data_type()) + { +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + num_elems_written_per_iteration = 8; + break; +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + case DataType::F32: + if(run_optim_small_tensor_info(src)) + { + num_elems_written_per_iteration = 8; + } + else + { + num_elems_written_per_iteration = 4; + } + break; + default: + ARM_COMPUTE_ERROR("Data type not supported."); + break; + } + num_weight_elems_read_per_row = kernel_size; + num_elems_read_per_iteration = conv_stride_x * num_elems_written_per_iteration; + break; + } + case 3: + switch(src->data_type()) + { + case DataType::F32: + num_weight_elems_read_per_row = 4 + kernel_size - 1; + num_elems_read_per_iteration = 12; + num_elems_written_per_iteration = 16 >> conv_stride_x; + break; +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + num_weight_elems_read_per_row = 8 + kernel_size - 1; + num_elems_read_per_iteration = 24; + num_elems_written_per_iteration = 32 >> conv_stride_x; + break; +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + default: + ARM_COMPUTE_ERROR("Data type not supported."); + break; + } + break; + case 5: + { + switch(src->data_type()) + { + case DataType::F32: + num_weight_elems_read_per_row = 4 + kernel_size - 1; + num_elems_read_per_iteration = 12; + num_elems_written_per_iteration = 16 >> conv_stride_x; + break; + default: + ARM_COMPUTE_ERROR("Data type not supported."); + break; + } + } + break; + default: + { + ARM_COMPUTE_ERROR("Not implemented"); + break; + } + } + + // Calculate right pad + int start_x = kernel_size / 2 - static_cast<int>(conv_info.pad_left()); + int end_x = ceil_to_multiple(static_cast<int>(dst->dimension(0)), num_elems_written_per_iteration) * conv_stride_x; + int upper_bound_w = ceil_to_multiple(start_x + end_x, num_elems_read_per_iteration) - input_width; + + // Calculate border + const unsigned int conv_pad_left = conv_info.pad_left(); + const unsigned int conv_pad_top = conv_info.pad_top(); + const unsigned int conv_pad_right = std::max(upper_bound_w, 0); + const unsigned int conv_pad_bottom = conv_info.pad_bottom(); + + border_size.left = conv_pad_left; + border_size.top = conv_pad_top; + border_size.right = conv_pad_right; + border_size.bottom = conv_pad_bottom; + + // Configure window + win = calculate_max_window(*dst, Steps(num_elems_written_per_iteration)); + + AccessWindowRectangle input_access(src, -conv_pad_left, -conv_pad_top, + num_elems_read_per_iteration, kernel_size, + conv_stride_x, conv_stride_y); + AccessWindowStatic weights_access(weights, 0, 0, num_weight_elems_read_per_row, kernel_size); + AccessWindowHorizontal output_access(dst, 0, num_elems_written_per_iteration); + window_changed = update_window_and_padding(win, input_access, weights_access, output_access); + output_access.set_valid_region(win, ValidRegion(Coordinates(), dst->tensor_shape())); + } + else + { + // Configure window NHWC without any padding + win = calculate_max_window(*dst, Steps()); + } + + Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, win); +} + +bool have_zero_x_internal_padding(ITensorInfo *src, const ITensorInfo *weights) +{ + return (src->padding().left == 0 && weights->padding().left == 0 && src->padding().right == 0 && weights->padding().right == 0); +} + +} // namespace + +template <typename T> +void CpuDirectConv2dKernel::convolve_nhwc_optimized(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst) +{ + // This function assumes that input and weights have not padding in channel + + // Declare useful types + using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>; + using vector_type = typename vtype::type; + using tag_type = typename vtype::tag_type; + + // Scalar quantities + const int element_size = src->info()->element_size(); + const int input_stride_w = src->info()->strides_in_bytes().y() / element_size; + const int input_stride_h = src->info()->strides_in_bytes().z() / element_size; + const int input_stride_n = src->info()->strides_in_bytes()[3] / element_size; + const int input_dim_w = src->info()->dimension(1); + const int input_dim_h = src->info()->dimension(2); + + const int output_stride_c = dst->info()->strides_in_bytes().x(); + + const unsigned int kernel_stride_w = weights->info()->strides_in_bytes().y() / element_size; + const unsigned int kernel_stride_h = weights->info()->strides_in_bytes().z() / element_size; + const int kernel_dim_w = weights->info()->dimension(1); + const int kernel_dim_h = weights->info()->dimension(2); + + const int conv_pad_top = _conv_info.pad_top(); + const int conv_pad_left = _conv_info.pad_left(); + const int conv_stride_w = std::get<0>(_conv_info.stride()); + const int conv_stride_h = std::get<1>(_conv_info.stride()); + + // Setup input window for the output iterator + Window window_out = window; + window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); + + // Setup input window for the weights iterator + Window window_w = calculate_max_window(*weights->info(), Steps()); + window_w.set(Window::DimX, Window::Dimension(0, 1, 1)); + window_w.set(Window::DimY, Window::Dimension(0, 1, 1)); + window_w.set(Window::DimZ, Window::Dimension(0, 1, 1)); + + Iterator out(dst, window_out); + Iterator wei(weights, window_w); + + constexpr int num_elems_read_per_iteration = 16 / sizeof(T); + /* + * This implementation parallelize the full WC plane of input and weights by + * treating them as series of elements. So for example, a 3x3 weights and + * floating point vector operations of 4 elements per time, the first 3 + * channel elements of the first row would be taken and additionally the first + * element of the second row. The 9 elements in each single WC weight plane + * would require 2 4-element vector operations and a last single element operation. + * + * This works since when we create the input vector to multiply with the weights, + * the exact required elements are loaded in the same order. Therefore the + * multiplication works on the correct input/weight elements. + */ + execute_window_loop(window_out, [&](const Coordinates & id) + { + /* + * In here we create theoretical indexes which then we validate for both + * inputs and weights. + * As a reminder, this loop take each output point in NHW, C is treated + * in the weights loop. + */ + // We are computing the theoretical starting input starting points + const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left; + const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top; + const int in_w_end_t = in_w_start_t + kernel_dim_w; + const int in_h_end_t = in_h_start_t + kernel_dim_h; + + // We are computing the valid initial and ending input points by checking the borders + const int in_w_start = std::max(in_w_start_t, 0); + const int in_h_start = std::max(in_h_start_t, 0); + const int in_w_end = std::min(in_w_end_t, input_dim_w); + const int in_h_end = std::min(in_h_end_t, input_dim_h); + + // We use the input points to select the valid weight points to use + const int index_wc_start = (in_w_start - in_w_start_t) * kernel_stride_w; + const int index_h_start = in_h_start - in_h_start_t; + const int index_wc_end = (kernel_dim_w - (in_w_end_t - in_w_end)) * kernel_stride_w; + const int index_h_end = kernel_dim_h - (in_h_end_t - in_h_end); + + execute_window_loop(window_w, [&](const Coordinates & id_w) + { + /* + * This is the loop in the weights, and it goes along N (the batches) + * As a reminder, the batches of the weights are translated into the + * channels of the output + */ + const T *in_ptr_row = reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) + + id[3] * input_stride_n + in_w_start * input_stride_w + in_h_start * input_stride_h; + const T *weights_ptr_row = reinterpret_cast<const T *>(wei.ptr()) + index_h_start * kernel_stride_h; + uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c; + + T out_temp = static_cast<T>(0); + for(int index_h = index_h_start; index_h < index_h_end; ++index_h, in_ptr_row += input_stride_h, weights_ptr_row += kernel_stride_h) + { + const T *in_ptr_mover = in_ptr_row; + int index_wc = index_wc_start; + vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type()); + for(; index_wc <= index_wc_end - num_elems_read_per_iteration; index_wc += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration) + { + const auto src_vec = wrapper::vloadq(in_ptr_mover); + const auto w_vec = wrapper::vloadq(weights_ptr_row + index_wc); + out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec); + } + out_temp += vreduce(out_temp_vec); + for(; index_wc < index_wc_end; ++index_wc, ++in_ptr_mover) + { + const auto src_val = *(in_ptr_mover); + const auto w_val = *(weights_ptr_row + index_wc); + out_temp += src_val * w_val; + } + } + *(reinterpret_cast<T *>(out_ptr)) = out_temp; + }, + wei); + }, + out); +} + +template <typename T> +void CpuDirectConv2dKernel::convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst) +{ + // Declare useful types + using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>; + using vector_type = typename vtype::type; + using tag_type = typename vtype::tag_type; + + // Scalar quantities + const int element_size = src->info()->element_size(); + const int input_stride_w = src->info()->strides_in_bytes().y() / element_size; + const int input_stride_h = src->info()->strides_in_bytes().z() / element_size; + const int input_stride_n = src->info()->strides_in_bytes()[3] / element_size; + const int input_dim_w = src->info()->dimension(1); + const int input_dim_h = src->info()->dimension(2); + + const int output_stride_c = dst->info()->strides_in_bytes().x(); + + const unsigned int kernel_stride_w = weights->info()->strides_in_bytes().y() / element_size; + const unsigned int kernel_stride_h = weights->info()->strides_in_bytes().z() / element_size; + const int kernel_dim_w = weights->info()->dimension(1); + const int kernel_dim_h = weights->info()->dimension(2); + + const int conv_pad_top = _conv_info.pad_top(); + const int conv_pad_left = _conv_info.pad_left(); + const int conv_stride_w = std::get<0>(_conv_info.stride()); + const int conv_stride_h = std::get<1>(_conv_info.stride()); + + // Setup input window for the output iterator + Window window_out = window; + window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); + + // Setup input window for the weights iterator + Window window_w = calculate_max_window(*weights->info(), Steps()); + window_w.set(Window::DimX, Window::Dimension(0, 1, 1)); + window_w.set(Window::DimY, Window::Dimension(0, 1, 1)); + window_w.set(Window::DimZ, Window::Dimension(0, 1, 1)); + + Iterator out(dst, window_out); + Iterator wei(weights, window_w); + + constexpr int num_elems_read_per_iteration = 16 / sizeof(T); + + execute_window_loop(window_out, [&](const Coordinates & id) + { + // We are computing the theoretical starting input starting points + const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left; + const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top; + const int in_w_end_t = in_w_start_t + kernel_dim_w; + const int in_h_end_t = in_h_start_t + kernel_dim_h; + + // We are computing the valid initial and ending input points by checking the borders + const int in_w_start = std::max(in_w_start_t, 0); + const int in_h_start = std::max(in_h_start_t, 0); + const int in_w_end = std::min(in_w_end_t, input_dim_w); + const int in_h_end = std::min(in_h_end_t, input_dim_h); + + // We use the input points to select the valid weight points to use + const int wei_w_start = in_w_start - in_w_start_t; + const int wei_h_start = in_h_start - in_h_start_t; + const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end); + const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end); + + const int index_c_end = weights->info()->dimension(0); + const T *const in_ptr_start = reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) + id[3] * input_stride_n; + + execute_window_loop(window_w, [&](const Coordinates & id_w) + { + const T *const weights_ptr_start = reinterpret_cast<const T *>(wei.ptr()); + uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c; + + T out_temp = static_cast<T>(0); + for(int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; ++index_wei_h, ++index_in_h) + { + const T *const in_ptr_row = in_ptr_start + index_in_h * input_stride_h; + const T *const weights_ptr_row = weights_ptr_start + index_wei_h * kernel_stride_h; + for(int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end; ++index_wei_w, ++index_in_w) + { + const T *in_ptr_mover = in_ptr_row + index_in_w * input_stride_w; + const T *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w; + int index_c = 0; + vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type()); + for(; index_c <= index_c_end - num_elems_read_per_iteration; index_c += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration, weights_ptr_mover += num_elems_read_per_iteration) + { + const auto src_vec = wrapper::vloadq(in_ptr_mover); + const auto w_vec = wrapper::vloadq(weights_ptr_mover); + out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec); + } + out_temp += vreduce(out_temp_vec); + for(; index_c < index_c_end; ++index_c, ++in_ptr_mover, ++weights_ptr_mover) + { + const auto src_val = *(in_ptr_mover); + const auto w_val = *(weights_ptr_mover); + out_temp += src_val * w_val; + } + } + } + *(reinterpret_cast<T *>(out_ptr)) = out_temp; + }, + wei); + }, + out); +} + +BorderSize CpuDirectConv2dKernel::border_size() const +{ + return _border_size; +} + +void CpuDirectConv2dKernel::configure(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); + + _conv_info = conv_info; + _data_layout = src->data_layout(); + _kernel_size = weights->dimension(get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH)); + + const unsigned int conv_pad_left = conv_info.pad_left(); + const unsigned int conv_pad_top = conv_info.pad_top(); + const unsigned int conv_pad_right = conv_info.pad_right(); + const unsigned int conv_pad_bottom = conv_info.pad_bottom(); + if(_data_layout == DataLayout::NCHW) + { + _border_size = BorderSize(conv_pad_top, conv_pad_right, conv_pad_bottom, conv_pad_left); + } + else + { + _border_size = BorderSize(0); + } + + // Get convolved dimensions + TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info); + + DataType data_type = src->data_type(); + + // Output auto inizialitation if not yet initialized + auto_init_if_empty(*dst, output_shape, 1, data_type); + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, dst, conv_info)); + + // Configure kernel window + auto win_config = validate_and_configure_window(src, weights, dst, conv_info, _num_weight_elems_read_per_row, + _num_elems_read_per_iteration, _num_elems_written_per_iteration, _border_size); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICpuKernel::configure(win_config.second); +} + +Status CpuDirectConv2dKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info) +{ + unsigned int num_weight_elems_read_per_row = 0; + unsigned int num_elems_read_per_iteration = 0; + unsigned int num_elems_written_per_iteration = 0; + BorderSize border_size = {}; + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, dst, conv_info)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), + weights->clone().get(), + dst->clone().get(), + conv_info, + num_weight_elems_read_per_row, + num_elems_read_per_iteration, + num_elems_written_per_iteration, + border_size) + .first); + + return Status{}; +} + +void CpuDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); + auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + const int kernel_size = weights->info()->dimension(get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH)); + + if(_data_layout == DataLayout::NCHW) + { + switch(kernel_size) + { + case 1: + { + switch(src->info()->data_type()) + { + case DataType::F32: + convolve_1x1<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info); + break; +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + convolve_1x1<float16_t, float16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info); + break; +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + default: + ARM_COMPUTE_ERROR("Data type not supported"); + break; + } + break; + } + case 3: + { + switch(src->info()->data_type()) + { + case DataType::F32: + convolve_3x3<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info); + break; +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + convolve_3x3<float16_t, float16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info); + break; +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + default: + ARM_COMPUTE_ERROR("Data type not supported"); + break; + } + break; + } + case 5: + { + switch(src->info()->data_type()) + { + case DataType::F32: + convolve_5x5<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info); + break; + default: + ARM_COMPUTE_ERROR("Data type not supported"); + break; + } + break; + } + default: + { + ARM_COMPUTE_ERROR("Only kernel sizes 1x1, 3x3 and 5x5 are supported."); + break; + } + } + } + else + { + switch(src->info()->data_type()) + { + case DataType::F32: + { + if(have_zero_x_internal_padding(src->info(), weights->info())) + { + convolve_nhwc_optimized<float>(window, src, weights, dst); + } + else + { + convolve_nhwc<float>(window, src, weights, dst); + } + break; + } + default: + ARM_COMPUTE_ERROR("Data type not supported"); + break; + } + } +} +const char *CpuDirectConv2dKernel::name() const +{ + return "CpuDirectConvolutionLayerKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuDirectConv2dKernel.h b/src/cpu/kernels/CpuDirectConv2dKernel.h new file mode 100644 index 0000000000..3ba7f7ed5f --- /dev/null +++ b/src/cpu/kernels/CpuDirectConv2dKernel.h @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_DIRECT_CONV2D_KERNEL_H +#define ARM_COMPUTE_CPU_DIRECT_CONV2D_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Interface for the kernel to perform Direct Convolution Layer. */ +class CpuDirectConv2dKernel : public ICpuKernel +{ +public: + CpuDirectConv2dKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDirectConv2dKernel); + /** Set the src, weights, and dst tensors. + * + * @note: DirectConvolution only works in the following configurations: + * 1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3 + * 3x3 convolution with stride_x = 1/2/3, stride_y = 1/2/3 + * + * @param[in] src The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: F16/F32. + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. + * The 3rd dimension must be the same as the input's volume 3rd dimension. + * Data type supported:Same as @p input. + * @param[out] dst Output tensor. + * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: F16/F32 + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + */ + void configure(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuDirectConv2dKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + BorderSize border_size() const override; + +private: + /* Template function for optimized convolution NHWC */ + template <typename T> + void convolve_nhwc_optimized(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst); + + /* Template function for convolution NHWC */ + template <typename T> + void convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst); + + PadStrideInfo _conv_info{}; + BorderSize _border_size{}; + unsigned int _kernel_size{ 0 }; + unsigned int _num_weight_elems_read_per_row{ 0 }; + unsigned int _num_elems_read_per_iteration{ 0 }; + unsigned int _num_elems_written_per_iteration{ 0 }; + DataLayout _data_layout{ DataLayout::UNKNOWN }; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /*ARM_COMPUTE_CPU_DIRECTCONV2D_KERNEL_H */ diff --git a/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp b/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp new file mode 100644 index 0000000000..93ad5e5eba --- /dev/null +++ b/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp @@ -0,0 +1,513 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/core/utils/misc/Traits.h" +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include <arm_neon.h> +#include <cstddef> +#include <cstdint> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, + const DirectConvolutionLayerOutputStageKernelInfo &info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::S32, DataType::F32); + + if(bias != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); + ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != src->dimension(get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL))); + ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); + } + + if(src->data_type() == DataType::S32) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst == nullptr, "In-place computation not allowed for quantized output"); + } + + // Checks performed when output is configured + if((dst != nullptr) && (dst->total_size() != 0)) + { + if(is_data_type_float(src->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + } + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); + } + else if(src->data_type() == DataType::S32) + { + // In case of quantized computation and unconfigured output, the output data type must be provided through DirectConvolutionLayerOutputStageKernelInfo + ARM_COMPUTE_RETURN_ERROR_ON((info.output_data_type != DataType::QASYMM8) && (info.output_data_type != DataType::QASYMM8_SIGNED)); + } + + return Status{}; +} + +template <typename T> +typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, void>::type +output_stage_nchw(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, + int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) +{ + const bool has_bias = bias != nullptr; + /** SIMD vector tag type. */ + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; + + ARM_COMPUTE_ERROR_ON(src->info()->data_layout() == DataLayout::UNKNOWN); + ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier); + ARM_COMPUTE_UNUSED(result_shift); + ARM_COMPUTE_UNUSED(result_offset_after_shift); + + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_step_x = 16 / src->info()->element_size(); + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator in(src, win); + Iterator out(dst, win); + execute_window_loop(win, [&](const Coordinates & id) + { + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Get bias and pointer to input + const auto in_ptr = reinterpret_cast<const T *>(in.ptr()) + x; + auto v_in = wrapper::vloadq(in_ptr); + + // Accumulate bias + if(has_bias) + { + const auto vb = wrapper::vdup_n(*reinterpret_cast<const T *>(bias->ptr_to_element(Coordinates(id.z()))), ExactTagType{}); + v_in = wrapper::vadd(v_in, vb); + } + + const auto out_ptr = reinterpret_cast<T *>(out.ptr()) + x; + wrapper::vstore(out_ptr, v_in); + } + + // Left-overs loop + for(; x < window_end_x; ++x) + { + // Get bias and pointer to input + auto s_in = *(reinterpret_cast<const T *>(in.ptr()) + x); + + // Accumulate bias + if(has_bias) + { + const auto b = *reinterpret_cast<const T *>(bias->ptr_to_element(Coordinates(id.z()))); + s_in += b; + } + + *(reinterpret_cast<T *>(out.ptr()) + x) = s_in; + } + + }, + in, out); +} + +template <typename T> +typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, void>::type +output_stage_nhwc(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, + int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) +{ + const bool has_bias = bias != nullptr; + ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier); + ARM_COMPUTE_UNUSED(result_shift); + ARM_COMPUTE_UNUSED(result_offset_after_shift); + + Window window_bias = window; + window_bias.set(Window::DimX, Window::Dimension(0, 1, 1)); + window_bias.set(Window::DimY, Window::Dimension(0, 0, 0)); + window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0)); + window_bias.set(3, Window::Dimension(0, 0, 0)); + + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_step_x = 16 / src->info()->element_size(); + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator in(src, win); + Iterator bi(bias, window_bias); + Iterator out(dst, win); + + execute_window_loop(win, [&](const Coordinates &) + { + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Get bias and pointer to input + const auto in_ptr = reinterpret_cast<const T *>(in.ptr()); + auto v_in = wrapper::vloadq(in_ptr + x); + + // Accumulate bias + if(has_bias) + { + const auto bias_ptr = reinterpret_cast<T *>(bi.ptr()) + x; + v_in = wrapper::vadd(v_in, wrapper::vloadq(bias_ptr)); + } + + const auto out_ptr = reinterpret_cast<T *>(out.ptr()); + wrapper::vstore(out_ptr + x, v_in); + } + + // Left-overs loop + for(; x < window_end_x; ++x) + { + // Get bias and pointer to input + auto s_in = *(reinterpret_cast<const T *>(in.ptr()) + x); + + // Accumulate bias + if(has_bias) + { + const auto bias_ptr = reinterpret_cast<T *>(bi.ptr()) + x; + s_in += *bias_ptr; + } + + const auto out_ptr = reinterpret_cast<T *>(out.ptr()); + *(out_ptr + x) = s_in; + } + }, + in, bi, out); +} + +// Quantized case +template < typename TOut, typename std::enable_if < std::is_same<TOut, uint8_t>::value || std::is_same<TOut, int8_t>::value, int >::type = 0 > +void output_stage_nchw(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, + int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) +{ + const bool has_bias = bias != nullptr; + using VectorType = typename wrapper::traits::neon_bitvector_t<TOut, wrapper::traits::BitWidth::W128>; + using TagType = typename wrapper::traits::neon_bitvector_tag_t<TOut, wrapper::traits::BitWidth::W128>; + + const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift); + + const VectorType min = wrapper::vdup_n(std::numeric_limits<TOut>::lowest(), TagType{}); + const VectorType max = wrapper::vdup_n(std::numeric_limits<TOut>::max(), TagType{}); + + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_step_x = 16 / src->info()->element_size(); + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator in(src, win); + Iterator out(dst, win); + + execute_window_loop(win, [&](const Coordinates & id) + { + + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Get bias and pointer to input + const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x; + int32x4x4_t v_in = + { + { + wrapper::vloadq(in_ptr), + wrapper::vloadq(in_ptr + 4), + wrapper::vloadq(in_ptr + 8), + wrapper::vloadq(in_ptr + 12) + } + }; + + // Accumulate bias + if(has_bias) + { + const auto vb = wrapper::vdup_n(*reinterpret_cast<const int32_t *>(bias->ptr_to_element(Coordinates(id.z()))), TagType{}); + v_in = + { + { + wrapper::vadd(v_in.val[0], vb), + wrapper::vadd(v_in.val[1], vb), + wrapper::vadd(v_in.val[2], vb), + wrapper::vadd(v_in.val[3], vb) + } + }; + } + + const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x; + wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, + min, max, false)); + } + + // Left-overs loop + for(; x < window_end_x; ++x) + { + // Get bias and pointer to input + int32_t s_in = *(reinterpret_cast<const int32_t *>(in.ptr()) + x); + + // Accumulate bias + if(has_bias) + { + const auto b = *reinterpret_cast<const int32_t *>(bias->ptr_to_element(Coordinates(id.z()))); + s_in += b; + } + + const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x; + *out_ptr = finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, + std::numeric_limits<TOut>::lowest(), std::numeric_limits<TOut>::max(), false); + } + }, + in, out); +} +template < typename TOut, typename std::enable_if < std::is_same<TOut, uint8_t>::value || std::is_same<TOut, int8_t>::value, int >::type = 0 > +void output_stage_nhwc(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, + int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) +{ + const bool has_bias = bias != nullptr; + using VectorType = typename wrapper::traits::neon_bitvector_t<TOut, wrapper::traits::BitWidth::W128>; + using TagType = typename wrapper::traits::neon_bitvector_tag_t<TOut, wrapper::traits::BitWidth::W128>; + + const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift); + + const VectorType min = wrapper::vdup_n(std::numeric_limits<TOut>::lowest(), TagType{}); + const VectorType max = wrapper::vdup_n(std::numeric_limits<TOut>::max(), TagType{}); + + Window window_bias = window; + window_bias.set(Window::DimX, Window::Dimension(0, 1, 1)); + window_bias.set(Window::DimY, Window::Dimension(0, 0, 0)); + window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0)); + window_bias.set(3, Window::Dimension(0, 0, 0)); + + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_step_x = 16 / src->info()->element_size(); + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator in(src, win); + Iterator bi(bias, window_bias); + Iterator out(dst, win); + + execute_window_loop(win, [&](const Coordinates &) + { + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Get bias and pointer to input + const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x; + int32x4x4_t v_in = + { + { + wrapper::vloadq(in_ptr), + wrapper::vloadq(in_ptr + 4), + wrapper::vloadq(in_ptr + 8), + wrapper::vloadq(in_ptr + 12), + } + }; + + // Accumulate bias + if(has_bias) + { + const auto bias_ptr = reinterpret_cast<int32_t *>(bi.ptr()) + x; + + wrapper::vadd(v_in.val[0], wrapper::vloadq(bias_ptr)); + wrapper::vadd(v_in.val[1], wrapper::vloadq(bias_ptr + 4)); + wrapper::vadd(v_in.val[2], wrapper::vloadq(bias_ptr + 8)); + wrapper::vadd(v_in.val[3], wrapper::vloadq(bias_ptr + 12)); + } + + const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x; + wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max, false)); + } + + // Left-overs loop + for(; x < window_end_x; ++x) + { + // Get bias and pointer to input + const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x; + int32_t s_in = *in_ptr; + + // Accumulate bias + if(has_bias) + { + const auto bias_ptr = reinterpret_cast<int32_t *>(bi.ptr()) + x; + s_in += *bias_ptr; + } + + const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x; + *out_ptr = finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, + std::numeric_limits<TOut>::lowest(), std::numeric_limits<TOut>::max(), false); + } + }, + in, bi, out); +} +} // namespace + +void CpuDirectConv2dOutputStageKernel::configure(ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, + const DirectConvolutionLayerOutputStageKernelInfo &info) +{ + ARM_COMPUTE_UNUSED(bias); + // Perform validation step + ARM_COMPUTE_ERROR_ON_NULLPTR(src); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, info)); + + _func = nullptr; + _result_fixedpoint_multiplier = info.result_fixedpoint_multiplier; + _result_shift = info.result_shift; + _result_offset_after_shift = info.result_offset_after_shift; + + // Auto-initialize output output if required + if(dst != nullptr) + { + // Work out expected output data type + const DataType output_dt = (src->data_type() == DataType::S32) ? info.output_data_type : DataType::S32; + // Output tensor auto initialization if not yet initialized + auto_init_if_empty(*dst, src->clone()->set_data_type(output_dt)); + } + + Window win = calculate_max_window(*src, Steps()); + + ICpuKernel::configure(win); + + const bool is_qasymm8_signed = (dst != nullptr) ? is_data_type_quantized_asymmetric_signed(dst->data_type()) : false; + + // Set appropriate function + if(src->data_layout() == DataLayout::NCHW) + { + switch(src->data_type()) + { + case DataType::S32: + { + if(is_qasymm8_signed) + { + _func = &output_stage_nchw<int8_t>; + } + else + { + _func = &output_stage_nchw<uint8_t>; + } + break; + } +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + { + _func = &output_stage_nchw<float16_t>; + break; + } +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + case DataType::F32: + { + _func = &output_stage_nchw<float>; + break; + } + default: + { + ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs."); + } + } + } + else + { + switch(src->data_type()) + { + case DataType::S32: + { + if(is_qasymm8_signed) + { + _func = &output_stage_nhwc<int8_t>; + } + else + { + _func = &output_stage_nhwc<uint8_t>; + } + break; + } +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + { + _func = &output_stage_nhwc<float16_t>; + break; + } +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + case DataType::F32: + { + _func = &output_stage_nhwc<float>; + break; + } + default: + { + ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs."); + } + } + } +} + +Status CpuDirectConv2dOutputStageKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, + const DirectConvolutionLayerOutputStageKernelInfo &info) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, info)); + return Status{}; +} + +void CpuDirectConv2dOutputStageKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + + auto src = tensors.get_tensor(TensorType::ACL_SRC_0); + auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + (*_func)(src, bias, window, dst, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift); +} + +const char *CpuDirectConv2dOutputStageKernel::name() const +{ + return "CpuDirectConv2dOutputStageKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h b/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h new file mode 100644 index 0000000000..a68936bbae --- /dev/null +++ b/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_DIRECT_CONV2D_OUTPUT_STAGE_KERNEL_H +#define ARM_COMPUTE_CPU_DIRECT_CONV2D_OUTPUT_STAGE_KERNEL_H + +#include "arm_compute/core/KernelDescriptors.h" +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Kernel to accumulate the biases, if provided, or downscale in case of quantized input. + * + * @note We assume bias to be shared + * @note For quantized computations (i.e. @p src of S32 type) the output data type for auto-initialization must be passed as part + * of the @ref DirectConvolutionLayerOutputStageKernelInfo. + */ +class CpuDirectConv2dOutputStageKernel : public ICpuKernel +{ +public: + CpuDirectConv2dOutputStageKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDirectConv2dOutputStageKernel); + /** Set the accumulate buffer and the biases of the kernel. + * + * @param[in, out] src Input to add the bias to. If @p dst is not specified then accumulation is done in-place. + * Data type supported: F16/F32/S32 + * @param[in] bias (Optional) The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p src + * @param[out] dst (Optional) If the dst tensor is specified the accumulation is done out-of-place. (Defaults to nullptr) + * Note that in-place computation is only supported for F16/F32. For S32 this must not be nullptr. + * Data type supported: F16/F32 or QASYMM8/QASYMM8_SIGNED if @p src is S32 + * @param[in] info (Optional) DirectConvolutionLayerOutputStageKernel descriptor metadata + */ + void configure(ITensorInfo *src, const ITensorInfo *bias = nullptr, ITensorInfo *dst = nullptr, + const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo()); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuDirectConv2dOutputStageKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *bias = nullptr, const ITensorInfo *dst = nullptr, + const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo()); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + using OutputStageKernel = void(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, + int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift); + + OutputStageKernel *_func{ nullptr }; + int _result_fixedpoint_multiplier{ 0 }; + int _result_shift{ 0 }; + int _result_offset_after_shift{ 0 }; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_DIRECT_CONV2D_OUTPUT_STAGE_KERNEL_H */ diff --git a/src/cpu/kernels/CpuElementwiseKernel.cpp b/src/cpu/kernels/CpuElementwiseKernel.cpp new file mode 100644 index 0000000000..91de24b850 --- /dev/null +++ b/src/cpu/kernels/CpuElementwiseKernel.cpp @@ -0,0 +1,454 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuElementwiseKernel.h" + +#include "arm_compute/core/Helpers.h" +#include "src/core/CPP/Validate.h" +#include "src/core/common/Registrars.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/cpu/kernels/elementwise/neon/elementwise_list.h" +#include "src/cpu/kernels/elementwise/neon/elementwise_quantized_list.h" +#include "src/cpu/kernels/elementwise/sve/elementwise_list.h" +#include "src/cpu/kernels/elementwise/sve/elementwise_quantized_list.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +struct ElementwiseSelectorData +{ + DataType dt; + const CPUInfo &ci; +}; + +using ElementwiseSelector = std::add_pointer<bool(const ElementwiseSelectorData &)>::type; +using UKernelType = CpuElementwiseKernel::ElementwiseFunction; +struct ElementwiseKernel +{ + const char *name; + const ElementwiseSelector is_selected; + UKernelType *ukernel; +}; + +template <ArithmeticOperation op> +CpuElementwiseKernel::UKernelInfo configure_arithm_func(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) +{ + ARM_COMPUTE_UNUSED(src1, dst); + static ElementwiseKernel kernels[] = + { +#if defined(ARM_COMPUTE_ENABLE_SVE) + { + "sve_fp32_elementwise", + [](const ElementwiseSelectorData & data) { return data.dt == DataType::F32 && data.ci.has_sve(); }, + REGISTER_FP32_SVE((arm_compute::cpu::elementwise_arithmetic_op<op, float32_t>)) + }, + { + "sve_s32_elementwise", + [](const ElementwiseSelectorData & data) { return data.dt == DataType::S32 && data.ci.has_sve(); }, + REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_arithmetic_op<op, int32_t>)) + }, + { + "sve_s16_elementwise", + [](const ElementwiseSelectorData & data) { return data.dt == DataType::S16 && data.ci.has_sve(); }, + REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_arithmetic_op<op, int16_t>)) + }, +#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ +#if defined(ARM_COMPUTE_ENABLE_NEON) + { + "neon_fp32_elementwise", + [](const ElementwiseSelectorData & data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON((arm_compute::cpu::elementwise_arithm_op<op, typename wrapper::traits::neon_vector<float, 4>>)) + }, + { + "neon_s32_elementwise", + [](const ElementwiseSelectorData & data) { return data.dt == DataType::S32; }, + REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_arithm_op<op, typename wrapper::traits::neon_vector<int32_t, 4>>)) + }, +#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */ +#if defined(ARM_COMPUTE_ENABLE_SVE2) + { + "sve2_qu8_elementwise", + [](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8 && data.ci.has_sve2(); }, + REGISTER_QASYMM8_SVE((arm_compute::cpu::elementwise_arithmetic_quantized_op<op, uint8_t>)) + }, + { + "sve2_qs8_elementwise", + [](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED && data.ci.has_sve2(); }, + REGISTER_QASYMM8_SIGNED_SVE((arm_compute::cpu::elementwise_arithmetic_quantized_op<op, int8_t>)) + }, +#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ +#if defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE) + { + "neon_qu8_elementwise", + [](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8; }, + REGISTER_QASYMM8_NEON((arm_compute::cpu::elementwise_arithm_op_quantized<op>)) + }, + { + "neon_qs8_elementwise", + [](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; }, + REGISTER_QASYMM8_SIGNED_NEON((arm_compute::cpu::elementwise_arithm_op_quantized_signed<op>)) + }, +#endif /* defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE) */ +#if defined(ARM_COMPUTE_ENABLE_SVE) + { + "sve_fp16_elementwise", + [](const ElementwiseSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_sve(); }, + REGISTER_FP16_SVE((arm_compute::cpu::elementwise_arithmetic_op<op, float16_t>)) + }, +#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ +#if defined(ARM_COMPUTE_ENABLE_NEON) +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + { + "neon_fp16_elementwise", + [](const ElementwiseSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_fp16(); }, + REGISTER_FP16_NEON((arm_compute::cpu::elementwise_arithm_op<op, typename wrapper::traits::neon_vector<float16_t, 8>>)) + }, +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ + { + "neon_s16_elementwise", + [](const ElementwiseSelectorData & data) { return data.dt == DataType::S16; }, + REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_arithm_op<op, typename wrapper::traits::neon_vector<int16_t, 8>>)) + }, +#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */ + }; + + for(const auto &uk : kernels) + { + if(uk.is_selected({ src0->data_type(), CPUInfo::get() })) + { + return { uk.name, uk.ukernel }; + } + } + + return { "", nullptr }; +} + +template <ComparisonOperation op> +CpuElementwiseKernel::UKernelInfo configure_comp_func(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) +{ + ARM_COMPUTE_UNUSED(src1, dst); + static ElementwiseKernel kernels[] = + { +#if defined(ARM_COMPUTE_ENABLE_SVE) + { + "sve_u8_comparison", + [](const ElementwiseSelectorData & data) { return data.dt == DataType::U8 && data.ci.has_sve(); }, + REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_comparison_op<op, uint8_t>)) + }, + { + "sve_fp32_comparison", + [](const ElementwiseSelectorData & data) { return data.dt == DataType::F32 && data.ci.has_sve(); }, + REGISTER_FP32_SVE((arm_compute::cpu::elementwise_comparison_op<op, float>)) + }, + { + "sve_s16_comparison", + [](const ElementwiseSelectorData & data) { return data.dt == DataType::S16 && data.ci.has_sve(); }, + REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_comparison_op<op, int16_t>)) + }, + { + "sve_s32_comparison", + [](const ElementwiseSelectorData & data) { return data.dt == DataType::S32 && data.ci.has_sve(); }, + REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_comparison_op<op, int32_t>)) + }, +#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ +#if defined(ARM_COMPUTE_ENABLE_NEON) + { + "neon_u8_comparison", + [](const ElementwiseSelectorData & data) { return data.dt == DataType::U8; }, + REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_comp_op_8<op, uint8_t, uint8x16_t>)) + }, + { + "neon_fp32_comparison", + [](const ElementwiseSelectorData & data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON((arm_compute::cpu::elementwise_comp_op_32<op, float, float32x4_t>)) + }, + { + "neon_s16_comparison", + [](const ElementwiseSelectorData & data) { return data.dt == DataType::S16; }, + REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_comp_op_16<op, int16_t, int16x8_t>)) + }, + { + "neon_s32_comparison", + [](const ElementwiseSelectorData & data) { return data.dt == DataType::S32; }, + REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_comp_op_32<op, int32_t, int32x4_t>)) + }, +#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */ +#if defined(ARM_COMPUTE_ENABLE_SVE2) + { + "sve2_qu8_comparison", + [](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8 && data.ci.has_sve2(); }, + REGISTER_QASYMM8_SVE((arm_compute::cpu::elementwise_comparison_quantized_op<op, uint8_t>)) + }, + { + "sve2_qs8_comparison", + [](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED && data.ci.has_sve2(); }, + REGISTER_QASYMM8_SIGNED_SVE((arm_compute::cpu::elementwise_comparison_quantized_op<op, int8_t>)) + }, +#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ +#if defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE) + { + "neon_qu8_comparison", + [](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8; }, + REGISTER_QASYMM8_NEON((arm_compute::cpu::elementwise_comp_op_quantized<op>)) + }, + { + "neon_qs8_comparison", + [](const ElementwiseSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; }, + REGISTER_QASYMM8_SIGNED_NEON((arm_compute::cpu::elementwise_comp_op_quantized_signed<op>)) + }, +#endif /* defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE) */ +#if defined(ARM_COMPUTE_ENABLE_SVE) + { + "sve_fp16_comparison", + [](const ElementwiseSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_sve(); }, + REGISTER_FP16_SVE((arm_compute::cpu::elementwise_comparison_op<op, float16_t>)) + }, +#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ +#if defined(ARM_COMPUTE_ENABLE_NEON) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + { + "neon_fp16_comparison", + [](const ElementwiseSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_fp16(); }, + REGISTER_FP16_NEON((arm_compute::cpu::elementwise_comp_op_16<op, float16_t, float16x8_t>)) + }, +#endif /* defined(ARM_COMPUTE_ENABLE_NEON) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ + }; + + for(const auto &uk : kernels) + { + if(uk.is_selected({ src0->data_type(), CPUInfo::get() })) + { + return { uk.name, uk.ukernel }; + } + } + + return { "", nullptr }; +} +} // namespace + +Status CpuElementwiseKernel::validate_arguments_common(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src0); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &src1); + + const TensorShape out_shape = TensorShape::broadcast_shape(src0.tensor_shape(), src1.tensor_shape()); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); + + // Validate in case of configured dst + if(dst.total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0), + "Wrong shape for output"); + } + + return Status{}; +} + +void CpuElementwiseKernel::configure_common(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); + + const auto uk = get_implementation(src0, src1, dst); + + _run_method = uk.ukernel; + _name = std::string("CpuElementwiseKernel").append("/").append(uk.name); + + // If any of shapes is dynamic, expect a configured window and dst at run-time. + if(src0->is_dynamic() || src1->is_dynamic()) + { + return; + } + + auto shape_and_window = compute_output_shape_and_window(src0->tensor_shape(), src1->tensor_shape()); + auto_init_if_empty(*dst, shape_and_window.first, 1, src0->data_type()); + ICpuKernel::configure(shape_and_window.second); +} + +void CpuElementwiseKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON(_run_method == nullptr); + + auto src0 = tensors.get_const_tensor(TensorType::ACL_SRC_0); + auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + _run_method(src0, src1, dst, window); +} + +const char *CpuElementwiseKernel::name() const +{ + return _name.c_str(); +} + +/** Arithmetic operators (min, max, squared_diff) */ +void CpuArithmeticKernel::configure(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst)); + _op = op; + configure_common(src0, src1, dst); +} + +Status CpuArithmeticKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::F16, DataType::S32, DataType::F32); + // Validate in case of configured dst + if(dst.total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &dst); + } + return validate_arguments_common(src0, src1, dst); +} + +Status CpuArithmeticKernel::validate(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) +{ + ARM_COMPUTE_UNUSED(op); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst)); + return Status{}; +} + +CpuElementwiseKernel::UKernelInfo CpuArithmeticKernel::get_implementation(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) +{ + switch(_op) + { + case ArithmeticOperation::MAX: + return configure_arithm_func<ArithmeticOperation::MAX>(src0, src1, dst); + case ArithmeticOperation::MIN: + return configure_arithm_func<ArithmeticOperation::MIN>(src0, src1, dst); + case ArithmeticOperation::SQUARED_DIFF: + return configure_arithm_func<ArithmeticOperation::SQUARED_DIFF>(src0, src1, dst); + case ArithmeticOperation::PRELU: + return configure_arithm_func<ArithmeticOperation::PRELU>(src0, src1, dst); + case ArithmeticOperation::DIV: + return configure_arithm_func<ArithmeticOperation::DIV>(src0, src1, dst); + case ArithmeticOperation::POWER: + return configure_arithm_func<ArithmeticOperation::POWER>(src0, src1, dst); + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } + return { "", nullptr }; +} + +/** The division operator */ + +void CpuDivisionKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst)); + _op = ArithmeticOperation::DIV; + configure_common(src0, src1, dst); +} + +Status CpuDivisionKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::S32, DataType::F16, DataType::F32); + return CpuArithmeticKernel::validate_arguments(src0, src1, dst); +} + +Status CpuDivisionKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst)); + return Status{}; +} + +/** The power operator */ +void CpuPowerKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst)); + _op = ArithmeticOperation::POWER; + configure_common(src0, src1, dst); +} + +Status CpuPowerKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::F16, DataType::F32); + return CpuArithmeticKernel::validate_arguments(src0, src1, dst); +} + +Status CpuPowerKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst)); + return Status{}; +} + +/** Comparison operators (equal, not equal, less than, greater than, less than or equal, greater than or equal) */ +void CpuComparisonKernel::configure(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst)); + _op = op; + configure_common(src0, src1, dst); +} + +Status CpuComparisonKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::F16, DataType::S32, DataType::F32); + // Validate in case of configured dst + if(dst.total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&dst, 1, DataType::U8); + } + return validate_arguments_common(src0, src1, dst); +} + +Status CpuComparisonKernel::validate(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) +{ + ARM_COMPUTE_UNUSED(op); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst)); + return Status{}; +} + +CpuElementwiseKernel::UKernelInfo CpuComparisonKernel::get_implementation(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) +{ + switch(_op) + { + case ComparisonOperation::Equal: + return configure_comp_func<ComparisonOperation::Equal>(src0, src1, dst); + case ComparisonOperation::NotEqual: + return configure_comp_func<ComparisonOperation::NotEqual>(src0, src1, dst); + case ComparisonOperation::Greater: + return configure_comp_func<ComparisonOperation::Greater>(src0, src1, dst); + case ComparisonOperation::GreaterEqual: + return configure_comp_func<ComparisonOperation::GreaterEqual>(src0, src1, dst); + case ComparisonOperation::Less: + return configure_comp_func<ComparisonOperation::Less>(src0, src1, dst); + case ComparisonOperation::LessEqual: + return configure_comp_func<ComparisonOperation::LessEqual>(src0, src1, dst); + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } + return { "", nullptr }; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuElementwiseKernel.h b/src/cpu/kernels/CpuElementwiseKernel.h new file mode 100644 index 0000000000..f323fe4470 --- /dev/null +++ b/src/cpu/kernels/CpuElementwiseKernel.h @@ -0,0 +1,222 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_ELEMENTWISE_KERNEL_H +#define ARM_COMPUTE_CPU_ELEMENTWISE_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Interface for an element-wise operation kernel + * + * Element-wise operation is computed by: + * @f[ dst(x,y) = OP(src0(x,y), src1(x,y))@f] + * + */ +class CpuElementwiseKernel : public ICpuKernel +{ +public: + CpuElementwiseKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuElementwiseKernel); + + using ElementwiseFunction = void(const ITensor *, const ITensor *, ITensor *, const Window &); + struct UKernelInfo + { + std::string name; + std::function<ElementwiseFunction> ukernel; + }; + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +protected: + /** Validate the argument passed to the kernel + * + * @param[in] src0 First tensor input. Data types supported: QASYMM8/S16/F16/S32/F32. + * @param[in] src1 Second tensor input. Data types supported: Same as @p src0. + * @param[in] dst Output tensor. Data types supported: Dependent on subclass. + */ + static Status validate_arguments_common(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst); + + /** Commmon configure function for element-wise operators with no additional options (e.g. Min, Max, SquaredDiff) + * + */ + void configure_common(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst); + + /** Function to get the micro kernel implementation + * + * @param[in] src0 First input tensor information + * @param[in] src1 Second input tensor information + * @param[in] dst Output tensor information + * + * @return the function instance for the micro kernel + */ + virtual UKernelInfo get_implementation(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) = 0; + +protected: + std::function<ElementwiseFunction> _run_method{ nullptr }; + std::string _name{}; +}; + +class CpuArithmeticKernel : public CpuElementwiseKernel +{ +public: + CpuArithmeticKernel() = default; + + /** Configure kernel + * + * @param[in] op Arithmetic operation to be executed. + * @param[in] src0 First tensor input info. Data types supported: QASYMM8/S16/F16/S32/F32. + * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0. + * @param[out] dst Output tensor info. Data types supported: Same as @p src0. + */ + void configure(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst); + + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuArithmeticKernel::configure() + * + * @return a status + */ + static Status validate(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); + +protected: + // Inherited methods overridden: + static Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst); + + ArithmeticOperation _op{}; + +private: + /** Function to get the micro kernel implementation + * + * @param[in] src0 First input tensor information + * @param[in] src1 Second input tensor information + * @param[in] dst Output tensor information + * + * @return the function instance for the micro kernel + */ + UKernelInfo get_implementation(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) override; +}; + +class CpuDivisionKernel : public CpuArithmeticKernel +{ +public: + CpuDivisionKernel() = default; + + /** Configure kernel + * + * @param[in] src0 First tensor input info. Data types supported: S32/F16/F32. + * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0. + * @param[out] dst Output tensor info. Data types supported: Same as @p src0. + */ + void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst); + + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuDivisionKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); + +protected: + // Inherited methods overridden: + static Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst); +}; + +class CpuPowerKernel : public CpuArithmeticKernel +{ +public: + CpuPowerKernel() = default; + + /** Configure kernel + * + * @param[in] src0 First tensor input info. Data types supported: F16/F32. + * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0. + * @param[out] dst Output tensor info. Data types supported: Same as @p src0. + */ + void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst); + + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuPowerKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); + +protected: + // Inherited methods overridden: + static Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst); +}; + +class CpuComparisonKernel : public CpuElementwiseKernel +{ +public: + CpuComparisonKernel() = default; + + /** Configure kernel + * + * @param[in] op Comparison operation to be executed. + * @param[in] src0 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. + * @param[in] src1 Second tensor input info. Data types supported: Same as @p src0. + * @param[out] dst Output tensor info. Data types supported: U8. + */ + void configure(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst); + + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuComparisonKernel::configure() + * + * @return a status + */ + static Status validate(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); + +protected: + // Inherited methods overridden: + static Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst); + +private: + /** Function to get the micro kernel implementation + * + * @param[in] src0 First input tensor information + * @param[in] src1 Second input tensor information + * @param[in] dst Output tensor information + * + * @return the function instance for the micro kernel + */ + UKernelInfo get_implementation(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) override; + + ComparisonOperation _op{}; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_KERNEL_H */
\ No newline at end of file diff --git a/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp b/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp new file mode 100644 index 0000000000..c587e18850 --- /dev/null +++ b/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuElementwiseUnaryKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Validate.h" +#include "src/core/CPP/Validate.h" +#include "src/core/common/Registrars.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/cpu/kernels/elementwise/neon/elementwise_unary_list.h" +#include "src/cpu/kernels/elementwise/sve/elementwise_unary_list.h" +#include "support/ToolchainSupport.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +struct ElementwiseUnarySelectorData +{ + DataType dt; + const CPUInfo &ci; +}; +using ElementwiseUnarySelector = std::add_pointer<bool(const ElementwiseUnarySelectorData &)>::type; + +struct ElementwiseUnaryKernel +{ + const char *name; + const ElementwiseUnarySelector is_selected; + CpuElementwiseUnaryKernel::ElementwiseUnaryUkernelPtr ukernel; +}; + +static const ElementwiseUnaryKernel available_kernels[] = +{ +#if defined(ARM_COMPUTE_ENABLE_SVE) + { + "sve_fp32_elementwise_unary", + [](const ElementwiseUnarySelectorData & data) { return data.dt == DataType::F32; }, + REGISTER_FP32_SVE(arm_compute::cpu::elementwise_sve_op<float>), + }, + { + "sve_fp16_elementwise_unary", + [](const ElementwiseUnarySelectorData & data) { return data.dt == DataType::F16; }, + REGISTER_FP16_SVE(arm_compute::cpu::elementwise_sve_op<__fp16>), + }, + { + "sve_s32_elementwise_unary", + [](const ElementwiseUnarySelectorData & data) { return data.dt == DataType::S32; }, + REGISTER_INTEGER_SVE(arm_compute::cpu::elementwise_sve_op<int32_t>), + }, +#endif // defined(ARM_COMPUTE_ENABLE_SVE) +#if defined(ARM_COMPUTE_ENABLE_NEON) + { + "neon_fp32_elementwise_unary", + [](const ElementwiseUnarySelectorData & data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::elementwise_op<float>), + }, +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + { + "neon_fp16_elementwise_unary", + [](const ElementwiseUnarySelectorData & data) { return data.dt == DataType::F16; }, + REGISTER_FP32_NEON(arm_compute::cpu::elementwise_op<__fp16>), + }, +#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + { + "neon_s32_elementwise_unary", + [](const ElementwiseUnarySelectorData & data) { return data.dt == DataType::S32; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::elementwise_op<int32_t>), + }, +#endif // defined(ARM_COMPUTE_ENABLE_NEON) +}; + +const ElementwiseUnaryKernel *get_implementation(DataType dt) +{ + for(const auto &uk : available_kernels) + { + if(uk.is_selected({ dt, CPUInfo::get() })) + { + return &uk; + } + } + return nullptr; +} +} // namespace + +void CpuElementwiseUnaryKernel::configure(ElementWiseUnary op, const ITensorInfo &src, ITensorInfo &dst) +{ + ARM_COMPUTE_ERROR_THROW_ON(validate(op, src, dst)); + const auto uk = get_implementation(src.data_type()); + ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); + + _op = op; + _run_method = uk->ukernel; + _name = std::string("CpuElementwiseUnaryKernel").append("/").append(uk->name); + + // If input shape is dynamic, expect a configured window and dst at run-time. + if(src.is_dynamic()) + { + return; + } + + auto shape_and_window = compute_output_shape_and_window(src.tensor_shape()); + auto_init_if_empty(dst, shape_and_window.first, 1, src.data_type()); + ICpuKernel::configure(shape_and_window.second); +} + +Status CpuElementwiseUnaryKernel::validate(ElementWiseUnary op, const ITensorInfo &src, const ITensorInfo &dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src); + + const auto *uk = get_implementation(src.data_type()); + ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); + + switch(op) + { + case ElementWiseUnary::EXP: + case ElementWiseUnary::RSQRT: + case ElementWiseUnary::LOG: + case ElementWiseUnary::ROUND: + case ElementWiseUnary::SIN: + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32); + break; + case ElementWiseUnary::NEG: + case ElementWiseUnary::ABS: + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::S32); + break; + default: + ARM_COMPUTE_ERROR("ElementWiseUnary operation not supported"); + } + // Validate in case of configured dst + if(dst.total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst); + } + + return Status{}; +} + +void CpuElementwiseUnaryKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + + auto src = tensors.get_const_tensor(TensorType::ACL_SRC); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + _run_method(src, dst, window, _op); +} + +const char *CpuElementwiseUnaryKernel::name() const +{ + return _name.c_str(); +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuElementwiseUnaryKernel.h b/src/cpu/kernels/CpuElementwiseUnaryKernel.h new file mode 100644 index 0000000000..f72eddf737 --- /dev/null +++ b/src/cpu/kernels/CpuElementwiseUnaryKernel.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_KERNEL_H +#define ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_KERNEL_H + +#include "arm_compute/core/Types.h" +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Interface for an element-wise unary operation kernel + * + * Element-wise operation is computed by: + * @f[ dst(x) = OP(src(x))@f] + */ +class CpuElementwiseUnaryKernel : public ICpuKernel +{ +public: + CpuElementwiseUnaryKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuElementwiseUnaryKernel); + + /** Function to configure the @ref CpuElementwiseUnaryKernel + * + * @param[in] op Arithmetic operation to be executed. + * @param[in] src First tensor input. Data types supported: F16/F32, F16/F32/S32 for NEG/ABS operations. + * @param[out] dst Output tensor. Data types supported: Same as @p src. + */ + void configure(ElementWiseUnary op, const ITensorInfo &src, ITensorInfo &dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuElementwiseUnaryKernel::configure() + * + * @return a status + */ + static Status validate(ElementWiseUnary op, const ITensorInfo &src, const ITensorInfo &dst); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + + /** Common signature for all the specialised elementwise unary micro-kernels + * + * @param[in] window Region on which to execute the kernel. + */ + using ElementwiseUnaryUkernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const Window &, ElementWiseUnary)>::type; + +private: + ElementWiseUnary _op{}; + ElementwiseUnaryUkernelPtr _run_method{ nullptr }; + std::string _name{}; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_KERNEL_H */ diff --git a/src/cpu/kernels/CpuFillKernel.cpp b/src/cpu/kernels/CpuFillKernel.cpp new file mode 100644 index 0000000000..f69de0082d --- /dev/null +++ b/src/cpu/kernels/CpuFillKernel.cpp @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuFillKernel.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +void CpuFillKernel::configure(const ITensorInfo *tensor, const PixelValue &constant_value) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(tensor); + _constant_value = constant_value; + + // Configure kernel window + Window win = calculate_max_window(*tensor, Steps()); + ICpuKernel::configure(win); +} + +void CpuFillKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + auto inout = tensors.get_tensor(TensorType::ACL_SRC_DST); + + // Collapse all the batches on the third dimension + bool has_collapsed = true; + Window collapsed = window.collapse_if_possible(window, Window::DimZ, &has_collapsed); + ARM_COMPUTE_ERROR_ON(!has_collapsed); + + uint8_t *const start_valid_region = inout->ptr_to_element(inout->info()->valid_region().anchor); + const auto window_width = static_cast<int>(collapsed.x().end()) - static_cast<int>(collapsed.x().start()); + const size_t element_size = inout->info()->element_size(); + + // Unroll X dimension + collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator tensor_it(inout, collapsed); + execute_window_loop(collapsed, [&](const Coordinates &) + { + uint8_t *base_addr = start_valid_region + tensor_it.offset(); + // Set memory + for(int i = 0; i < window_width; ++i) + { + std::memcpy(base_addr + i * element_size, &_constant_value.value, element_size); + } + + }, + tensor_it); +} + +const char *CpuFillKernel::name() const +{ + return "CpuFillKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuFillKernel.h b/src/cpu/kernels/CpuFillKernel.h new file mode 100644 index 0000000000..3bc6a40309 --- /dev/null +++ b/src/cpu/kernels/CpuFillKernel.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_FILL_KERNEL_H +#define ARM_COMPUTE_CPU_FILL_KERNEL_H + +#include "arm_compute/core/PixelValue.h" +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Kernel for filling a tensor with a given constant value */ +class CpuFillKernel : public ICpuKernel +{ +public: + CpuFillKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuFillKernel); + /** Configure kernel for a given list of arguments + * + * @param[in,out] tensor Tensor to fill. Supported data types: All + * @param[in] constant_value The value used to fill the planes of the tensor + */ + void configure(const ITensorInfo *tensor, const PixelValue &constant_value); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + PixelValue _constant_value{}; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_FILL_KERNEL_H */ diff --git a/src/cpu/kernels/CpuFloorKernel.cpp b/src/cpu/kernels/CpuFloorKernel.cpp new file mode 100644 index 0000000000..bcac1a41fc --- /dev/null +++ b/src/cpu/kernels/CpuFloorKernel.cpp @@ -0,0 +1,177 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuFloorKernel.h" + +#include "arm_compute/core/Coordinates.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Validate.h" +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include "src/core/common/Registrars.h" +#include "src/cpu/kernels/floor/list.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +struct FloorSelectorData +{ + DataType dt; +}; + +using FloorSelectorPtr = std::add_pointer<bool(const FloorSelectorData &data)>::type; +using FloorUKernelPtr = std::add_pointer<void(const void *, void *, int)>::type; + +struct FloorUKernel +{ + const char *name; + const FloorSelectorPtr is_selected; + FloorUKernelPtr ukernel; +}; + +static const FloorUKernel available_kernels[] = +{ + { + "neon_fp16_floor", + [](const FloorSelectorData & data) { return data.dt == DataType::F16; }, + REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_floor) + }, + { + "neon_fp32_floor", + [](const FloorSelectorData & data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_floor) + }, +}; + +/** Micro-kernel selector + * + * @param[in] data Selection data passed to help pick the appropriate micro-kernel + * + * @return A matching micro-kernel else nullptr + */ +const FloorUKernel *get_implementation(const FloorSelectorData &data) +{ + for(const auto &uk : available_kernels) + { + if(uk.is_selected(data)) + { + return &uk; + } + } + return nullptr; +} + +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + + const auto *uk = get_implementation(FloorSelectorData{ src->data_type() }); + ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); + + // Validate in case of configured output + if(dst->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); + } + + return Status{}; +} +} // namespace + +void CpuFloorKernel::configure(const ITensorInfo *src, ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); + + auto_init_if_empty(*dst, src->tensor_shape(), 1, src->data_type()); + + const auto *uk = get_implementation(FloorSelectorData{ src->data_type() }); + ARM_COMPUTE_ERROR_ON_NULLPTR(uk); + + _run_method = uk->ukernel; + _name = std::string("CpuFloorKernel").append("/").append(uk->name); + + // Configure kernel window + const Window win = calculate_max_window(*src, Steps()); + + ICPPKernel::configure(win); +} + +Window CpuFloorKernel::infer_window(const ITensorInfo *src, const ITensorInfo *dst) +{ + ARM_COMPUTE_UNUSED(dst); + ARM_COMPUTE_ERROR_ON(!bool(validate_arguments(src, dst))); + + Window win; + win.use_tensor_dimensions(src->tensor_shape()); + return win; +} + +Status CpuFloorKernel::validate(const ITensorInfo *input, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output)); + return Status{}; +} + +void CpuFloorKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + ARM_COMPUTE_ERROR_ON(tensors.empty()); + ARM_COMPUTE_ERROR_ON(_run_method == nullptr); + + const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC); + ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); + const auto len = static_cast<int>(window.x().end()) - static_cast<int>(window.x().start()); + + Window win{ window }; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator src_it(src, win); + Iterator dst_it(dst, win); + + execute_window_loop(win, [&](const Coordinates &) + { + _run_method(src_it.ptr(), dst_it.ptr(), len); + }, + src_it, dst_it); +} + +const char *CpuFloorKernel::name() const +{ + return _name.c_str(); +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuFloorKernel.h b/src/cpu/kernels/CpuFloorKernel.h new file mode 100644 index 0000000000..ffb9658190 --- /dev/null +++ b/src/cpu/kernels/CpuFloorKernel.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_FLOOR_KERNEL_H +#define ARM_COMPUTE_CPU_FLOOR_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Cpu accelarated kernel to perform a floor operation */ +class CpuFloorKernel : public ICpuKernel +{ +public: + CpuFloorKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuFloorKernel); + /** Configure kernel for a given list of arguments + * + * @param[in] src Source tensor. Data type supported: F16/F32. + * @param[out] dst Destination tensor. Same as @p src + */ + void configure(const ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuFloorKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); + /** Infer execution window + * + * @param[in] src Source tensor info. Data type supported: F16/F32. + * @param[in] dst Destination tensor info. Same as @p src + * + * @return an execution Window + */ + Window infer_window(const ITensorInfo *src, const ITensorInfo *dst); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + using FloorUKernelPtr = std::add_pointer<void(const void *, void *, int)>::type; + +private: + FloorUKernelPtr _run_method{ nullptr }; + std::string _name{}; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_FLOOR_KERNEL_H */ diff --git a/src/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp b/src/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp new file mode 100644 index 0000000000..9fbf2d54c6 --- /dev/null +++ b/src/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2016-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuGemmInterleave4x4Kernel.h" + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +using namespace arm_compute::misc::shape_calculator; + +void CpuGemmInterleave4x4Kernel::configure(const ITensorInfo *src, ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + + // dst auto inizialitation if not yet initialized + auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_interleaved_shape(*src))); + + // Perform validate step + ARM_COMPUTE_ERROR_THROW_ON(CpuGemmInterleave4x4Kernel::validate(src, dst)); + + Window win = calculate_max_window(*src, Steps(1, 4)); + ICPPKernel::configure(win); +} + +Status CpuGemmInterleave4x4Kernel::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions. + ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); + + if(dst->total_size() != 0) + { + const TensorShape dst_shape = compute_interleaved_shape(*src); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); + } + + return Status{}; +} + +void CpuGemmInterleave4x4Kernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + ARM_COMPUTE_ERROR_ON(tensors.empty()); + /* + * This kernel puts the values in a 4x4 block of Matrix A on the same row (Interleaved values) + * |a00 a01 a02 a03| + * |a10 a11 a12 a13| + * |a20 a21 a22 a23| = | a00 a10 a20 a30 || a01 a11 a21 a31 || a02 a12 a22 a32 || a03 a13 a23 a33 | + * |a30 a31 a32 a33| + * + * After this operation, the dst matrix will have the following shape: [ height * 4, ceil(width / 4.0f) ] + */ + const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC); + ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); + + const size_t window_start_x = window.x().start(); + const size_t window_end_x = window.x().end(); + + const size_t in_height = src->info()->dimension(1); + const size_t in_stride = src->info()->strides_in_bytes()[1]; + + const size_t partial_y = in_height % 4; + + const size_t element_size = src->info()->element_size(); + + // Set window for the src tensor + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + // Set window for the dst tensor + Window win_out(window); + win_out.set(Window::DimX, Window::Dimension(0, 1, 1)); + win_out.scale(Window::DimY, 0.25f); + + Iterator in(src, win); + Iterator out(dst, win_out); + + execute_window_loop(win, [&](const Coordinates & id) + { + if(id.y() + 4 <= static_cast<int>(in_height)) + { + for(size_t x = window_start_x; x < window_end_x; ++x) + { + std::memcpy(out.ptr() + (x * 4 + 0) * element_size, (in.ptr() + 0 * in_stride) + x * element_size, element_size); + std::memcpy(out.ptr() + (x * 4 + 1) * element_size, (in.ptr() + 1 * in_stride) + x * element_size, element_size); + std::memcpy(out.ptr() + (x * 4 + 2) * element_size, (in.ptr() + 2 * in_stride) + x * element_size, element_size); + std::memcpy(out.ptr() + (x * 4 + 3) * element_size, (in.ptr() + 3 * in_stride) + x * element_size, element_size); + } + } + else + { + for(size_t x = window_start_x; x < window_end_x; ++x) + { + size_t y = 0; + for(; y < partial_y; ++y) + { + std::memcpy(out.ptr() + (x * 4 + y) * element_size, (in.ptr() + y * in_stride) + x * element_size, element_size); + } + for(; y < 4; ++y) + { + std::memset(out.ptr() + (x * 4 + y) * element_size, 0, element_size); + } + } + } + }, + in, out); +} + +const char *CpuGemmInterleave4x4Kernel::name() const +{ + return "CpuGemmInterleave4x4Kernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuGemmInterleave4x4Kernel.h b/src/cpu/kernels/CpuGemmInterleave4x4Kernel.h new file mode 100644 index 0000000000..047776bd1e --- /dev/null +++ b/src/cpu/kernels/CpuGemmInterleave4x4Kernel.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2016-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_GEMM_INTERLEAVE4x4_KERNEL_H +#define ARM_COMPUTE_CPU_GEMM_INTERLEAVE4x4_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Kernel to interleave the elements of a matrix + * + * This function puts the values in a 4x4 block of Matrix A on the same row (Interleaved values) + * + * @f[ + * \left( \begin{array}{cccc} + * a00 & a01 & a02 & a03 \\ + * a10 & a11 & a12 & a13 \\ + * a20 & a21 & a22 & a23 \\ + * a30 & a31 & a32 & a33 \\ + * \end{array} \right) + * \rightarrow + * \left( \begin{array}{ccccccccccccccccc} + * a00 & a10 & a20 & a30 & a01 & a11 & a21 & a31 & a02 & a12 & a22 & a32 & a03 & a13 & a23 & a33 \\ + * \end{array} \right) + * @f] + * + * After this operation, the dst matrix will have the following shape: [ height * 4, ceil(width / 4.0f) ] + */ +class CpuGemmInterleave4x4Kernel : public ICpuKernel +{ +public: + CpuGemmInterleave4x4Kernel() = default; + /** Initialise the kernel's src and dst. + * + * @param[in] src Input tensor info. Data types supported: All + * @param[out] dst Output tensor info which stores the interleaved matrix. Data type supported: same as @p src. + */ + void configure(const ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration of @ref CpuGemmInterleave4x4Kernel + * + * Similar to @ref CpuGemmInterleave4x4Kernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_GEMM_INTERLEAVE4x4_KERNEL_H */ diff --git a/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp b/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp new file mode 100644 index 0000000000..f8bef64066 --- /dev/null +++ b/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp @@ -0,0 +1,1053 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +void inline vector_matrix_multiply_u8(Iterator &ina, Iterator &inb, Iterator &out, int width_a, int width_b, int width_out, size_t stride_b, const Window &window) +{ + execute_window_loop(window, [&](const Coordinates & id) + { + if(id.x() > width_b) + { + return; + } + + // Note: Since the input are all positives, we can use uint32_t + // Accumulators for the block 0 + uint32x4x4_t c0 = + { + { + vdupq_n_u32(0), + vdupq_n_u32(0), + vdupq_n_u32(0), + vdupq_n_u32(0) + } + }; + + auto vec_a = reinterpret_cast<const uint8_t *>(ina.ptr()); + auto matrix_b = reinterpret_cast<const uint8_t *>(inb.ptr()); + auto vec_a_end_addr = vec_a + width_a; + + // This for loop performs 8 accumulations + for(; vec_a <= (vec_a_end_addr - 8);) + { + const uint8x8_t a00_u8 = vld1_u8(vec_a); + const uint8x16_t b00_u8 = vld1q_u8(matrix_b + 0 * stride_b); + const uint8x16_t b10_u8 = vld1q_u8(matrix_b + 1 * stride_b); + const uint8x16_t b20_u8 = vld1q_u8(matrix_b + 2 * stride_b); + const uint8x16_t b30_u8 = vld1q_u8(matrix_b + 3 * stride_b); + const uint8x16_t b40_u8 = vld1q_u8(matrix_b + 4 * stride_b); + const uint8x16_t b50_u8 = vld1q_u8(matrix_b + 5 * stride_b); + const uint8x16_t b60_u8 = vld1q_u8(matrix_b + 6 * stride_b); + const uint8x16_t b70_u8 = vld1q_u8(matrix_b + 7 * stride_b); + + // Convert a00_u8 to uint16_t and get the lower part + const uint16x4x2_t a00_u16 = + { + { + vget_low_u16(vmovl_u8(a00_u8)), + vget_high_u16(vmovl_u8(a00_u8)) + } + }; + + const uint16x4x4_t b00_u16 = + { + { + vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))), + vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))), + vget_high_u16(vmovl_u8(vget_high_u8(b00_u8))) + } + }; + + const uint16x4x4_t b10_u16 = + { + { + vget_low_u16(vmovl_u8(vget_low_u8(b10_u8))), + vget_high_u16(vmovl_u8(vget_low_u8(b10_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b10_u8))), + vget_high_u16(vmovl_u8(vget_high_u8(b10_u8))) + } + }; + + const uint16x4x4_t b20_u16 = + { + { + vget_low_u16(vmovl_u8(vget_low_u8(b20_u8))), + vget_high_u16(vmovl_u8(vget_low_u8(b20_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b20_u8))), + vget_high_u16(vmovl_u8(vget_high_u8(b20_u8))) + } + }; + + const uint16x4x4_t b30_u16 = + { + { + vget_low_u16(vmovl_u8(vget_low_u8(b30_u8))), + vget_high_u16(vmovl_u8(vget_low_u8(b30_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b30_u8))), + vget_high_u16(vmovl_u8(vget_high_u8(b30_u8))) + } + }; + + const uint16x4x4_t b40_u16 = + { + { + vget_low_u16(vmovl_u8(vget_low_u8(b40_u8))), + vget_high_u16(vmovl_u8(vget_low_u8(b40_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b40_u8))), + vget_high_u16(vmovl_u8(vget_high_u8(b40_u8))) + } + }; + + const uint16x4x4_t b50_u16 = + { + { + vget_low_u16(vmovl_u8(vget_low_u8(b50_u8))), + vget_high_u16(vmovl_u8(vget_low_u8(b50_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b50_u8))), + vget_high_u16(vmovl_u8(vget_high_u8(b50_u8))) + } + }; + + const uint16x4x4_t b60_u16 = + { + { + vget_low_u16(vmovl_u8(vget_low_u8(b60_u8))), + vget_high_u16(vmovl_u8(vget_low_u8(b60_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b60_u8))), + vget_high_u16(vmovl_u8(vget_high_u8(b60_u8))) + } + }; + + const uint16x4x4_t b70_u16 = + { + { + vget_low_u16(vmovl_u8(vget_low_u8(b70_u8))), + vget_high_u16(vmovl_u8(vget_low_u8(b70_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b70_u8))), + vget_high_u16(vmovl_u8(vget_high_u8(b70_u8))) + } + }; + + // Accumulate 0: + c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16.val[0], 0); + c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16.val[0], 0); + c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16.val[0], 0); + c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16.val[0], 0); + + // Accumulate 1: + c0.val[0] = vmlal_lane_u16(c0.val[0], b10_u16.val[0], a00_u16.val[0], 1); + c0.val[1] = vmlal_lane_u16(c0.val[1], b10_u16.val[1], a00_u16.val[0], 1); + c0.val[2] = vmlal_lane_u16(c0.val[2], b10_u16.val[2], a00_u16.val[0], 1); + c0.val[3] = vmlal_lane_u16(c0.val[3], b10_u16.val[3], a00_u16.val[0], 1); + + // Accumulate 2: + c0.val[0] = vmlal_lane_u16(c0.val[0], b20_u16.val[0], a00_u16.val[0], 2); + c0.val[1] = vmlal_lane_u16(c0.val[1], b20_u16.val[1], a00_u16.val[0], 2); + c0.val[2] = vmlal_lane_u16(c0.val[2], b20_u16.val[2], a00_u16.val[0], 2); + c0.val[3] = vmlal_lane_u16(c0.val[3], b20_u16.val[3], a00_u16.val[0], 2); + + // Accumulate 3: + c0.val[0] = vmlal_lane_u16(c0.val[0], b30_u16.val[0], a00_u16.val[0], 3); + c0.val[1] = vmlal_lane_u16(c0.val[1], b30_u16.val[1], a00_u16.val[0], 3); + c0.val[2] = vmlal_lane_u16(c0.val[2], b30_u16.val[2], a00_u16.val[0], 3); + c0.val[3] = vmlal_lane_u16(c0.val[3], b30_u16.val[3], a00_u16.val[0], 3); + + // Accumulate 4: + c0.val[0] = vmlal_lane_u16(c0.val[0], b40_u16.val[0], a00_u16.val[1], 0); + c0.val[1] = vmlal_lane_u16(c0.val[1], b40_u16.val[1], a00_u16.val[1], 0); + c0.val[2] = vmlal_lane_u16(c0.val[2], b40_u16.val[2], a00_u16.val[1], 0); + c0.val[3] = vmlal_lane_u16(c0.val[3], b40_u16.val[3], a00_u16.val[1], 0); + + // Accumulate 5: + c0.val[0] = vmlal_lane_u16(c0.val[0], b50_u16.val[0], a00_u16.val[1], 1); + c0.val[1] = vmlal_lane_u16(c0.val[1], b50_u16.val[1], a00_u16.val[1], 1); + c0.val[2] = vmlal_lane_u16(c0.val[2], b50_u16.val[2], a00_u16.val[1], 1); + c0.val[3] = vmlal_lane_u16(c0.val[3], b50_u16.val[3], a00_u16.val[1], 1); + + // Accumulate 6: + c0.val[0] = vmlal_lane_u16(c0.val[0], b60_u16.val[0], a00_u16.val[1], 2); + c0.val[1] = vmlal_lane_u16(c0.val[1], b60_u16.val[1], a00_u16.val[1], 2); + c0.val[2] = vmlal_lane_u16(c0.val[2], b60_u16.val[2], a00_u16.val[1], 2); + c0.val[3] = vmlal_lane_u16(c0.val[3], b60_u16.val[3], a00_u16.val[1], 2); + + // Accumulate 7: + c0.val[0] = vmlal_lane_u16(c0.val[0], b70_u16.val[0], a00_u16.val[1], 3); + c0.val[1] = vmlal_lane_u16(c0.val[1], b70_u16.val[1], a00_u16.val[1], 3); + c0.val[2] = vmlal_lane_u16(c0.val[2], b70_u16.val[2], a00_u16.val[1], 3); + c0.val[3] = vmlal_lane_u16(c0.val[3], b70_u16.val[3], a00_u16.val[1], 3); + + vec_a += 8; + matrix_b += 8 * stride_b; + } + + // This for loop performs the left-over accumulations + for(; vec_a < vec_a_end_addr;) + { + const uint8x8_t a00_u8 = vld1_dup_u8(vec_a); + const uint8x16_t b00_u8 = vld1q_u8(matrix_b); + + const uint16x4x4_t b00_u16 = + { + { + vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))), + vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))), + vget_high_u16(vmovl_u8(vget_high_u8(b00_u8))) + } + }; + + // Convert a00_u8 to uint16_t and get the lower part + const uint16x4_t a00_u16 = vget_low_u16(vmovl_u8(a00_u8)); + + // Accumulate 0: + c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16, 0); + c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16, 0); + c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16, 0); + c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16, 0); + + vec_a += 1; + matrix_b += stride_b; + } + + auto vec_out = reinterpret_cast<int32_t *>(out.ptr()); + if(id.x() < (width_out - 16)) + { + vst1q_s32(vec_out + 0, vreinterpretq_s32_u32(c0.val[0])); + vst1q_s32(vec_out + 4, vreinterpretq_s32_u32(c0.val[1])); + vst1q_s32(vec_out + 8, vreinterpretq_s32_u32(c0.val[2])); + vst1q_s32(vec_out + 12, vreinterpretq_s32_u32(c0.val[3])); + } + else + { + auto left_over = width_out - id.x(); + for(auto k = 0; k < 4 && left_over; ++k) + { + for(auto j = 0; j < 4 && left_over; ++j, --left_over) + { + *(vec_out + k * 4 + j) = c0.val[k][j]; + } + } + } + }, + ina, inb, out); +} + +void inline vector_matrix_multiply_s8(Iterator &ina, Iterator &inb, Iterator &out, int width_a, int width_b, int width_out, size_t stride_b, const Window &window) +{ + execute_window_loop(window, [&](const Coordinates & id) + { + if(id.x() > width_b) + { + return; + } + + // Accumulators for the block 0 + int32x4x4_t c0 = + { + { + vdupq_n_s32(0), + vdupq_n_s32(0), + vdupq_n_s32(0), + vdupq_n_s32(0) + } + }; + + auto vec_a = reinterpret_cast<const int8_t *>(ina.ptr()); + auto matrix_b = reinterpret_cast<const int8_t *>(inb.ptr()); + auto vec_a_end_addr = vec_a + width_a; + + // This for loop performs 8 accumulations + for(; vec_a <= (vec_a_end_addr - 8);) + { + const int8x8_t a00_s8 = vld1_s8(vec_a); + const int8x16_t b00_s8 = vld1q_s8(matrix_b + 0 * stride_b); + const int8x16_t b10_s8 = vld1q_s8(matrix_b + 1 * stride_b); + const int8x16_t b20_s8 = vld1q_s8(matrix_b + 2 * stride_b); + const int8x16_t b30_s8 = vld1q_s8(matrix_b + 3 * stride_b); + const int8x16_t b40_s8 = vld1q_s8(matrix_b + 4 * stride_b); + const int8x16_t b50_s8 = vld1q_s8(matrix_b + 5 * stride_b); + const int8x16_t b60_s8 = vld1q_s8(matrix_b + 6 * stride_b); + const int8x16_t b70_s8 = vld1q_s8(matrix_b + 7 * stride_b); + + // Convert a00_s8 to int16_t and get the lower part + const int16x4x2_t a00_s16 = + { + { + vget_low_s16(vmovl_s8(a00_s8)), + vget_high_s16(vmovl_s8(a00_s8)) + } + }; + + const int16x4x4_t b00_s16 = + { + { + vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))), + vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))), + vget_high_s16(vmovl_s8(vget_high_s8(b00_s8))) + } + }; + + const int16x4x4_t b10_s16 = + { + { + vget_low_s16(vmovl_s8(vget_low_s8(b10_s8))), + vget_high_s16(vmovl_s8(vget_low_s8(b10_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b10_s8))), + vget_high_s16(vmovl_s8(vget_high_s8(b10_s8))) + } + }; + + const int16x4x4_t b20_s16 = + { + { + vget_low_s16(vmovl_s8(vget_low_s8(b20_s8))), + vget_high_s16(vmovl_s8(vget_low_s8(b20_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b20_s8))), + vget_high_s16(vmovl_s8(vget_high_s8(b20_s8))) + } + }; + + const int16x4x4_t b30_s16 = + { + { + vget_low_s16(vmovl_s8(vget_low_s8(b30_s8))), + vget_high_s16(vmovl_s8(vget_low_s8(b30_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b30_s8))), + vget_high_s16(vmovl_s8(vget_high_s8(b30_s8))) + } + }; + + const int16x4x4_t b40_s16 = + { + { + vget_low_s16(vmovl_s8(vget_low_s8(b40_s8))), + vget_high_s16(vmovl_s8(vget_low_s8(b40_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b40_s8))), + vget_high_s16(vmovl_s8(vget_high_s8(b40_s8))) + } + }; + + const int16x4x4_t b50_s16 = + { + { + vget_low_s16(vmovl_s8(vget_low_s8(b50_s8))), + vget_high_s16(vmovl_s8(vget_low_s8(b50_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b50_s8))), + vget_high_s16(vmovl_s8(vget_high_s8(b50_s8))) + } + }; + + const int16x4x4_t b60_s16 = + { + { + vget_low_s16(vmovl_s8(vget_low_s8(b60_s8))), + vget_high_s16(vmovl_s8(vget_low_s8(b60_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b60_s8))), + vget_high_s16(vmovl_s8(vget_high_s8(b60_s8))) + } + }; + + const int16x4x4_t b70_s16 = + { + { + vget_low_s16(vmovl_s8(vget_low_s8(b70_s8))), + vget_high_s16(vmovl_s8(vget_low_s8(b70_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b70_s8))), + vget_high_s16(vmovl_s8(vget_high_s8(b70_s8))) + } + }; + + // Accumulate 0: + c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16.val[0], 0); + c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16.val[0], 0); + c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16.val[0], 0); + c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16.val[0], 0); + + // Accumulate 1: + c0.val[0] = vmlal_lane_s16(c0.val[0], b10_s16.val[0], a00_s16.val[0], 1); + c0.val[1] = vmlal_lane_s16(c0.val[1], b10_s16.val[1], a00_s16.val[0], 1); + c0.val[2] = vmlal_lane_s16(c0.val[2], b10_s16.val[2], a00_s16.val[0], 1); + c0.val[3] = vmlal_lane_s16(c0.val[3], b10_s16.val[3], a00_s16.val[0], 1); + + // Accumulate 2: + c0.val[0] = vmlal_lane_s16(c0.val[0], b20_s16.val[0], a00_s16.val[0], 2); + c0.val[1] = vmlal_lane_s16(c0.val[1], b20_s16.val[1], a00_s16.val[0], 2); + c0.val[2] = vmlal_lane_s16(c0.val[2], b20_s16.val[2], a00_s16.val[0], 2); + c0.val[3] = vmlal_lane_s16(c0.val[3], b20_s16.val[3], a00_s16.val[0], 2); + + // Accumulate 3: + c0.val[0] = vmlal_lane_s16(c0.val[0], b30_s16.val[0], a00_s16.val[0], 3); + c0.val[1] = vmlal_lane_s16(c0.val[1], b30_s16.val[1], a00_s16.val[0], 3); + c0.val[2] = vmlal_lane_s16(c0.val[2], b30_s16.val[2], a00_s16.val[0], 3); + c0.val[3] = vmlal_lane_s16(c0.val[3], b30_s16.val[3], a00_s16.val[0], 3); + + // Accumulate 4: + c0.val[0] = vmlal_lane_s16(c0.val[0], b40_s16.val[0], a00_s16.val[1], 0); + c0.val[1] = vmlal_lane_s16(c0.val[1], b40_s16.val[1], a00_s16.val[1], 0); + c0.val[2] = vmlal_lane_s16(c0.val[2], b40_s16.val[2], a00_s16.val[1], 0); + c0.val[3] = vmlal_lane_s16(c0.val[3], b40_s16.val[3], a00_s16.val[1], 0); + + // Accumulate 5: + c0.val[0] = vmlal_lane_s16(c0.val[0], b50_s16.val[0], a00_s16.val[1], 1); + c0.val[1] = vmlal_lane_s16(c0.val[1], b50_s16.val[1], a00_s16.val[1], 1); + c0.val[2] = vmlal_lane_s16(c0.val[2], b50_s16.val[2], a00_s16.val[1], 1); + c0.val[3] = vmlal_lane_s16(c0.val[3], b50_s16.val[3], a00_s16.val[1], 1); + + // Accumulate 6: + c0.val[0] = vmlal_lane_s16(c0.val[0], b60_s16.val[0], a00_s16.val[1], 2); + c0.val[1] = vmlal_lane_s16(c0.val[1], b60_s16.val[1], a00_s16.val[1], 2); + c0.val[2] = vmlal_lane_s16(c0.val[2], b60_s16.val[2], a00_s16.val[1], 2); + c0.val[3] = vmlal_lane_s16(c0.val[3], b60_s16.val[3], a00_s16.val[1], 2); + + // Accumulate 7: + c0.val[0] = vmlal_lane_s16(c0.val[0], b70_s16.val[0], a00_s16.val[1], 3); + c0.val[1] = vmlal_lane_s16(c0.val[1], b70_s16.val[1], a00_s16.val[1], 3); + c0.val[2] = vmlal_lane_s16(c0.val[2], b70_s16.val[2], a00_s16.val[1], 3); + c0.val[3] = vmlal_lane_s16(c0.val[3], b70_s16.val[3], a00_s16.val[1], 3); + + vec_a += 8; + matrix_b += 8 * stride_b; + } + + // This for loop performs the left-over accumulations + for(; vec_a < vec_a_end_addr;) + { + const int8x8_t a00_s8 = vld1_dup_s8(vec_a); + const int8x16_t b00_s8 = vld1q_s8(matrix_b); + + const int16x4x4_t b00_s16 = + { + { + vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))), + vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))), + vget_high_s16(vmovl_s8(vget_high_s8(b00_s8))) + } + }; + + // Convert a00_s8 to uint16_t and get the lower part + const int16x4_t a00_s16 = vget_low_s16(vmovl_s8(a00_s8)); + + // Accumulate 0: + c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16, 0); + c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16, 0); + c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16, 0); + c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16, 0); + + vec_a += 1; + matrix_b += stride_b; + } + + auto vec_out = reinterpret_cast<int32_t *>(out.ptr()); + if(id.x() < (width_out - 16)) + { + vst1q_s32(vec_out + 0, c0.val[0]); + vst1q_s32(vec_out + 4, c0.val[1]); + vst1q_s32(vec_out + 8, c0.val[2]); + vst1q_s32(vec_out + 12, c0.val[3]); + } + else + { + auto left_over = width_out - id.x(); + for(auto k = 0; k < 4 && left_over; ++k) + { + for(auto j = 0; j < 4 && left_over; ++j, --left_over) + { + *(vec_out + k * 4 + j) = c0.val[k][j]; + } + } + } + }, + ina, inb, out); +} + +void inline matrix_multiply_u8(Iterator &ina, Iterator &inb, Iterator &out, int width_b, const TensorInfo &out_info, const Window &window) +{ + const auto width_out = static_cast<int>(out_info.dimension(0)); + const auto height_out = static_cast<int>(out_info.dimension(1)); + const size_t out_stride = out_info.strides_in_bytes()[1] / out_info.element_size(); + execute_window_loop(window, [&](const Coordinates & id) + { + const uint8_t *mtx_a0 = ina.ptr(); + const uint8_t *mtx_b0 = inb.ptr(); + + // Note: Since the input are all positives, we can use uint32_t + // Accumulators for the block 0 + uint32x4x4_t c0 = + { + { + vdupq_n_u32(0), + vdupq_n_u32(0), + vdupq_n_u32(0), + vdupq_n_u32(0) + } + }; + + // Accumulators for the block 1 + uint32x4x4_t c1 = + { + { + vdupq_n_u32(0), + vdupq_n_u32(0), + vdupq_n_u32(0), + vdupq_n_u32(0) + } + }; + + // Accumulators for the block 2 + uint32x4x4_t c2 = + { + { + vdupq_n_u32(0), + vdupq_n_u32(0), + vdupq_n_u32(0), + vdupq_n_u32(0) + } + }; + + // Accumulators for the block 3 + uint32x4x4_t c3 = + { + { + vdupq_n_u32(0), + vdupq_n_u32(0), + vdupq_n_u32(0), + vdupq_n_u32(0) + } + }; + + for(int k = 0; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16) + { + const uint8x8_t a00_u8 = vld1_u8(mtx_a0); + const uint8x16_t b00_u8 = vld1q_u8(mtx_b0); + + // Convert a00_u8 to uint16_t and get the lower part + const uint16x4_t a00_u16 = vget_low_u16(vmovl_u8(a00_u8)); + + // Convert b00_s8 to uint16_t + const uint16x4x4_t b00_u16 = + { + { + vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))), + vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))), + vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))), + vget_high_u16(vmovl_u8(vget_high_u8(b00_u8))) + } + }; + + // 4x4 block 0 + c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16, 0); + c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16, 0); + c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16, 0); + c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16, 0); + + // 4x4 block 1 + c1.val[0] = vmlal_lane_u16(c1.val[0], b00_u16.val[0], a00_u16, 1); + c1.val[1] = vmlal_lane_u16(c1.val[1], b00_u16.val[1], a00_u16, 1); + c1.val[2] = vmlal_lane_u16(c1.val[2], b00_u16.val[2], a00_u16, 1); + c1.val[3] = vmlal_lane_u16(c1.val[3], b00_u16.val[3], a00_u16, 1); + + // 4x4 block 2 + c2.val[0] = vmlal_lane_u16(c2.val[0], b00_u16.val[0], a00_u16, 2); + c2.val[1] = vmlal_lane_u16(c2.val[1], b00_u16.val[1], a00_u16, 2); + c2.val[2] = vmlal_lane_u16(c2.val[2], b00_u16.val[2], a00_u16, 2); + c2.val[3] = vmlal_lane_u16(c2.val[3], b00_u16.val[3], a00_u16, 2); + + // 4x4 block 3 + c3.val[0] = vmlal_lane_u16(c3.val[0], b00_u16.val[0], a00_u16, 3); + c3.val[1] = vmlal_lane_u16(c3.val[1], b00_u16.val[1], a00_u16, 3); + c3.val[2] = vmlal_lane_u16(c3.val[2], b00_u16.val[2], a00_u16, 3); + c3.val[3] = vmlal_lane_u16(c3.val[3], b00_u16.val[3], a00_u16, 3); + } + + auto mtx_out = reinterpret_cast<int32_t *>(out.ptr()); + + if(id.y() < height_out && id.x() < (width_out - 16)) + { + vst1q_s32(mtx_out + 0 * out_stride + 0, vreinterpretq_s32_u32(c0.val[0])); + vst1q_s32(mtx_out + 0 * out_stride + 4, vreinterpretq_s32_u32(c0.val[1])); + vst1q_s32(mtx_out + 0 * out_stride + 8, vreinterpretq_s32_u32(c0.val[2])); + vst1q_s32(mtx_out + 0 * out_stride + 12, vreinterpretq_s32_u32(c0.val[3])); + if(id.y() + 1 < height_out) + { + vst1q_s32(mtx_out + 1 * out_stride + 0, vreinterpretq_s32_u32(c1.val[0])); + vst1q_s32(mtx_out + 1 * out_stride + 4, vreinterpretq_s32_u32(c1.val[1])); + vst1q_s32(mtx_out + 1 * out_stride + 8, vreinterpretq_s32_u32(c1.val[2])); + vst1q_s32(mtx_out + 1 * out_stride + 12, vreinterpretq_s32_u32(c1.val[3])); + if(id.y() + 2 < height_out) + { + vst1q_s32(mtx_out + 2 * out_stride + 0, vreinterpretq_s32_u32(c2.val[0])); + vst1q_s32(mtx_out + 2 * out_stride + 4, vreinterpretq_s32_u32(c2.val[1])); + vst1q_s32(mtx_out + 2 * out_stride + 8, vreinterpretq_s32_u32(c2.val[2])); + vst1q_s32(mtx_out + 2 * out_stride + 12, vreinterpretq_s32_u32(c2.val[3])); + if(id.y() + 3 < height_out) + { + vst1q_s32(mtx_out + 3 * out_stride + 0, vreinterpretq_s32_u32(c3.val[0])); + vst1q_s32(mtx_out + 3 * out_stride + 4, vreinterpretq_s32_u32(c3.val[1])); + vst1q_s32(mtx_out + 3 * out_stride + 8, vreinterpretq_s32_u32(c3.val[2])); + vst1q_s32(mtx_out + 3 * out_stride + 12, vreinterpretq_s32_u32(c3.val[3])); + } + } + } + } + else + { + const auto left_over_value = width_out - id.x(); + auto left_over = left_over_value; + for(auto k = 0; k < 4 && left_over; ++k) + { + for(auto j = 0; j < 4 && left_over; ++j, --left_over) + { + *(mtx_out + k * 4 + j) = c0.val[k][j]; + } + } + if(id.y() + 1 < height_out) + { + left_over = left_over_value; + for(auto k = 0; k < 4 && left_over; ++k) + { + for(auto j = 0; j < 4 && left_over; ++j, --left_over) + { + *(mtx_out + out_stride + k * 4 + j) = c1.val[k][j]; + } + } + if(id.y() + 2 < height_out) + { + left_over = left_over_value; + for(auto k = 0; k < 4 && left_over; ++k) + { + for(auto j = 0; j < 4 && left_over; ++j, --left_over) + { + *(mtx_out + out_stride * 2 + k * 4 + j) = c2.val[k][j]; + } + } + if(id.y() + 3 < height_out) + { + left_over = left_over_value; + for(auto k = 0; k < 4 && left_over; ++k) + { + for(auto j = 0; j < 4 && left_over; ++j, --left_over) + { + *(mtx_out + out_stride * 3 + k * 4 + j) = c3.val[k][j]; + } + } + } + } + } + } + }, + ina, inb, out); +} + +void inline matrix_multiply_s8(Iterator &ina, Iterator &inb, Iterator &out, int width_b, const TensorInfo &out_info, const Window &window) +{ + const auto width_out = static_cast<int>(out_info.dimension(0)); + const auto height_out = static_cast<int>(out_info.dimension(1)); + const size_t out_stride = out_info.strides_in_bytes()[1] / out_info.element_size(); + // The implementation assumes that the matrix A and Matrix B have been reshaped respectively with CpuGemmInterleave4x4 and CpuGemmTranspose1xW + // The reshaping of the matrices helps to have a cache friendly implementation and helps to avoid the data re-arrangements needed for computing 16x4 elements per iteration + // All the values needed for computing a single 4x4 block will be read from consecutive memory positions + execute_window_loop(window, [&](const Coordinates & id) + { + auto *mtx_a0 = reinterpret_cast<const int8_t *>(ina.ptr()); + auto *mtx_b0 = reinterpret_cast<const int8_t *>(inb.ptr()); + + // Note: Since the input are all positives, we can use uint32_t + // Accumulators for the block 0 + int32x4x4_t c0 = + { + { + vdupq_n_s32(0), + vdupq_n_s32(0), + vdupq_n_s32(0), + vdupq_n_s32(0) + } + }; + + // Accumulators for the block 1 + int32x4x4_t c1 = + { + { + vdupq_n_s32(0), + vdupq_n_s32(0), + vdupq_n_s32(0), + vdupq_n_s32(0) + } + }; + + // Accumulators for the block 2 + int32x4x4_t c2 = + { + { + vdupq_n_s32(0), + vdupq_n_s32(0), + vdupq_n_s32(0), + vdupq_n_s32(0) + } + }; + + // Accumulators for the block 3 + int32x4x4_t c3 = + { + { + vdupq_n_s32(0), + vdupq_n_s32(0), + vdupq_n_s32(0), + vdupq_n_s32(0) + } + }; + + for(int k = 0; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16) + { + const int8x8_t a00_s8 = vld1_s8(mtx_a0); + const int8x16_t b00_s8 = vld1q_s8(mtx_b0); + + // Convert a00_s8 to uint16_t and get the lower part + const int16x4_t a00_s16 = vget_low_s16(vmovl_s8(a00_s8)); + + // Convert b00_s8 to int16_t + const int16x4x4_t b00_s16 = + { + { + vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))), + vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))), + vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))), + vget_high_s16(vmovl_s8(vget_high_s8(b00_s8))) + } + }; + + // 4x4 block 0 + c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16, 0); + c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16, 0); + c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16, 0); + c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16, 0); + + // 4x4 block 1 + c1.val[0] = vmlal_lane_s16(c1.val[0], b00_s16.val[0], a00_s16, 1); + c1.val[1] = vmlal_lane_s16(c1.val[1], b00_s16.val[1], a00_s16, 1); + c1.val[2] = vmlal_lane_s16(c1.val[2], b00_s16.val[2], a00_s16, 1); + c1.val[3] = vmlal_lane_s16(c1.val[3], b00_s16.val[3], a00_s16, 1); + + // 4x4 block 2 + c2.val[0] = vmlal_lane_s16(c2.val[0], b00_s16.val[0], a00_s16, 2); + c2.val[1] = vmlal_lane_s16(c2.val[1], b00_s16.val[1], a00_s16, 2); + c2.val[2] = vmlal_lane_s16(c2.val[2], b00_s16.val[2], a00_s16, 2); + c2.val[3] = vmlal_lane_s16(c2.val[3], b00_s16.val[3], a00_s16, 2); + + // 4x4 block 3 + c3.val[0] = vmlal_lane_s16(c3.val[0], b00_s16.val[0], a00_s16, 3); + c3.val[1] = vmlal_lane_s16(c3.val[1], b00_s16.val[1], a00_s16, 3); + c3.val[2] = vmlal_lane_s16(c3.val[2], b00_s16.val[2], a00_s16, 3); + c3.val[3] = vmlal_lane_s16(c3.val[3], b00_s16.val[3], a00_s16, 3); + } + auto mtx_out = reinterpret_cast<int32_t *>(out.ptr()); + if(id.y() < height_out && id.x() < (width_out - 16)) + { + vst1q_s32(mtx_out + 0 * out_stride + 0, c0.val[0]); + vst1q_s32(mtx_out + 0 * out_stride + 4, c0.val[1]); + vst1q_s32(mtx_out + 0 * out_stride + 8, c0.val[2]); + vst1q_s32(mtx_out + 0 * out_stride + 12, c0.val[3]); + if(id.y() + 1 < height_out) + { + vst1q_s32(mtx_out + 1 * out_stride + 0, c1.val[0]); + vst1q_s32(mtx_out + 1 * out_stride + 4, c1.val[1]); + vst1q_s32(mtx_out + 1 * out_stride + 8, c1.val[2]); + vst1q_s32(mtx_out + 1 * out_stride + 12, c1.val[3]); + if(id.y() + 2 < height_out) + { + vst1q_s32(mtx_out + 2 * out_stride + 0, c2.val[0]); + vst1q_s32(mtx_out + 2 * out_stride + 4, c2.val[1]); + vst1q_s32(mtx_out + 2 * out_stride + 8, c2.val[2]); + vst1q_s32(mtx_out + 2 * out_stride + 12, c2.val[3]); + if(id.y() + 3 < height_out) + { + vst1q_s32(mtx_out + 3 * out_stride + 0, c3.val[0]); + vst1q_s32(mtx_out + 3 * out_stride + 4, c3.val[1]); + vst1q_s32(mtx_out + 3 * out_stride + 8, c3.val[2]); + vst1q_s32(mtx_out + 3 * out_stride + 12, c3.val[3]); + } + } + } + } + else if(id.y() < height_out) + { + const auto left_over_value = width_out - id.x(); + auto left_over = left_over_value; + for(auto k = 0; k < 4 && left_over; ++k) + { + for(auto j = 0; j < 4 && left_over; ++j, --left_over) + { + *(mtx_out + k * 4 + j) = c0.val[k][j]; + } + } + if(id.y() + 1 < height_out) + { + left_over = left_over_value; + for(auto k = 0; k < 4 && left_over; ++k) + { + for(auto j = 0; j < 4 && left_over; ++j, --left_over) + { + *(mtx_out + out_stride + k * 4 + j) = c1.val[k][j]; + } + } + if(id.y() + 2 < height_out) + { + left_over = left_over_value; + for(auto k = 0; k < 4 && left_over; ++k) + { + for(auto j = 0; j < 4 && left_over; ++j, --left_over) + { + *(mtx_out + out_stride * 2 + k * 4 + j) = c2.val[k][j]; + } + } + if(id.y() + 3 < height_out) + { + left_over = left_over_value; + for(auto k = 0; k < 4 && left_over; ++k) + { + for(auto j = 0; j < 4 && left_over; ++j, --left_over) + { + *(mtx_out + out_stride * 3 + k * 4 + j) = c3.val[k][j]; + } + } + } + } + } + } + + }, + ina, inb, out); +} + +Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S8, DataType::U8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::S8, DataType::U8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); + + TensorShape in0_shape = src0->tensor_shape(); + TensorShape in1_shape = src1->tensor_shape(); + TensorShape out_shape = dst->tensor_shape(); + + // Check vector-by-matrix case + if(out_shape[1] == 1) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[0] != in1_shape[1], "The number of input0's columns must be equal to input1's rows"); + } + else + { + in0_shape.collapse(2); + in1_shape.collapse(2); + out_shape.collapse(2); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[2] != out_shape[2], "Output tensor must have the same number of batches of input0 tensor"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(in1_shape[2] != 1 && in0_shape[2] != in1_shape[2], "Input1 tensor must have the same number of batches of input0 or the number of batches must be set to 1"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(in1_shape[0] % 16, "Input1's width must be a multiple of 16"); + } + + return Status{}; +} +} // namespace + +void CpuGemmLowpMatrixMultiplyKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) +{ + ARM_COMPUTE_UNUSED(src0); + ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst)); + + TensorShape in1_shape = src1->tensor_shape(); + in1_shape.collapse(2); + + _slide_matrix_b = in1_shape[2] != 1; + + constexpr unsigned int num_elems_processed_per_iteration_x = 16; + constexpr unsigned int num_elems_processed_per_iteration_y = 4; + + Window win; + // Check if the output tensor is a vector. If so,the kernel runs the vector-matrix multiplication + if((dst->dimension(1) == 1)) + { + // Configure kernel window + win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x)); + } + else + { + win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + } + + ICpuKernel::configure(win); +} + +Status CpuGemmLowpMatrixMultiplyKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst)); + return Status{}; +} + +void CpuGemmLowpMatrixMultiplyKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + auto src0 = tensors.get_const_tensor(TensorType::ACL_SRC_0); + auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + // Check if the output tensor is a vector. If so,the kernel runs the vector-matrix multiplication path + if((dst->info()->dimension(1) == 1)) + { + const auto width_matrix_a = static_cast<int>(src0->info()->dimension(0)); + const auto width_matrix_b = static_cast<int>(src1->info()->dimension(0)); + const auto width_out = static_cast<int>(dst->info()->dimension(0)); + const auto in_b_stride = static_cast<int>(src1->info()->strides_in_bytes()[1] / data_size_from_type(src1->info()->data_type())); + + // The implementation computes 16 elements per iteration + const int window_start_x = 16 * info.thread_id; + const int window_step_x = 16 * info.num_threads; + // Make sure (window_end_x - window_start_x) is a multiple of window_step_x + const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x; + + Window win_out(window); + win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x)); + win_out.set(Window::DimY, Window::Dimension(0, 1, 1)); + + Window win_a(window); + win_a.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_a.set(Window::DimY, Window::Dimension(0, 0, 0)); + + Window win_b; + // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 + // This scenario can happen when the the matrix multiplication is used to perform a convolution operation + if(src1->info()->num_dimensions() >= 3) + { + win_b = window; + } + win_b.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x)); + win_b.set(Window::DimY, Window::Dimension(0, 1, 1)); + + Iterator ina(src0, win_a); + Iterator inb(src1, win_b); + Iterator out(dst, win_out); + + switch(src0->info()->data_type()) + { + case DataType::S8: + case DataType::QASYMM8_SIGNED: + { + vector_matrix_multiply_s8(ina, inb, out, width_matrix_a, width_matrix_b, width_out, in_b_stride, window); + break; + } + case DataType::U8: + case DataType::QASYMM8: + { + vector_matrix_multiply_u8(ina, inb, out, width_matrix_a, width_matrix_b, width_out, in_b_stride, window); + break; + } + default: + { + ARM_COMPUTE_ERROR("Not supported"); + break; + } + } + } + else + { + const size_t in_b_stride = src1->info()->strides_in_bytes()[1]; + const int width_b = src1->info()->dimension(0); + + // Set step_x and step_y for matrix A. Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the output matrix + Window win_a(window); + win_a.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, window.y().end() / 4, 1)); + + // Set step_x and step_y for matrix B. Scale by a factor of 16 the X range as the input transposed matrix A has 16 times less the columns of the output matrix + Window win_b; + // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 + // This scenario can happen when the the matrix multiplication is used to perform a convolution operation + if(_slide_matrix_b) + { + win_b = window; + } + win_b.set(Window::DimX, Window::Dimension(window.x().start() / 16, window.x().end() / 16, in_b_stride)); + win_b.set(Window::DimY, Window::Dimension(0, 0, 0)); + + // The step x and step y for the output matrix has been already set using in configure() + Iterator ina(src0, win_a); + Iterator inb(src1, win_b); + Iterator out(dst, window); + + switch(src0->info()->data_type()) + { + case DataType::S8: + case DataType::QASYMM8_SIGNED: + { + matrix_multiply_s8(ina, inb, out, width_b, *dst->info(), window); + break; + } + case DataType::U8: + case DataType::QASYMM8: + { + matrix_multiply_u8(ina, inb, out, width_b, *dst->info(), window); + break; + } + default: + { + ARM_COMPUTE_ERROR("Not supported"); + break; + } + } + } +} + +const char *CpuGemmLowpMatrixMultiplyKernel::name() const +{ + return "CpuGemmLowpMatrixMultiplyKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute
\ No newline at end of file diff --git a/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h b/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h new file mode 100644 index 0000000000..083ee187ef --- /dev/null +++ b/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_GEMMLOWP_MATRIXMULTIPLY_KERNEL_H +#define ARM_COMPUTE_CPU_GEMMLOWP_MATRIXMULTIPLY_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Kernel to multiply matrices + * + * @note @ref CpuGemmLowpMatrixMultiplyKernel low precision matrix product kernel + * This kernel performs the following computation: + * + * -# Convert a values from int8 to int32 + * -# Convert b values from int8 to int32 + * -# Compute the int32 matrix product of the resulting a * b and store the result as int32 + * + */ +class CpuGemmLowpMatrixMultiplyKernel : public ICpuKernel +{ +public: + /** Default constructor */ + CpuGemmLowpMatrixMultiplyKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpMatrixMultiplyKernel); + /** Initialise the kernel's input and output. + * + * The input matrices @p src0 and @p src1 must be the output of the kernels: @ref CpuGemmInterleave4x4Kernel and @ref CpuGemmTranspose1xWKernel. These two + * kernels change the layout of the original matrices to be more cache-friendly. + * + * @param[in] src0 Input tensor info containing the interleaved Matrix A. Data type supported: U8/QASYMM8/S8/QASYMM8_SIGNED + * @param[in] src1 Input tensor info containing the transposed1xW Matrix B. Data type supported: U8/QASYMM8/S8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL + * @param[out] dst Output tensor info to store the result of matrix multiplication. Data type supported: S32 + */ + void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuGemmLowpMatrixMultiplyKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + bool _slide_matrix_b{ true }; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /*ARM_COMPUTE_CPU_GEMMLOWP_MATRIXMULTIPLY_KERNEL_H*/ diff --git a/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp b/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp new file mode 100644 index 0000000000..534076b97c --- /dev/null +++ b/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp @@ -0,0 +1,396 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h" + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/KernelDescriptors.h" +#include "arm_compute/core/TensorInfo.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +Status validate_arguments_matrix_a_reduction(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_ON_MSG(info.is_reshaped == true, "Not supported"); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); + + if(dst->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src->dimension(1), "Output vector must have length equal to the number of rows of the input matrix"); + } + return Status{}; +} +Status validate_arguments_matrix_b_reduction(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_ON_MSG(info.is_reshaped == true, "Not supported"); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); + + if(dst->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src->dimension(0), "Output vector must have length equal to the number of columns of the input matrix"); + } + return Status{}; +} +} // namespace + +void CpuGemmLowpMatrixAReductionKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info) +{ + // Perform validate step + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_a_reduction(src, dst, info)); + _k = info.k; + _scalar = info.scalar; + _mul_by_scalar = info.mul_by_scalar; + + switch(src->data_type()) + { + case DataType::QASYMM8: + _func = &CpuGemmLowpMatrixAReductionKernel::run_internal<uint8_t>; + break; + case DataType::QASYMM8_SIGNED: + case DataType::QSYMM8: + case DataType::QSYMM8_PER_CHANNEL: + _func = &CpuGemmLowpMatrixAReductionKernel::run_internal<int8_t>; + break; + default: + ARM_COMPUTE_ERROR("Unsupported data type"); + } + + // Output auto initialization if not yet initialized + auto_init_if_empty(*dst, TensorShape(src->dimension(1)), 1, DataType::S32); + + Window win = calculate_max_window(*dst, Steps(1)); + ICpuKernel::configure(win); +} + +Status CpuGemmLowpMatrixAReductionKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(src, dst, info)); + return Status{}; +} + +template <typename T> +void CpuGemmLowpMatrixAReductionKernel::run_internal(const ITensor *src, ITensor *dst, const arm_compute::Window &window) +{ + // Intermediate and final accumulator types + using TIAcc = wrapper::traits::promote_t<T>; + using TAcc = wrapper::traits::promote_t<TIAcc>; + + Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY); + + Window win_input(collapsed_window); + win_input.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_input.set(Window::DimY, Window::Dimension(0, 0, 0)); + win_input.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + Iterator in(src, win_input); + Iterator out(dst, collapsed_window); + + execute_window_loop(collapsed_window, [&](const Coordinates & id) + { + auto vsum_row = wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}); + TAcc sum_row = 0; + + const T *matrix_a = reinterpret_cast<const T *>((in.ptr() + id.x() * src->info()->strides_in_bytes()[1] + id.y() * src->info()->strides_in_bytes()[2])); + +#if __arm__ + asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a)); +#endif /* __arm__ */ + + int i = 0; + // This for loop performs 16 accumulations + for(; i <= (_k - 16); i += 16) + { + const auto a0_d8 = wrapper::vloadq(matrix_a + i); + + // Partial accumulations in U16 + const auto tmp_sum0 = wrapper::vaddl(wrapper::vgetlow(a0_d8), wrapper::vgethigh(a0_d8)); + + // Accumulate to U32 + vsum_row = wrapper::vadd(vsum_row, wrapper::vpaddl(tmp_sum0)); + } + + // This for loop performs the leftover accumulations + for(; i < _k; ++i) + { + sum_row += static_cast<TAcc>(matrix_a[i]); + } + +#if defined(__aarch64__) + // Reduction operation available on 64 bit architectures only + sum_row += wrapper::vaddv(vsum_row); +#else // __aarch64__ + auto tmp = wrapper::vpadd(wrapper::vgethigh(vsum_row), wrapper::vgetlow(vsum_row)); + tmp = wrapper::vpadd(tmp, tmp); + + sum_row += wrapper::vgetlane(tmp, 0); +#endif // __aarch64__ + + // Multiply by scalar if necessary + if(_mul_by_scalar) + { + sum_row *= _scalar; + } + + *(reinterpret_cast<int *>(out.ptr())) = static_cast<int32_t>(sum_row); + }, + in, out); +} + +void CpuGemmLowpMatrixAReductionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + auto src = tensors.get_const_tensor(TensorType::ACL_SRC); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + (this->*_func)(src, dst, window); +} + +const char *CpuGemmLowpMatrixAReductionKernel::name() const +{ + return "CpuGemmLowpMatrixAReductionKernel"; +} + +void CpuGemmLowpMatrixBReductionKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(src, dst, info)); + + _k = info.k; + _scalar = info.scalar; + _mul_by_scalar = info.mul_by_scalar; + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 16; + + switch(src->data_type()) + { + case DataType::QASYMM8: + _func = &CpuGemmLowpMatrixBReductionKernel::run_internal<uint8_t>; + break; + case DataType::QASYMM8_SIGNED: + case DataType::QSYMM8: + case DataType::QSYMM8_PER_CHANNEL: + _func = &CpuGemmLowpMatrixBReductionKernel::run_internal<int8_t>; + break; + default: + ARM_COMPUTE_ERROR("Unsupported data type"); + } + + // Output auto initialization if not yet initialized + auto_init_if_empty(*dst, TensorShape(src->dimension(0)), 1, DataType::S32); + + // Configure kernel window + Window win = calculate_max_window_horizontal(*dst, Steps(num_elems_processed_per_iteration)); + ICpuKernel::configure(win); +} + +Status CpuGemmLowpMatrixBReductionKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(src, dst, info)); + return Status{}; +} + +template <typename T> +void CpuGemmLowpMatrixBReductionKernel::run_internal(const ITensor *src, ITensor *dst, const Window &window, const ThreadInfo &info) +{ + // Intermediate and final accumulator types + using TIAcc = wrapper::traits::promote_t<T>; + using TAcc = wrapper::traits::promote_t<TIAcc>; + + Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY); + const auto vec_scalar = wrapper::vdup_n(static_cast<TAcc>(_scalar), wrapper::traits::vector_128_tag{}); + + const auto width_matrix_b = static_cast<int>(src->info()->dimension(0)); + const auto in_b_stride = static_cast<int>(src->info()->strides_in_bytes()[1]); + + // The implementation computes 16 elements per iteration + const int window_start_x = 16 * info.thread_id; + const int window_step_x = 16 * info.num_threads; + // Make sure (window_end_x - window_start_x) is a multiple of window_step_x + const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x; + + Window win_out(collapsed_window); + win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x)); + + Window win_in(win_out); + win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + win_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + Iterator inb(src, win_in); + Iterator out(dst, win_out); + + execute_window_loop(win_out, [&](const Coordinates & id) + { + if(id.x() > width_matrix_b) + { + return; + } + + // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation + typename wrapper::traits::neon_bitvector<TAcc, wrapper::traits::BitWidth::W128>::type sum_col[4] = + { + wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}), + wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}), + wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}), + wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}) + }; + + const auto *matrix_b = reinterpret_cast<const T *>(inb.ptr() + id.y() * src->info()->strides_in_bytes()[2]); + +#if __arm__ + asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b)); + asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b + in_b_stride)); +#endif /* __arm__ */ + + int i = 0; + // This for loop performs 4 accumulations + for(; i <= (_k - 4); i += 4) + { + const auto b0_u8 = wrapper::vloadq(matrix_b + 0 * in_b_stride); + const auto b1_u8 = wrapper::vloadq(matrix_b + 1 * in_b_stride); + const auto b2_u8 = wrapper::vloadq(matrix_b + 2 * in_b_stride); + const auto b3_u8 = wrapper::vloadq(matrix_b + 3 * in_b_stride); + +#if __arm__ + asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 1 * in_b_stride)); + asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 2 * in_b_stride)); + asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 3 * in_b_stride)); + asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 4 * in_b_stride)); +#endif /* __arm__ */ + + // Partial accumulation in 16bit + typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type tmp_sum[2] = + { + wrapper::vdup_n(static_cast<TIAcc>(0), wrapper::traits::vector_128_tag{}), + wrapper::vdup_n(static_cast<TIAcc>(0), wrapper::traits::vector_128_tag{}) + }; + + tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b1_u8)); + tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b0_u8)); + tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b2_u8)); + tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b3_u8)); + tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b0_u8)); + tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b1_u8)); + tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b2_u8)); + tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b3_u8)); + + // Accumulate to 32bit + sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(tmp_sum[0])); + sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(tmp_sum[0])); + sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(tmp_sum[1])); + sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(tmp_sum[1])); + + matrix_b += 4 * in_b_stride; + } + + // This for loop perfoms the leftover accumulations + for(; i < _k; ++i) + { + const auto b0_b8 = wrapper::vloadq(matrix_b + 0 * in_b_stride); + + // Convert S8 to S16 + const typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type b0_b16[2] + { + wrapper::vmovl(wrapper::vgetlow(b0_b8)), + wrapper::vmovl(wrapper::vgethigh(b0_b8)) + }; + + // Accumulate to 32bit + sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(b0_b16[0])); + sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(b0_b16[0])); + sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(b0_b16[1])); + sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(b0_b16[1])); + + matrix_b += in_b_stride; + } + + // Multiply by scalar if necessary + if(_mul_by_scalar) + { + sum_col[0] = wrapper::vmul(sum_col[0], vec_scalar); + sum_col[1] = wrapper::vmul(sum_col[1], vec_scalar); + sum_col[2] = wrapper::vmul(sum_col[2], vec_scalar); + sum_col[3] = wrapper::vmul(sum_col[3], vec_scalar); + } + + auto vector_sum_col = reinterpret_cast<int32_t *>(out.ptr()); + if(id.x() + 16 < width_matrix_b) + { + wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret(sum_col[0])); + wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret(sum_col[1])); + wrapper::vstore(vector_sum_col + 8, wrapper::vreinterpret(sum_col[2])); + wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret(sum_col[3])); + } + else + { + auto left_over = width_matrix_b - id.x(); + for(auto k = 0; k < 4 && left_over; ++k) + { + for(auto j = 0; j < 4 && left_over; ++j, --left_over) + { + *(vector_sum_col + k * 4 + j) = sum_col[k][j]; + } + } + } + }, + inb, out); +} + +void CpuGemmLowpMatrixBReductionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + auto src = tensors.get_const_tensor(TensorType::ACL_SRC); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + (this->*_func)(src, dst, window, info); +} + +const char *CpuGemmLowpMatrixBReductionKernel::name() const +{ + return "CpuGemmLowpMatrixBReductionKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute
\ No newline at end of file diff --git a/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h b/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h new file mode 100644 index 0000000000..918f8c89d9 --- /dev/null +++ b/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_GEMMLOWP_REDUCTION_KERNEL_H +#define ARM_COMPUTE_CPU_GEMMLOWP_REDUCTION_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +// Forward declarations +struct GEMMLowpReductionKernelInfo; +namespace cpu +{ +namespace kernels +{ +/** Kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A. + * + * @note This stage is needed to handle the offset of matrix product + * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md + */ +class CpuGemmLowpMatrixAReductionKernel : public ICpuKernel +{ +public: + /** Default constructor */ + CpuGemmLowpMatrixAReductionKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpMatrixAReductionKernel); + /** Initialise the kernel's input and output. + * + * @param[in] src Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL + * @param[out] dst Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32 + * @param[in] info Kernel metadata: + * - k (num_mtx_a_cols) Number of matrix A columns + * - is_reshaped (is_interleaved4x4) True if the matrix A has been interleaved4x4 + * - scalar Scalar value to multiply each reduced row by. + * - mul_byscalar True if each reduced column must be multiplied by a scalar value. + */ + void configure(const ITensorInfo *src, ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuGemmLowpMatrixAReductionKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + /** Execution of the reduction kernel specialized on the input type + * + * @param[in] src Input tensor + * @param[in] dst Output tensor + * @param[in] window Execution window + */ + template <typename T> + void run_internal(const ITensor *src, ITensor *dst, const Window &window); + + /** Common signature for all reduction functions + * + * @param[in] src Input tensor + * @param[out] dst Output tensor + * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()). + */ + using CpuGemmLowpMatrixAReductionKernelPtr = void (CpuGemmLowpMatrixAReductionKernel::*)(const ITensor *src, ITensor *dst, const Window &window); + + CpuGemmLowpMatrixAReductionKernelPtr _func{ nullptr }; + int32_t _k{ 0 }; + int32_t _scalar{ 0 }; + bool _mul_by_scalar{ false }; +}; + +/** Kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B. + * + * @note This stage is needed to handle the offset of matrix product + * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md + */ +class CpuGemmLowpMatrixBReductionKernel : public ICpuKernel +{ +public: + /** Default constructor */ + CpuGemmLowpMatrixBReductionKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpMatrixBReductionKernel); + /** Initialise the kernel's input and output. + * + * @param[in] src Input tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL + * @param[out] dst Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32 + * @param[in] info Kernel metadata: + * - k (num_mtx_b_rows) Number of matrix B rows. + * - is_reshaped (is_transposed1xW) True if the input tensor is transposed 1xW. + * - scalar Scalar value to multiply each reduced row by. + * - mul_byscalar True if each reduced row must be multiplied by a scalar value. + */ + void configure(const ITensorInfo *src, ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuGemmLowpMatrixBReductionKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + /** Execution of the reduction kernel specialized on the input type + * + * @param[in] src Input tensor + * @param[in] dst Output tensor + * @param[in] window Execution window + * @param[in] info Thread-related information + */ + template <typename T> + void run_internal(const ITensor *src, ITensor *dst, const Window &window, const ThreadInfo &info); + + /** Common signature for all reduction functions + * + * @param[in] src Input tensor + * @param[out] dst Output tensor + * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()). + */ + using CpuGemmLowpMatrixBReductionKernelPtr = void (CpuGemmLowpMatrixBReductionKernel::*)(const ITensor *src, ITensor *dst, const Window &window, const ThreadInfo &info); + + CpuGemmLowpMatrixBReductionKernelPtr _func{ nullptr }; + int32_t _k{ 0 }; + int32_t _scalar{ 0 }; + bool _mul_by_scalar{ false }; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_GEMMLOWP_REDUCTION_KERNEL_H */ diff --git a/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp b/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp new file mode 100644 index 0000000000..a9896772f6 --- /dev/null +++ b/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp @@ -0,0 +1,417 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, + int32_t a_offset, int32_t b_offset) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); + + // If a_offset == 0, vector_sum_col can be a nullptr + if(a_offset != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0)); + } + + // If b_offset == 0, vector_sum_row can be a nullptr + if(b_offset != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); + + // Check if input is a 3D reinterpretation + const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); + + // Validate input + ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2))); + ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1)); + + TensorShape output_shape = mm_result->tensor_shape(); + if(output_shape.num_dimensions() > 1) + { + const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2; + + TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape(); + vector_sum_row_shape.collapse_from(1); + output_shape.collapse_from(output_batch_idx); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx], + "mm_result tensor must have the same number of batches of output tensor"); + + if(a_offset != 0) + { + TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); + vector_sum_col_shape.collapse_from(1); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1], + "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); + } + } + } + + return Status{}; +} + +void run_offset_contribution(const Window &window, + ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, + int32_t a_offset, int32_t b_offset, int32_t k_offset, bool slide_vector_sum_col, bool is_gemm3d) +{ + Window collapsed_window = window.collapse_if_possible(window, Window::DimZ); + collapsed_window.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const int height_input = is_gemm3d ? mm_result->info()->dimension(1) : 0; + const int depth_input = is_gemm3d ? mm_result->info()->dimension(2) : 1; + + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_step_x = 16; + + Iterator mm_result_it(mm_result, collapsed_window); + + if((a_offset != 0) && (b_offset != 0) && (vector_sum_col != nullptr) && (vector_sum_row != nullptr)) // true, true + { + // Set window for vector_sum_col + Window win_vector_sum_col(collapsed_window); + win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0)); + win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + // Set window for vector_sum_row + Window win_vector_sum_row(collapsed_window); + win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0)); + win_vector_sum_row.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + Iterator vector_sum_col_it(vector_sum_col, win_vector_sum_col); + Iterator vector_sum_row_it(vector_sum_row, win_vector_sum_row); + + const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y(); + + // Offset in case vector_sum_col is batched + const int vector_sum_col_batch_offset = slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0; + + execute_window_loop(collapsed_window, [&](const Coordinates & id) + { + const int batch_id = id.z() / depth_input; + auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); + auto mm_result_ptr = reinterpret_cast<int32_t *>(mm_result_it.ptr()); + + // Compute the leftover term due to b_offset. + int32_t b_offset_term_s32 = *(reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y() + (id.z() % depth_input) * height_input); + b_offset_term_s32 *= b_offset; + + const int32x4_t b_offset_term_s32_vec = vdupq_n_s32(b_offset_term_s32); + + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Compute the leftover term due to a_offset. + int32x4x4_t a_offset_term_s32 = + { + { + vld1q_s32(vector_sum_col_ptr + x + 0), + vld1q_s32(vector_sum_col_ptr + x + 4), + vld1q_s32(vector_sum_col_ptr + x + 8), + vld1q_s32(vector_sum_col_ptr + x + 12) + } + }; + + a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset); + a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset); + a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset); + a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset); + + // Add a_offset_term_s32 and b_offset_term_s32 + int32x4x4_t offset_term_s32 = + { + { + vdupq_n_s32(k_offset), + vdupq_n_s32(k_offset), + vdupq_n_s32(k_offset), + vdupq_n_s32(k_offset) + } + }; + + offset_term_s32.val[0] = vaddq_s32(offset_term_s32.val[0], vaddq_s32(a_offset_term_s32.val[0], b_offset_term_s32_vec)); + offset_term_s32.val[1] = vaddq_s32(offset_term_s32.val[1], vaddq_s32(a_offset_term_s32.val[1], b_offset_term_s32_vec)); + offset_term_s32.val[2] = vaddq_s32(offset_term_s32.val[2], vaddq_s32(a_offset_term_s32.val[2], b_offset_term_s32_vec)); + offset_term_s32.val[3] = vaddq_s32(offset_term_s32.val[3], vaddq_s32(a_offset_term_s32.val[3], b_offset_term_s32_vec)); + + int32x4x4_t in_s32 = + { + { + vld1q_s32(mm_result_ptr + x + 0), + vld1q_s32(mm_result_ptr + x + 4), + vld1q_s32(mm_result_ptr + x + 8), + vld1q_s32(mm_result_ptr + x + 12) + } + }; + + // Add the offset terms to GEMM's result + in_s32.val[0] = vaddq_s32(in_s32.val[0], offset_term_s32.val[0]); + in_s32.val[1] = vaddq_s32(in_s32.val[1], offset_term_s32.val[1]); + in_s32.val[2] = vaddq_s32(in_s32.val[2], offset_term_s32.val[2]); + in_s32.val[3] = vaddq_s32(in_s32.val[3], offset_term_s32.val[3]); + + // Store the result with the offset contribution + vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]); + vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]); + vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]); + vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]); + } + + // Left-overs loop + for(; x < window_end_x; ++x) + { + // Compute the leftover term due to a_offset. + int32_t a_offset_term_s32 = *(vector_sum_col_ptr + x); + + a_offset_term_s32 *= a_offset; + + // Add the offset terms to GEMM's result + // Store the result with the offset contribution + mm_result_ptr[x] += k_offset + a_offset_term_s32 + b_offset_term_s32; + } + }, + vector_sum_col_it, vector_sum_row_it, mm_result_it); + } + else if((a_offset == 0) && (b_offset != 0) && (vector_sum_row != nullptr)) // false, true + { + ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_row); + + // Set window for vector_sum_row + Window win_vector_sum_row(collapsed_window); + win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0)); + win_vector_sum_row.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + Iterator vector_sum_row_it(vector_sum_row, win_vector_sum_row); + + const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y(); + + execute_window_loop(collapsed_window, [&](const Coordinates & id) + { + const int batch_id = id.z() / depth_input; + auto mm_result_ptr = reinterpret_cast<int32_t *>(mm_result_it.ptr()); + + // Compute the leftover term due to b_offset. + int32_t b_offset_term_s32 = *(reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y() + (id.z() % depth_input) * height_input); + b_offset_term_s32 *= b_offset; + + const int32x4_t b_offset_term_s32_vec = vdupq_n_s32(b_offset_term_s32); + + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + int32x4x4_t in_s32 = + { + { + vld1q_s32(mm_result_ptr + x + 0), + vld1q_s32(mm_result_ptr + x + 4), + vld1q_s32(mm_result_ptr + x + 8), + vld1q_s32(mm_result_ptr + x + 12) + } + }; + + // Add the offset terms to GEMM's result + in_s32.val[0] = vaddq_s32(in_s32.val[0], b_offset_term_s32_vec); + in_s32.val[1] = vaddq_s32(in_s32.val[1], b_offset_term_s32_vec); + in_s32.val[2] = vaddq_s32(in_s32.val[2], b_offset_term_s32_vec); + in_s32.val[3] = vaddq_s32(in_s32.val[3], b_offset_term_s32_vec); + + // Store the result with the offset contribution + vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]); + vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]); + vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]); + vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]); + } + + // Left-overs loop + for(; x < window_end_x; ++x) + { + // Add the offset terms to GEMM's result + // Store the result with the offset contribution + mm_result_ptr[x] += b_offset_term_s32; + } + }, + vector_sum_row_it, mm_result_it); + } + else if((a_offset != 0) && (b_offset == 0) && (vector_sum_col != nullptr)) // true, false + { + // Set window for vector_sum_col + Window win_vector_sum_col(collapsed_window); + win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0)); + win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + Iterator vector_sum_col_it(vector_sum_col, win_vector_sum_col); + + // Offset in case vector_sum_col is batched + const int vector_sum_col_batch_offset = slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0; + + execute_window_loop(collapsed_window, [&](const Coordinates & id) + { + const int batch_id = id.z() / depth_input; + auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); + auto mm_result_ptr = reinterpret_cast<int32_t *>(mm_result_it.ptr()); + + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Compute the leftover term due to a_offset. + int32x4x4_t a_offset_term_s32 = + { + { + vld1q_s32(vector_sum_col_ptr + x + 0), + vld1q_s32(vector_sum_col_ptr + x + 4), + vld1q_s32(vector_sum_col_ptr + x + 8), + vld1q_s32(vector_sum_col_ptr + x + 12) + } + }; + + a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset); + a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset); + a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset); + a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset); + + int32x4x4_t in_s32 = + { + { + vld1q_s32(mm_result_ptr + x + 0), + vld1q_s32(mm_result_ptr + x + 4), + vld1q_s32(mm_result_ptr + x + 8), + vld1q_s32(mm_result_ptr + x + 12) + } + }; + + // Add the offset terms to GEMM's result + in_s32.val[0] = vaddq_s32(in_s32.val[0], a_offset_term_s32.val[0]); + in_s32.val[1] = vaddq_s32(in_s32.val[1], a_offset_term_s32.val[1]); + in_s32.val[2] = vaddq_s32(in_s32.val[2], a_offset_term_s32.val[2]); + in_s32.val[3] = vaddq_s32(in_s32.val[3], a_offset_term_s32.val[3]); + + // Store the result with the offset contribution + vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]); + vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]); + vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]); + vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]); + } + + // Left-overs loop + for(; x < window_end_x; ++x) + { + // Compute the leftover term due to a_offset. + const int32_t a_offset_term_s32 = *(vector_sum_col_ptr + x); + + // Add the offset terms to GEMM's result + // Store the result with the offset contribution + mm_result_ptr[x] += a_offset_term_s32 * a_offset; + } + }, + vector_sum_col_it, mm_result_it); + } + else // false, false + { + // No offset contribution from matrix A and matrix B + return; + } +} +} // namespace + +void CpuGemmLowpOffsetContributionKernel::configure(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset) +{ + // Perform validate step + ARM_COMPUTE_UNUSED(vector_sum_row); + ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result, vector_sum_col, vector_sum_row, a_offset, b_offset)); + + _a_offset = a_offset; + _b_offset = b_offset; + _k_offset = a_offset * b_offset * k; + + // If a_offset == 0, vector_sum_col can be a nullptr + if(a_offset != 0) + { + // Check if vector_sum_col_shape should be slidden or not + // Don't slide vector_sum_col_shape along the y dimension if vector_sum_col_shape has just 1 dimension and vector_sum_row_shape more than 1 + // This scenario can happen when the the matrix multiplication is used to perform a convolution operation + _slide_vector_sum_col = vector_sum_col->tensor_shape().num_dimensions() > 1; + } + + // Configure kernel window + Window win = calculate_max_window(*mm_result, Steps()); + ICpuKernel::configure(win); +} + +Status CpuGemmLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, + int32_t a_offset, int32_t b_offset) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, a_offset, b_offset)); + return Status{}; +} + +void CpuGemmLowpOffsetContributionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + auto vector_sum_col = tensors.get_const_tensor(TensorType::ACL_SRC_0); + auto vector_sum_row = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto mm_result = tensors.get_tensor(TensorType::ACL_DST); + + // Check if input is a 3D reinterpretation + const bool reinterpret_as_3d = vector_sum_row != nullptr + && mm_result->info()->num_dimensions() > 1 + && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x(); + + run_offset_contribution(window, mm_result, vector_sum_col, vector_sum_row, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, reinterpret_as_3d); +} + +const char *CpuGemmLowpOffsetContributionKernel::name() const +{ + return "CpuGemmLowpOffsetContributionKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute
\ No newline at end of file diff --git a/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h b/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h new file mode 100644 index 0000000000..1ec969be92 --- /dev/null +++ b/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_GEMMLOWP_OFFSETCONTRIBUTION_KERNEL_H +#define ARM_COMPUTE_CPU_GEMMLOWP_OFFSETCONTRIBUTION_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Kernel used to add the offset contribution after @ref CpuGemmLowpMatrixMultiplyKernel. The computation is performed in-place + * + * This kernel takes a final int32 accumulator value (the output of @ref CpuGemmLowpMatrixMultiplyKernel), + * and adds to it the offset contribution of matrix A and matrix B in-place. + * + * The final result is: + * + * mm_result[i][k] = mm_result[i][k] + + * (vector_sum_col[k] * a_offset) + + * (vector_sum_row[i] * b_offset) + + * (a_offset * b_offset * k) + * + */ +class CpuGemmLowpOffsetContributionKernel : public ICpuKernel +{ +public: + /** Default constructor */ + CpuGemmLowpOffsetContributionKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpOffsetContributionKernel); + /** Initialise the kernel's input and output. + * + * @param[in, out] mm_result Input tensor containing the result of @ref CpuGemmLowpMatrixMultiplyKernel. Data type supported: S32 + * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B. + * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result + * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A. + * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result + * @param[in] k Number of matrix A columns or Matrix B rows + * @param[in] a_offset Offset to be added to each element of the matrix A. + * @param[in] b_offset Offset to be added to each element of the matrix B. + */ + void configure(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuGemmLowpOffsetContributionKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, int32_t a_offset, int32_t b_offset); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + int32_t _a_offset{ 0 }; + int32_t _b_offset{ 0 }; + int32_t _k_offset{ 0 }; + bool _slide_vector_sum_col{ true }; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_GEMMLOWP_OFFSETCONTRIBUTION_KERNEL_H */ diff --git a/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp b/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp new file mode 100644 index 0000000000..89aa36486c --- /dev/null +++ b/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp @@ -0,0 +1,946 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +inline int32x4x4_t load_results_input(const Iterator &mm_result_it, int32_t x) +{ + return + { + { + vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 0), + vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 4), + vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 8), + vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 12) + } + }; +} + +inline int32x4x4_t load(const int32_t *ptr, int32_t x) +{ + return + { + { + vld1q_s32(ptr + x + 0), + vld1q_s32(ptr + x + 4), + vld1q_s32(ptr + x + 8), + vld1q_s32(ptr + x + 12) + } + }; +} + +inline int32x4x4_t add_s32(int32x4x4_t a, int32x4_t b) +{ + return + { + { + vaddq_s32(a.val[0], b), + vaddq_s32(a.val[1], b), + vaddq_s32(a.val[2], b), + vaddq_s32(a.val[3], b) + } + }; +} + +inline int32x4x4_t add_s32(int32x4x4_t a, int32x4x4_t b) +{ + return + { + { + vaddq_s32(a.val[0], b.val[0]), + vaddq_s32(a.val[1], b.val[1]), + vaddq_s32(a.val[2], b.val[2]), + vaddq_s32(a.val[3], b.val[3]) + } + }; +} + +inline int32x4x4_t mul_s32(int32x4x4_t &a, int32_t mul_scalar) +{ + return + { + { + vmulq_n_s32(a.val[0], mul_scalar), + vmulq_n_s32(a.val[1], mul_scalar), + vmulq_n_s32(a.val[2], mul_scalar), + vmulq_n_s32(a.val[3], mul_scalar) + } + }; +} + +inline int32x4x4_t mul_s32(int32x4x4_t &a, const int32_t *multilpier) +{ + return + { + { + vmulq_s32(a.val[0], vld1q_s32(multilpier)), + vmulq_s32(a.val[1], vld1q_s32(multilpier + 4)), + vmulq_s32(a.val[2], vld1q_s32(multilpier + 8)), + vmulq_s32(a.val[3], vld1q_s32(multilpier + 12)) + } + }; +} + +inline int32x4x4_t get_a_offset(const int32_t *vector_sum_col_ptr, int32_t a_offset, int32_t x) +{ + int32x4x4_t a_offset_term_s32 = load(vector_sum_col_ptr, x); + + a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset); + a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset); + a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset); + a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset); + return a_offset_term_s32; +} + +inline int32x4_t get_b_offset(const int32_t *vector_sum_row_ptr, int32_t b_offset) +{ + int32x4_t b_offset_term_s32 = vld1q_dup_s32(vector_sum_row_ptr); + b_offset_term_s32 = vmulq_n_s32(b_offset_term_s32, b_offset); + return b_offset_term_s32; +} + +inline int32x4x4_t get_k_offset(int32_t k_offset) +{ + return + { + { + vdupq_n_s32(k_offset), + vdupq_n_s32(k_offset), + vdupq_n_s32(k_offset), + vdupq_n_s32(k_offset) + } + }; +} + +inline uint8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4_t result_shift_s32, uint8x16_t min_u8, uint8x16_t max_u8, bool is_bounded_relu) +{ + const static int32x4_t zero_s32 = vdupq_n_s32(0); + + // Shift final result (negative value shift right) + in_s32.val[0] = vshlq_s32(in_s32.val[0], result_shift_s32); + in_s32.val[1] = vshlq_s32(in_s32.val[1], result_shift_s32); + in_s32.val[2] = vshlq_s32(in_s32.val[2], result_shift_s32); + in_s32.val[3] = vshlq_s32(in_s32.val[3], result_shift_s32); + + // Saturate negative values + in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32); + in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32); + in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32); + in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32); + + // Convert S32 to S16 + const int16x8x2_t in_s16 = + { + { + vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), + vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) + } + }; + + // Convert S16 to U8 + uint8x16_t out_u8 = vcombine_u8(vqmovun_s16(in_s16.val[0]), vqmovun_s16(in_s16.val[1])); + + if(is_bounded_relu) + { + out_u8 = vmaxq_u8(out_u8, min_u8); + out_u8 = vminq_u8(out_u8, max_u8); + } + + return out_u8; +} + +inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8, bool is_bounded_relu) +{ + const static int32x4_t zero_s32 = vdupq_n_s32(0); + + // Shift final result (negative value shift right) + in_s32.val[0] = vshlq_s32(in_s32.val[0], result_shift_s32); + in_s32.val[1] = vshlq_s32(in_s32.val[1], result_shift_s32); + in_s32.val[2] = vshlq_s32(in_s32.val[2], result_shift_s32); + in_s32.val[3] = vshlq_s32(in_s32.val[3], result_shift_s32); + + // Saturate negative values + in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32); + in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32); + in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32); + in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32); + + // Convert S32 to S16 + const int16x8x2_t in_s16 = + { + { + vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), + vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) + } + }; + + // Convert S16 to S8 + int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1])); + + if(is_bounded_relu) + { + out_s8 = vmaxq_s8(out_s8, min_s8); + out_s8 = vminq_s8(out_s8, max_s8); + } + + return out_s8; +} + +inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8, bool is_bounded_relu) +{ + const static int32x4_t zero_s32 = vdupq_n_s32(0); + + // Shift final result (negative value shift right) + in_s32.val[0] = vshlq_s32(in_s32.val[0], vnegq_s32(result_shift_s32.val[0])); + in_s32.val[1] = vshlq_s32(in_s32.val[1], vnegq_s32(result_shift_s32.val[1])); + in_s32.val[2] = vshlq_s32(in_s32.val[2], vnegq_s32(result_shift_s32.val[2])); + in_s32.val[3] = vshlq_s32(in_s32.val[3], vnegq_s32(result_shift_s32.val[3])); + + // Saturate negative values + in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32); + in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32); + in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32); + in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32); + + // Convert S32 to S16 + const int16x8x2_t in_s16 = + { + { + vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), + vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) + } + }; + + // Convert S16 to S8 + int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1])); + + if(is_bounded_relu) + { + out_s8 = vmaxq_s8(out_s8, min_s8); + out_s8 = vminq_s8(out_s8, max_s8); + } + + return out_s8; +} + +template <typename T> +struct VectorTyper +{ + using stype = T; + using vtype = typename wrapper::traits::neon_bitvector_t<T, wrapper::traits::BitWidth::W128>; +}; + +inline Window get_win_vector_sum(const Window &window) +{ + Window win_vector_sum(window); + win_vector_sum.set(Window::DimY, Window::Dimension(0, 0, 0)); + win_vector_sum.set(Window::DimZ, Window::Dimension(0, 0, 0)); + return win_vector_sum; +} + +inline Iterator get_vector_sum_col_it(const Window &window, const ITensor *vector_sum_col) +{ + Iterator vector_sum_col_it(vector_sum_col, get_win_vector_sum(window)); + return vector_sum_col_it; +} + +inline Iterator get_vector_sum_row_it(const Window &window, const ITensor *vector_sum_row) +{ + Window win_vector_sum_row = get_win_vector_sum(window); + win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0)); + Iterator vector_sum_row_it(vector_sum_row, win_vector_sum_row); + return vector_sum_row_it; +} + +inline Iterator get_bias_it(const Window &window, const ITensor *bias) +{ + Window win_bias(window); + win_bias.set(Window::DimY, Window::Dimension(0, 1, 1)); + win_bias.set(Window::DimZ, Window::Dimension(0, 1, 1)); + Iterator bias_it(bias, win_bias); + return bias_it; +} + +template <typename VT> +inline void run_offset_contribution_output_stage_window(const int32_t *vector_sum_col_ptr, const int32_t *vector_sum_row_ptr, const int32_t *bias_ptr, Iterator mm_result_it, Iterator out_it, + const int32x4_t result_offset_s32, const int32x4_t result_shift_s32, + typename VT::vtype min_vec, typename VT::vtype max_vec, + int32_t a_offset, int32_t b_offset, int32_t k_offset, + int32_t multiplier, int32_t shift, int32_t offset, int32_t min_bound, int32_t max_bound, + int window_step_x, int window_start_x, int window_end_x, bool has_a_offset, bool has_b_offset, bool has_bias, bool is_bounded_relu, bool is_fixed_point) +{ + int32x4x4_t offset_term_s32 = { 0, 0, 0, 0 }; + if(!is_fixed_point) + { + // Combine quantization offset with other offsets. + offset_term_s32 = add_s32(offset_term_s32, result_offset_s32); + } + if(has_a_offset && has_b_offset) + { + offset_term_s32 = add_s32(offset_term_s32, get_k_offset(k_offset)); + } + if(has_b_offset) + { + offset_term_s32 = add_s32(offset_term_s32, get_b_offset(vector_sum_row_ptr, b_offset)); + } + + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + int32x4x4_t in_s32 = load_results_input(mm_result_it, x); + + if(has_a_offset) + { + in_s32 = add_s32(in_s32, get_a_offset(vector_sum_col_ptr, a_offset, x)); + } + if(has_bias) + { + in_s32 = add_s32(in_s32, load(bias_ptr, x)); + } + if(!is_fixed_point || has_b_offset) + { + in_s32 = add_s32(in_s32, offset_term_s32); + } + if(!is_fixed_point) + { + in_s32 = mul_s32(in_s32, multiplier); + } + + if(is_fixed_point) + { + wrapper::vstore(reinterpret_cast<typename VT::stype *>(out_it.ptr() + x), + finalize_quantization(in_s32, multiplier, shift, result_offset_s32, min_vec, max_vec, is_bounded_relu)); + } + else + { + wrapper::vstore(reinterpret_cast<typename VT::stype *>(out_it.ptr() + x), + finalize_quantization_floating_point(in_s32, result_shift_s32, min_vec, max_vec, is_bounded_relu)); + } + } + // Compute left-over elements + for(; x < window_end_x; ++x) + { + int32_t in_value = *(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x) + wrapper::vgetlane(offset_term_s32.val[0], 0); + + if(has_a_offset) + { + in_value += (*(vector_sum_col_ptr + x) * a_offset); + } + if(has_bias) + { + in_value += *(bias_ptr + x); + } + + if(is_fixed_point) + { + // Finalize and store the result + *reinterpret_cast<typename VT::stype *>(out_it.ptr() + x) = finalize_quantization(in_value, multiplier, shift, offset, + static_cast<typename VT::stype>(min_bound), + static_cast<typename VT::stype>(max_bound), is_bounded_relu); + } + else + { + // Finalize quantization + in_value = (in_value * multiplier) >> shift; + + // Bound and store the result + if(is_bounded_relu) + { + in_value = static_cast<typename VT::stype>(std::max<int32_t>(min_bound, std::min<int32_t>(max_bound, in_value))); + } + *reinterpret_cast<typename VT::stype *>(out_it.ptr() + x) = static_cast<typename VT::stype>(std::max<int32_t>(static_cast<int32_t>(std::numeric_limits<typename VT::stype>::lowest()), + std::min<int32_t>(static_cast<int32_t>(std::numeric_limits<typename VT::stype>::max()), in_value))); + } + } +} + +inline void run_offset_contribution_output_stage_window_symm(const int32_t *vector_sum_col_ptr, const int32_t *bias_ptr, Iterator mm_result_it, Iterator out_it, + const int32_t *result_multipliers, const int32_t *result_shifts, + const int32x4_t result_offset, int8x16_t min_s8, int8x16_t max_s8, + int32_t a_offset, int32_t offset, int32_t min_bound, int32_t max_bound, + int window_step_x, int window_start_x, int window_end_x, bool has_a_offset, bool has_bias, bool is_bounded_relu, bool is_fixed_point) +{ + int32x4x4_t offset_term_s32 = { 0, 0, 0, 0 }; + if(!is_fixed_point) + { + // Combine quantization offset with other offsets. + offset_term_s32 = add_s32(offset_term_s32, result_offset); + } + + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + int32x4x4_t in_s32 = load_results_input(mm_result_it, x); + + if(has_a_offset) + { + in_s32 = add_s32(in_s32, get_a_offset(vector_sum_col_ptr, a_offset, x)); + } + if(has_bias) + { + in_s32 = add_s32(in_s32, load(bias_ptr, x)); + } + if(!is_fixed_point) + { + in_s32 = add_s32(in_s32, offset_term_s32); + in_s32 = mul_s32(in_s32, result_multipliers + x); + } + + if(is_fixed_point) + { + vst1q_s8(reinterpret_cast<int8_t *>(out_it.ptr() + x), finalize_quantization_symm(in_s32, load(result_multipliers, x), load(result_shifts, x), result_offset, min_s8, max_s8, is_bounded_relu)); + } + else + { + vst1q_s8(reinterpret_cast<int8_t *>(out_it.ptr() + x), finalize_quantization_floating_point(in_s32, load(result_shifts, x), min_s8, max_s8, is_bounded_relu)); + } + } + // Compute left-over elements + for(; x < window_end_x; ++x) + { + int32_t in_value = *(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x) + wrapper::vgetlane(offset_term_s32.val[0], 0); + + if(has_a_offset) + { + in_value += (*(vector_sum_col_ptr + x) * a_offset); + } + if(has_bias) + { + in_value += *(bias_ptr + x); + } + + if(is_fixed_point) + { + // Finalize and store the result + *(out_it.ptr() + x) = finalize_quantization(in_value, result_multipliers[x], result_shifts[x], offset, static_cast<int8_t>(min_bound), static_cast<int8_t>(max_bound), is_bounded_relu); + } + else + { + // Finalize quantization + in_value = (in_value * result_multipliers[x]) >> (-result_shifts[x]); + + // Bound and store the result + if(is_bounded_relu) + { + in_value = static_cast<int8_t>(std::max<int32_t>(min_bound, std::min<int32_t>(max_bound, in_value))); + } + *(out_it.ptr() + x) = static_cast<int8_t>(std::max<int32_t>(-128, std::min<int32_t>(127, in_value))); + } + } +} + +template <typename T> +void run_offset_contribution_output_stage(const Window &window, + const ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *bias, ITensor *output, + int32_t a_offset, int32_t b_offset, int32_t k_offset, bool slide_vector_sum_col, + GEMMLowpOutputStageInfo output_stage, bool is_gemm3d, bool is_bounded_relu, bool is_fixed_point) +{ + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; + using Typer = VectorTyper<T>; + + const int height_input = is_gemm3d ? mm_result->info()->dimension(1) : 0; + const int depth_input = is_gemm3d ? mm_result->info()->dimension(2) : 1; + + const int32_t multiplier = output_stage.gemmlowp_multiplier; + const int32_t shift = output_stage.gemmlowp_shift; + const int32_t offset = output_stage.gemmlowp_offset; + const int32_t min_bound = output_stage.gemmlowp_min_bound; + const int32_t max_bound = output_stage.gemmlowp_max_bound; + + const int32x4_t result_offset_s32 = vdupq_n_s32(offset); + const int32x4_t result_shift_s32 = vdupq_n_s32(is_fixed_point ? shift : -shift); + const auto min_vec = wrapper::vdup_n(static_cast<T>(min_bound), ExactTagType{}); + const auto max_vec = wrapper::vdup_n(static_cast<T>(max_bound), ExactTagType{}); + + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + Window win(window); + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Window collapsed_window = win.collapse_if_possible(win, Window::DimZ); + + Iterator mm_result_it(mm_result, win); + Iterator out_it(output, win); + + if((a_offset != 0) && (b_offset != 0)) + { + ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_col); + ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_row); + + Iterator vector_sum_col_it = get_vector_sum_col_it(collapsed_window, vector_sum_col); + Iterator vector_sum_row_it = get_vector_sum_row_it(collapsed_window, vector_sum_row); + + const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y(); + + // Offset in case vector_sum_col is batched + const int vector_sum_col_batch_offset = slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0; + + if(bias != nullptr) + { + Iterator bias_it = get_bias_it(collapsed_window, bias); + execute_window_loop(collapsed_window, [&](const Coordinates & id) + { + const int batch_id = id.z() / depth_input; + const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); + const auto vector_sum_row_ptr = reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + + id.y() + (id.z() % depth_input) * height_input; + run_offset_contribution_output_stage_window<Typer>(vector_sum_col_ptr, vector_sum_row_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), + mm_result_it, + out_it, + result_offset_s32, result_shift_s32, + min_vec, max_vec, a_offset, b_offset, k_offset, + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x, true, true, true, is_bounded_relu, is_fixed_point); + }, + vector_sum_col_it, vector_sum_row_it, bias_it, mm_result_it, out_it); + } + else + { + execute_window_loop(collapsed_window, [&](const Coordinates & id) + { + const int batch_id = id.z() / depth_input; + const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); + const auto vector_sum_row_ptr = reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + + id.y() + (id.z() % depth_input) * height_input; + run_offset_contribution_output_stage_window<Typer>(vector_sum_col_ptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it, + result_offset_s32, result_shift_s32, + min_vec, max_vec, a_offset, b_offset, k_offset, + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x, true, true, false, is_bounded_relu, is_fixed_point); + }, + vector_sum_col_it, vector_sum_row_it, mm_result_it, out_it); + } + } + else if((a_offset == 0) && (b_offset != 0)) + { + ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_row); + + Iterator vector_sum_row_it = get_vector_sum_row_it(collapsed_window, vector_sum_row); + + const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y(); + + if(bias != nullptr) + { + Iterator bias_it = get_bias_it(collapsed_window, bias); + execute_window_loop(collapsed_window, [&](const Coordinates & id) + { + const int batch_id = id.z() / depth_input; + const auto vector_sum_row_ptr = reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + + id.y() + (id.z() % depth_input) * height_input; + run_offset_contribution_output_stage_window<Typer>(nullptr, vector_sum_row_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, + out_it, + result_offset_s32, result_shift_s32, + min_vec, max_vec, a_offset, b_offset, k_offset, + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x, false, true, true, is_bounded_relu, is_fixed_point); + }, + vector_sum_row_it, bias_it, mm_result_it, out_it); + } + else + { + execute_window_loop(collapsed_window, [&](const Coordinates & id) + { + const int batch_id = id.z() / depth_input; + const auto vector_sum_row_ptr = reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + + id.y() + (id.z() % depth_input) * height_input; + run_offset_contribution_output_stage_window<Typer>(nullptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it, + result_offset_s32, result_shift_s32, + min_vec, max_vec, a_offset, b_offset, k_offset, + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x, false, true, false, is_bounded_relu, is_fixed_point); + }, + vector_sum_row_it, mm_result_it, out_it); + } + } + else if((a_offset != 0) && (b_offset == 0)) + { + ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_col); + + Iterator vector_sum_col_it = get_vector_sum_col_it(collapsed_window, vector_sum_col); + + // Offset in case vector_sum_col is batched + const int vector_sum_col_batch_offset = slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0; + + if(bias != nullptr) + { + Iterator bias_it = get_bias_it(collapsed_window, bias); + execute_window_loop(collapsed_window, [&](const Coordinates & id) + { + const int batch_id = id.z() / depth_input; + const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); + run_offset_contribution_output_stage_window<Typer>(vector_sum_col_ptr, nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, + out_it, + result_offset_s32, result_shift_s32, + min_vec, max_vec, a_offset, b_offset, k_offset, + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x, true, false, true, is_bounded_relu, is_fixed_point); + }, + vector_sum_col_it, bias_it, mm_result_it, out_it); + } + else + { + execute_window_loop(collapsed_window, [&](const Coordinates & id) + { + const int batch_id = id.z() / depth_input; + const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); + run_offset_contribution_output_stage_window<Typer>(vector_sum_col_ptr, nullptr, nullptr, mm_result_it, out_it, + result_offset_s32, result_shift_s32, + min_vec, max_vec, a_offset, b_offset, k_offset, + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x, true, false, false, is_bounded_relu, is_fixed_point); + }, + vector_sum_col_it, mm_result_it, out_it); + } + } + else + { + if(bias != nullptr) + { + Iterator bias_it = get_bias_it(collapsed_window, bias); + execute_window_loop(collapsed_window, [&](const Coordinates &) + { + run_offset_contribution_output_stage_window<Typer>(nullptr, nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it, + result_offset_s32, result_shift_s32, + min_vec, max_vec, a_offset, b_offset, k_offset, + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x, false, false, true, is_bounded_relu, is_fixed_point); + }, + bias_it, mm_result_it, out_it); + } + else + { + execute_window_loop(collapsed_window, [&](const Coordinates &) + { + run_offset_contribution_output_stage_window<Typer>(nullptr, nullptr, nullptr, mm_result_it, out_it, + result_offset_s32, result_shift_s32, + min_vec, max_vec, a_offset, b_offset, k_offset, + multiplier, shift, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x, false, false, false, is_bounded_relu, is_fixed_point); + }, + mm_result_it, out_it); + } + return; + } +} + +void run_offset_contribution_output_stage_symm(const Window &window, + const ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *bias, ITensor *output, + int32_t a_offset, int32_t b_offset, int32_t k_offset, bool slide_vector_sum_col, + GEMMLowpOutputStageInfo output_stage, bool is_gemm3d, bool is_bounded_relu, bool is_fixed_point) +{ + ARM_COMPUTE_UNUSED(vector_sum_row, b_offset, k_offset); + + const int depth_input = is_gemm3d ? mm_result->info()->dimension(2) : 1; + + const int32_t offset = output_stage.gemmlowp_offset; + const int32_t min_bound = output_stage.gemmlowp_min_bound; + const int32_t max_bound = output_stage.gemmlowp_max_bound; + + const int32_t *result_multipliers = output_stage.gemmlowp_multipliers.data(); + const int32_t *result_shifts = output_stage.gemmlowp_shifts.data(); + const int32x4_t result_offset_s32 = vdupq_n_s32(offset); + const int8x16_t min_s8 = vdupq_n_s8(static_cast<int8_t>(min_bound)); + const int8x16_t max_s8 = vdupq_n_s8(static_cast<int8_t>(max_bound)); + + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + Window win(window); + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Window collapsed_window = win.collapse_if_possible(win, Window::DimZ); + + Iterator mm_result_it(mm_result, win); + Iterator out_it(output, win); + + if(a_offset != 0) + { + ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_col); + + Iterator vector_sum_col_it = get_vector_sum_col_it(collapsed_window, vector_sum_col); + + // Offset in case vector_sum_col is batched + const int vector_sum_col_batch_offset = slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0; + + if(bias != nullptr) + { + Iterator bias_it = get_bias_it(collapsed_window, bias); + execute_window_loop(collapsed_window, [&](const Coordinates & id) + { + const int batch_id = id.z() / depth_input; + const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); + run_offset_contribution_output_stage_window_symm(vector_sum_col_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it, + result_multipliers, result_shifts, + result_offset_s32, min_s8, max_s8, + a_offset, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x, true, true, is_bounded_relu, is_fixed_point); + }, + vector_sum_col_it, bias_it, mm_result_it, out_it); + } + else + { + execute_window_loop(collapsed_window, [&](const Coordinates & id) + { + const int batch_id = id.z() / depth_input; + const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset); + run_offset_contribution_output_stage_window_symm(vector_sum_col_ptr, nullptr, mm_result_it, out_it, + result_multipliers, result_shifts, + result_offset_s32, min_s8, max_s8, + a_offset, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x, true, false, is_bounded_relu, is_fixed_point); + }, + vector_sum_col_it, mm_result_it, out_it); + } + } + else + { + if(bias != nullptr) + { + Iterator bias_it = get_bias_it(collapsed_window, bias); + execute_window_loop(collapsed_window, [&](const Coordinates &) + { + run_offset_contribution_output_stage_window_symm(nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it, + result_multipliers, result_shifts, + result_offset_s32, min_s8, max_s8, + a_offset, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x, false, true, is_bounded_relu, is_fixed_point); + }, + bias_it, mm_result_it, out_it); + } + else + { + execute_window_loop(collapsed_window, [&](const Coordinates &) + { + run_offset_contribution_output_stage_window_symm(nullptr, nullptr, mm_result_it, out_it, + result_multipliers, result_shifts, + result_offset_s32, min_s8, max_s8, + a_offset, offset, min_bound, max_bound, + window_step_x, window_start_x, window_end_x, false, false, is_bounded_relu, is_fixed_point); + }, + mm_result_it, out_it); + } + return; + } +} + +Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output, + int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); + if(output->data_type() != DataType::QASYMM8) + { + ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) > 1 && output_stage.gemmlowp_multipliers.size() > 1 && b_offset != 0); + } + ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound); + ARM_COMPUTE_RETURN_ERROR_ON(output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN && output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT); + + if(bias != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != bias->dimension(0)); + } + + // If a_offset == 0, vector_sum_col can be a nullptr + if(a_offset != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0)); + } + + // If b_offset == 0, vector_sum_row can be a nullptr + if(b_offset != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); + + // Check if input is a 3D reinterpretation + const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); + + // Validate input + ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2))); + ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1)); + + TensorShape output_shape = output->tensor_shape(); + if(output_shape.num_dimensions() > 1) + { + const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2; + + TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape(); + vector_sum_row_shape.collapse_from(1); + output_shape.collapse_from(output_batch_idx); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx], + "mm_result tensor must have the same number of batches of output tensor"); + + if(a_offset != 0) + { + TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); + vector_sum_col_shape.collapse_from(1); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1], + "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); + } + } + } + + if(output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mm_result, output); + } + + return Status{}; +} +} // namespace + +void CpuGemmLowpOffsetContributionOutputStageKernel::configure(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, const ITensorInfo *bias, ITensorInfo *dst, + int32_t k, int32_t a_offset, int32_t b_offset, + GEMMLowpOutputStageInfo output_stage) +{ + ARM_COMPUTE_UNUSED(vector_sum_row, bias); + // Perform validate step + ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset, b_offset, output_stage)); + + _a_offset = a_offset; + _b_offset = b_offset; + _k_offset = a_offset * b_offset * k; + _output_stage = output_stage; + + // If a_offset == 0, vector_sum_col can be a nullptr + if(a_offset != 0) + { + // Check if vector_sum_col_shape should be slidden or not + // Don't slide vector_sum_col_shape along the y dimension if vector_sum_col_shape has just 1 dimension and vector_sum_row_shape more than 1 + // This scenario can happen when the the matrix multiplication is used to perform a convolution operation + _slide_vector_sum_col = vector_sum_col->tensor_shape().num_dimensions() > 1; + } + + // Output auto inizialitation if not yet initialized + auto_init_if_empty(*dst, mm_result->clone()->set_data_type(DataType::QASYMM8)); + + // Configure kernel window + Window win = calculate_max_window(*mm_result, Steps()); + + // Note: This kernel performs 16 elements per iteration. + // However, since we use a left-over for loop, we cannot have any read or write out of memory + // For this reason num_elems_processed_per_iteration is 1 and so update_window_and_padding() can be skipped + ICpuKernel::configure(win); +} + +Status CpuGemmLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output, + int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, output); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, output, a_offset, b_offset, output_stage)); + return Status{}; +} + +void CpuGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + auto mm_result = tensors.get_const_tensor(TensorType::ACL_SRC_0); + auto vector_sum_col = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto vector_sum_row = tensors.get_const_tensor(TensorType::ACL_SRC_2); + auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_3); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + PixelValue type_min{}; + PixelValue type_max{}; + std::tie(type_min, type_max) = get_min_max(dst->info()->data_type()); + int32_t type_min_int = type_min.get<int32_t>(); + int32_t type_max_int = type_max.get<int32_t>(); + + const bool reinterpret_as_3d = vector_sum_row != nullptr + && mm_result->info()->num_dimensions() > 1 + && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x(); + + const bool is_bounded_relu = !(_output_stage.gemmlowp_min_bound <= type_min_int && _output_stage.gemmlowp_max_bound >= type_max_int); + + // Check if we need to perform fixed point requantization + const bool is_fixed_point = _output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN; + + // Check if symmetric per-channel execution + const bool is_signed = dst->info()->data_type() == DataType::QASYMM8_SIGNED; + + // Check if symmetric per-channel execution + const bool is_symm = _output_stage.is_quantized_per_channel; + + if(is_symm) + { + run_offset_contribution_output_stage_symm(window, mm_result, vector_sum_col, vector_sum_row, bias, dst, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, _output_stage, + reinterpret_as_3d, is_bounded_relu, is_fixed_point); + } + else + { + if(is_signed) + { + run_offset_contribution_output_stage<int8_t>(window, mm_result, vector_sum_col, vector_sum_row, bias, dst, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, _output_stage, + reinterpret_as_3d, is_bounded_relu, is_fixed_point); + } + else + { + run_offset_contribution_output_stage<uint8_t>(window, mm_result, vector_sum_col, vector_sum_row, bias, dst, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, _output_stage, + reinterpret_as_3d, is_bounded_relu, is_fixed_point); + } + } +} + +const char *CpuGemmLowpOffsetContributionOutputStageKernel::name() const +{ + return "CpuGemmLowpOffsetContributionOutputStageKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h b/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h new file mode 100644 index 0000000000..d97727dd09 --- /dev/null +++ b/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_GEMMLOWP_OFFSETCONTRIBUTION_OUTPUTSTAGE_KERNEL_H +#define ARM_COMPUTE_CPU_GEMMLOWP_OFFSETCONTRIBUTION_OUTPUTSTAGE_KERNEL_H + +#include "arm_compute/core/KernelDescriptors.h" +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Kernel used to add the offset contribution and perform the output stage after @ref CpuGemmLowpMatrixMultiplyKernel. + * + * The computation is performed in-place + * + * This kernel takes a final int32 accumulator value (the output of @ref CpuGemmLowpMatrixMultiplyKernel), + * and adds to it the offset contribution of matrix A and matrix B in-place. + * + * The output stage can perform either QuantizeDownInt32ToUint8Scale or QuantizeDownInt32ToUint8ScaleByFixedPoint for Uint8. + * The output stage can perform either QuantizeDownInt32ToInt8Scale or QuantizeDownInt32ToInt8ScaleByFixedPoint for Int8. + * + * For QuantizeDownInt32ToUint8Scale/QuantizeDownInt32ToInt8Scale the final result is: + * + * ((mm_result'[i][k] + result_offset) * result_mult_int) >> result_shift + * + * For QuantizeDownInt32ToUint8ScaleByFixedPoint/QuantizeDownInt32ToInt8ScaleByFixedPoint the final result is: + * + * (FixedPointMul(mm_result'[i][k], result_fixedpoint_multiplier) >> result_shift) + result_offset_after_shift + * + * where FixedPointMul(x, y) is the nearest integer to the following + * mathematical expression, evaluated without overflow or intermediate rounding: + * + * (x * y) / 2^31 + * + * and mm_result'[i][k] = mm_result[i][k] + + * (vector_sum_col[k] * a_offset) + + * (vector_sum_row[i] * b_offset) + + * (a_offset * b_offset * k) + */ + +class CpuGemmLowpOffsetContributionOutputStageKernel : public ICpuKernel +{ +public: + /** Default constructor */ + CpuGemmLowpOffsetContributionOutputStageKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpOffsetContributionOutputStageKernel); + /** Initialise the kernel inputs and output. + * + * @param[in] mm_result Input tensor info containing the result of @ref CpuGemmLowpMatrixMultiplyKernel. Data type supported: S32 + * @param[in] vector_sum_col Input row-vector tensor info of sums of all the entries in each column of matrix B. + * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result + * @param[in] vector_sum_row Input row-vector tensor info of sums of all the entries in each row of matrix A. + * @param[in] bias Biases tensor info. Only shared biases supported and it can be a nullptr if the addition of biases is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p mm_result. + * @param[out] dst Output tensor info containing the final quantized result. Data type supported: QASYMM8/QASYMM8_SIGNED + * @param[in] k Number of matrix A columns or Matrix B rows + * @param[in] a_offset Offset to be added to each element of the matrix A. + * @param[in] b_offset Offset to be added to each element of the matrix B. + * @param[in] output_stage GEMMLowp output stage info, providing the type of quantization and the necessary parameters. + */ + void configure(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, ITensorInfo *dst, int32_t k, int32_t a_offset, + int32_t b_offset, + GEMMLowpOutputStageInfo output_stage); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuGemmLowpOffsetContributionOutputStageKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *dst, int32_t a_offset, + int32_t b_offset, + GEMMLowpOutputStageInfo output_stage); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + /** Function to use for the particular tensors passed to configure() */ + int32_t _a_offset{ 0 }; + int32_t _b_offset{ 0 }; + int32_t _k_offset{ 0 }; + bool _slide_vector_sum_col{ true }; + GEMMLowpOutputStageInfo _output_stage{ GEMMLowpOutputStageInfo() }; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_GEMMLOWP_OFFSETCONTRIBUTION_OUTPUTSTAGE_KERNEL_H */ diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.cpp b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.cpp new file mode 100644 index 0000000000..3023d93113 --- /dev/null +++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.cpp @@ -0,0 +1,326 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32); + + ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_max_bound > std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))); + ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_min_bound < std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)) + || output_stage->gemmlowp_min_bound > output_stage->gemmlowp_max_bound); + + // Check biases if exist + if(bias != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); + ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); + } + + if(dst->total_size() != 0) + { + if(dst->data_type() != output_stage->output_data_type && (output_stage->output_data_type == DataType::QASYMM8 || output_stage->output_data_type == DataType::QASYMM8_SIGNED)) + { + ARM_COMPUTE_RETURN_ERROR_MSG("Mismatching data types"); + } + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); + } + + return Status{}; +} + +inline void scale_input(int32x4x4_t &in_s32, int32x4_t result_offset_s32, int32_t result_mult_int) +{ + // Add the offset terms to GEMM's result + in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_s32); + in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_s32); + in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_s32); + in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_s32); + + // Multiply by result_mult_int + in_s32.val[0] = vmulq_n_s32(in_s32.val[0], result_mult_int); + in_s32.val[1] = vmulq_n_s32(in_s32.val[1], result_mult_int); + in_s32.val[2] = vmulq_n_s32(in_s32.val[2], result_mult_int); + in_s32.val[3] = vmulq_n_s32(in_s32.val[3], result_mult_int); +} + +template <typename T> +inline typename std::enable_if<std::is_same<T, uint8_t>::value, + typename wrapper::traits::neon_vector<T, 16>::type>::type + convert_to_8bit(const int16x8x2_t in_s16) +{ + return wrapper::vcombine(wrapper::vqmovun(in_s16.val[0]), wrapper::vqmovun(in_s16.val[1])); +} + +template <typename T> +inline typename std::enable_if<std::is_same<T, int8_t>::value, + typename wrapper::traits::neon_vector<T, 16>::type>::type + convert_to_8bit(const int16x8x2_t in_s16) +{ + return wrapper::vcombine(wrapper::vqmovn(in_s16.val[0]), wrapper::vqmovn(in_s16.val[1])); +} + +template <typename T> +inline typename wrapper::traits::neon_vector<T, 16>::type finalize_quantization(int32x4x4_t &in_s32, int32x4_t result_shift_s32, typename wrapper::traits::neon_vector<T, 16>::type min, + typename wrapper::traits::neon_vector<T, 16>::type max) +{ + // Shift final result (negative value shift right) + in_s32.val[0] = vshlq_s32(in_s32.val[0], result_shift_s32); + in_s32.val[1] = vshlq_s32(in_s32.val[1], result_shift_s32); + in_s32.val[2] = vshlq_s32(in_s32.val[2], result_shift_s32); + in_s32.val[3] = vshlq_s32(in_s32.val[3], result_shift_s32); + + // Convert S32 to S16 + const int16x8x2_t in_s16 = + { + { + vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])), + vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3])) + } + }; + + // Convert S16 to S8 or U8 + typename wrapper::traits::neon_vector<T, 16>::type out = convert_to_8bit<T>(in_s16); + + out = wrapper::vmax(out, min); + out = wrapper::vmin(out, max); + + return out; +} +} // namespace + +template <typename T> +void CpuGemmLowpQuantizeDownInt32ScaleKernel::run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window) +{ + using VectorType = typename wrapper::traits::neon_vector<T, 16>::type; + + const int32x4_t result_offset_s32 = vdupq_n_s32(_output_stage->gemmlowp_offset); + const int32x4_t result_shift_s32 = vdupq_n_s32(-_output_stage->gemmlowp_shift); + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + const int clamp_min = (_is_bounded_relu) ? _output_stage->gemmlowp_min_bound : std::numeric_limits<T>::lowest(); + const int clamp_max = (_is_bounded_relu) ? _output_stage->gemmlowp_max_bound : std::numeric_limits<T>::max(); + + VectorType min = wrapper::vdup_n(static_cast<T>(clamp_min), wrapper::traits::vector_128_tag{}); + VectorType max = wrapper::vdup_n(static_cast<T>(clamp_max), wrapper::traits::vector_128_tag{}); + + Window win(window); + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator in(src, win); + Iterator out(dst, win); + + if(bias != nullptr) + { + Window win_biases; + win_biases.set(Window::DimX, Window::Dimension(0, 1, 1)); + win_biases.set(Window::DimY, Window::Dimension(0, 1, 1)); + + Iterator bias_i(bias, win_biases); + execute_window_loop(win, [&](const Coordinates &) + { + // Compute 16 elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + int32x4x4_t in_s32 = + { + { + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12) + } + }; + + const int32x4x4_t bias_s32 = + { + { + vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 0), + vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 4), + vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 8), + vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 12) + } + }; + + // Add the bias to GEMM's result + in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]); + in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]); + in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]); + in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]); + + // Add the offset terms to GEMM's result and multiply by result_mult_int + scale_input(in_s32, result_offset_s32, _output_stage->gemmlowp_multiplier); + + wrapper::vstore(reinterpret_cast<T *>(out.ptr() + x), finalize_quantization<T>(in_s32, result_shift_s32, min, max)); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const int bias_value = *(reinterpret_cast<const int *>(bias_i.ptr()) + x); + int in_value = *(reinterpret_cast<const int *>(in.ptr()) + x); + + // Quantize + in_value = ((in_value + bias_value + _output_stage->gemmlowp_offset) * _output_stage->gemmlowp_multiplier) >> _output_stage->gemmlowp_shift; + + // Store the result + *(out.ptr() + x) = static_cast<T>(utility::clamp<int>(in_value, clamp_min, clamp_max)); + } + }, + in, bias_i, out); + } + else + { + execute_window_loop(win, [&](const Coordinates &) + { + // Compute 16 elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + int32x4x4_t in_s32 = + { + { + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12) + } + }; + + // Add the offset terms to GEMM's result and multiply by result_mult_int + scale_input(in_s32, result_offset_s32, _output_stage->gemmlowp_multiplier); + + wrapper::vstore(reinterpret_cast<T *>(out.ptr() + x), finalize_quantization<T>(in_s32, result_shift_s32, min, max)); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + int in_value = *(reinterpret_cast<const int *>(in.ptr()) + x); + + // Quantize + in_value = ((in_value + _output_stage->gemmlowp_offset) * _output_stage->gemmlowp_multiplier) >> _output_stage->gemmlowp_shift; + + // Store the result + *(out.ptr() + x) = static_cast<T>(utility::clamp<int>(in_value, clamp_min, clamp_max)); + } + }, + in, out); + } +} + +void CpuGemmLowpQuantizeDownInt32ScaleKernel::configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage) +{ + ARM_COMPUTE_UNUSED(bias); + // Perform validate step + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst, output_stage); + + // Output auto inizialitation if not yet initialized + auto_init_if_empty(*dst, src->clone()->set_data_type(output_stage->output_data_type)); + + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, + bias, + dst, + output_stage)); + + _output_stage = output_stage; + + // Configure kernel window + Window win = calculate_max_window(*src, Steps()); + + ICpuKernel::configure(win); + + // Check if we need to clamp the result using min and max + _is_bounded_relu = ((_output_stage->gemmlowp_min_bound != _output_stage->gemmlowp_max_bound) + && !(_output_stage->gemmlowp_min_bound == std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)) + && _output_stage->gemmlowp_max_bound == std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)))); + if(_output_stage->output_data_type == DataType::QASYMM8) + { + _func = &CpuGemmLowpQuantizeDownInt32ScaleKernel::run_internal<uint8_t>; + } + else if(_output_stage->output_data_type == DataType::QASYMM8_SIGNED) + { + _func = &CpuGemmLowpQuantizeDownInt32ScaleKernel::run_internal<int8_t>; + } + else + { + ARM_COMPUTE_ERROR("Data type not supported"); + } +} + +Status CpuGemmLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, output_stage)); + return Status{}; +} + +void CpuGemmLowpQuantizeDownInt32ScaleKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); + + auto src = tensors.get_const_tensor(TensorType::ACL_SRC); + auto bias = tensors.get_const_tensor(TensorType::ACL_BIAS); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + (this->*_func)(src, bias, dst, window); +} + +const char *CpuGemmLowpQuantizeDownInt32ScaleKernel::name() const +{ + return "CpuGemmLowpQuantizeDownInt32ScaleKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute
\ No newline at end of file diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h new file mode 100644 index 0000000000..ae13e760ff --- /dev/null +++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H +#define ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H + +#include "arm_compute/core/KernelDescriptors.h" +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +// Forward declarations +class ITensor; +namespace cpu +{ +namespace kernels +{ +/** Kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED + * + * This kernel takes a final int32 accumulator value (the output of @ref CpuGemmLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value. + * The following computations will be performed by the kernel: + * + * -# Add offset terms to final result + * -# Multiply each entry of result by result_mult_int + * -# Add bias to final result if bias tensor is not a nullptr + * -# Shift the int32 accumulator by result_shift + * -# Clamp the value between the specified min and max bounds + * -# Clamp the resulting int32 values: + * -# -to the [0..255] range and cast to QASYMM8. + * -# -to the [-128..127] range and cast to QASYMM8_SIGNED. + * + */ +class CpuGemmLowpQuantizeDownInt32ScaleKernel : public ICpuKernel +{ +public: + CpuGemmLowpQuantizeDownInt32ScaleKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpQuantizeDownInt32ScaleKernel); + /** Initialise the kernel's input and output. + * + * @param[in] src Input tensor info. Data type supported: S32 + * @param[in] bias Biases tensor info. Only shared biases supported and it can be a nullptr if the biases addition is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. + * @param[out] dst Output tensor info. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED + * @param[out] output_stage GEMMLowp output stage metadata. + */ + void configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuGemmLowpQuantizeDownInt32ScaleKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + /** Template function to run the NEGEMMLowpQuantizeDownInt32ScaleKernel + * + * @param[in] src Input tensor info + * @param[in] bias Biases tensor info + * @param[out] dst Output tensor info + * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()) + */ + template <typename T> + void run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window); + + /** Common signature for all the specialised CpuGemmLowpQuantizeDownInt32ScaleKernel functions + * + * @param[in] src Input tensor info + * @param[in] bias Biases tensor info + * @param[out] dst Output tensor info + * @param[in] window Region on which to execute the kernel. + */ + using QuantizeDownFunctionPtr = void (CpuGemmLowpQuantizeDownInt32ScaleKernel::*)(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window); + + QuantizeDownFunctionPtr _func{ nullptr }; + const GEMMLowpOutputStageInfo *_output_stage{ nullptr }; + bool _is_bounded_relu{ false }; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H */ diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp new file mode 100644 index 0000000000..53ca991889 --- /dev/null +++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp @@ -0,0 +1,227 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/NEON/NESymm.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min, int max) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(min > max); + + // Check biases if exist + if(bias != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); + ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); + } + + if(dst->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QSYMM16); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, src); + } + + return Status{}; +} +} // namespace + +template <bool is_bounded_relu> +void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window) +{ + const int16x8_t min_s16 = vdupq_n_s16(static_cast<int16_t>(_min)); + const int16x8_t max_s16 = vdupq_n_s16(static_cast<int16_t>(_max)); + + ARM_COMPUTE_UNUSED(min_s16); + ARM_COMPUTE_UNUSED(max_s16); + + const int window_step_x = 8; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator in(src, win_collapsed); + Iterator out(dst, win_collapsed); + if(bias != nullptr) + { + Window win_biases; + win_biases.set(Window::DimX, Window::Dimension(0, 1, 1)); + win_biases.set(Window::DimY, Window::Dimension(0, 1, 1)); + + Iterator bias_i(bias, win_biases); + execute_window_loop(win_collapsed, [&](const Coordinates &) + { + // Compute 16 elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + int32x4x2_t in_s32 = + { + { + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4) + } + }; + + const int32x4x2_t bias_s32 = + { + { + vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 0), + vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 4) + } + }; + + // Add the bias to GEMM's result + in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]); + in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]); + + vst1q_s16(reinterpret_cast<int16_t *>(out.ptr()) + x, finalize_quantization_int16<is_bounded_relu>(in_s32, _result_fixedpoint_multiplier, _result_shift, min_s16, max_s16)); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const int32_t bias_value = *(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x); + int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x); + + // Add bias + in_value += bias_value; + // Finalize and store the result + *(reinterpret_cast<int16_t *>(out.ptr()) + x) = finalize_quantization_int16<is_bounded_relu>(in_value, _result_fixedpoint_multiplier, _result_shift, static_cast<int16_t>(_min), + static_cast<int16_t>(_max)); + } + }, + in, out, bias_i); + } + else + { + execute_window_loop(win_collapsed, [&](const Coordinates &) + { + // Compute 16 elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + int32x4x2_t in_s32 = + { + { + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4) + } + }; + + vst1q_s16(reinterpret_cast<int16_t *>(out.ptr()) + x, finalize_quantization_int16<is_bounded_relu>(in_s32, _result_fixedpoint_multiplier, _result_shift, min_s16, max_s16)); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x); + ARM_COMPUTE_UNUSED(in_value); + // Finalize and store the result + *(reinterpret_cast<int16_t *>(out.ptr()) + x) = finalize_quantization_int16<is_bounded_relu>(in_value, _result_fixedpoint_multiplier, _result_shift, static_cast<int16_t>(_min), + static_cast<int16_t>(_max)); + } + }, + in, out); + } +} + +void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift, + int min, int max) +{ + // Perform validate step + ARM_COMPUTE_UNUSED(bias, dst); + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, min, max)); + + _result_fixedpoint_multiplier = result_fixedpoint_multiplier; + _result_shift = result_shift; + _min = min; + _max = max; + + // Output auto inizialitation if not yet initialized + auto_init_if_empty(*src, src->clone()->set_data_type(DataType::QSYMM16)); + // Configure kernel window + Window win_config = calculate_max_window(*src, Steps()); + ICpuKernel::configure(win_config); + + // Check if we need to clamp the result using min and max + const bool is_bounded_relu = !(min <= -32768 && max >= 32767); + _func = is_bounded_relu ? &CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal<true> : + &CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal<false>; +} + +Status CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, min, max)); + return Status{}; +} + +void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); + + auto src = tensors.get_const_tensor(TensorType::ACL_SRC); + auto bias = tensors.get_const_tensor(TensorType::ACL_BIAS); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + (this->*_func)(src, bias, dst, window); +} + +const char *CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::name() const +{ + return "CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h new file mode 100644 index 0000000000..53a9d34ed1 --- /dev/null +++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOINT16_SCALEBYFIXEDPOINT_KERNEL_H +#define ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOINT16_SCALEBYFIXEDPOINT_KERNEL_H + +#include "arm_compute/core/KernelDescriptors.h" +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +// Forward declaration +class ITensor; +namespace cpu +{ +namespace kernels +{ +/** Kernel used to quantize down the int32 accumulator values of GEMMLowp to QSYMM16 + * + * This kernel takes a final int32 accumulator value (the output of @ref CpuGemmLowpMatrixMultiplyKernel), and processes it to obtain the final QSYMM16 value. + * The following computations will be performed by the kernel: + * + * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier + * -# Add bias to final result if bias tensor is not a nullptr + * -# Round to nearest division by a power-of-two using result_shift + * -# Clamp the value between the specified min and max bounds + * -# Clamp the resulting int32 values to the [-32768, 32767] range and cast to QSYMM16. + * + */ +class CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel : public ICpuKernel +{ +public: + CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel); + /** Initialise the kernel's input and output. + * + * @param[in] src Input tensor info. Data type supported: S32 + * @param[in] bias Biases tensor info. Only shared biases supported and it can be a nullptr if the biases addition is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. + * @param[out] dst Output tensor info. Data type supported: Data type supported: QSYMM16 + * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add + * @param[in] result_shift Integer value used to round to nearest division by a power-of-two the result after the fixed point multiplication + * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to 0. + * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QSYMM16. + * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to 0. + */ + void configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift, int min = 0, int max = 0); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min = 0, int max = 0); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + /** Template function to run the CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel + * + * @param[in] src Input tensor info + * @param[in] bias Bias tensor info + * @param[out] dst Output tensor info + * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()). + */ + template <bool is_bounded_relu> + void run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window); + + /** Common signature for all the specialised CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel functions + * + * @param[in] src Input tensor info + * @param[in] bias Bias tensor info + * @param[out] dst Output tensor info + * @param[in] window Region on which to execute the kernel. + */ + using QuantizeDownFunctionPtr = void (CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::*)( + const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window); + + QuantizeDownFunctionPtr _func{ nullptr }; + int _result_fixedpoint_multiplier{ 0 }; + int _result_shift{ 0 }; + int _min{ 0 }; + int _max{ 0 }; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOINT16_SCALEBYFIXEDPOINT_KERNEL_H */ diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp new file mode 100644 index 0000000000..27214dcb5a --- /dev/null +++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp @@ -0,0 +1,239 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min, int max) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(min > max); + + // Check biases if exist + if(bias != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); + ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); + } + + if(dst->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, src); + } + + return Status{}; +} +} // namespace + +template <bool is_bounded_relu> +void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window) +{ + const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(_result_offset_after_shift); + const int8x16_t min_s8 = vdupq_n_s8(static_cast<int8_t>(_min)); + const int8x16_t max_s8 = vdupq_n_s8(static_cast<int8_t>(_max)); + + ARM_COMPUTE_UNUSED(min_s8, max_s8); + + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator in(src, win_collapsed); + Iterator out(dst, win_collapsed); + if(bias != nullptr) + { + Window win_biases; + win_biases.set(Window::DimX, Window::Dimension(0, 1, 1)); + win_biases.set(Window::DimY, Window::Dimension(0, 1, 1)); + + Iterator bias_i(bias, win_biases); + execute_window_loop(win_collapsed, [&](const Coordinates &) + { + // Compute 16 elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + int32x4x4_t in_s32 = + { + { + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12) + } + }; + + const int32x4x4_t bias_s32 = + { + { + vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 0), + vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 4), + vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 8), + vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 12) + } + }; + + // Add the bias to GEMM's result + in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]); + in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]); + in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]); + in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]); + + vst1q_s8(reinterpret_cast<int8_t *>(out.ptr() + x), + finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_s8, max_s8, is_bounded_relu)); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const int32_t bias_value = *(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x); + int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x); + + // Add bias + in_value += bias_value; + // Finalize and store the result + *reinterpret_cast<int8_t *>(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, + static_cast<int8_t>(_min), static_cast<int8_t>(_max), is_bounded_relu); + } + }, + in, out, bias_i); + } + else + { + execute_window_loop(win_collapsed, [&](const Coordinates &) + { + // Compute 16 elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + int32x4x4_t in_s32 = + { + { + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12) + } + }; + + vst1q_s8(reinterpret_cast<int8_t *>(out.ptr() + x), + finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_s8, max_s8, is_bounded_relu)); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x); + + // Finalize and store the result + *reinterpret_cast<int8_t *>(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, + static_cast<int8_t>(_min), static_cast<int8_t>(_max), is_bounded_relu); + } + }, + in, out); + } +} + +void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift, + int result_offset_after_shift, int min, int max) +{ + ARM_COMPUTE_UNUSED(bias); + // Perform validate step + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, min, max)); + + _result_fixedpoint_multiplier = result_fixedpoint_multiplier; + _result_shift = result_shift; + _result_offset_after_shift = result_offset_after_shift; + _min = min; + _max = max; + + // Output auto initialization if not yet initialized + auto_init_if_empty(*dst, src->clone()->set_data_type(DataType::QASYMM8_SIGNED)); + + // Configure kernel window + Window win_config = calculate_max_window(*src, Steps()); + ICpuKernel::configure(win_config); + + // Check if we need to clamp the result using min and max + const bool is_bounded_relu = !(min <= -128 && max >= 127); + _func = is_bounded_relu ? &CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal<true> : + &CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal<false>; +} + +Status CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min, int max) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, min, max)); + return Status{}; +} + +void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); + + auto src = tensors.get_const_tensor(TensorType::ACL_SRC); + auto bias = tensors.get_const_tensor(TensorType::ACL_BIAS); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + (this->*_func)(src, bias, dst, window); +} + +const char *CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::name() const +{ + return "CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h new file mode 100644 index 0000000000..67829e7773 --- /dev/null +++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOINT8_SCALEBYFIXEDPOINT_KERNEL_H +#define ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOINT8_SCALEBYFIXEDPOINT_KERNEL_H + +#include "arm_compute/core/KernelDescriptors.h" +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +// Forward declaration +class ITensor; +namespace cpu +{ +namespace kernels +{ +/** Kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8_SIGNED + * + * This kernel takes a final int32 accumulator value (the output of @ref CpuGemmLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8_SIGNED value. + * The following computations will be performed by the kernel: + * + * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier + * -# Add bias to final result if bias tensor is not a nullptr + * -# Round to nearest division by a power-of-two using result_shift + * -# Add offset to each result + * -# Clamp the value between the specified min and max bounds + * -# Clamp the resulting int32 values to the [-128..127] range and cast to QASYMM8_SIGNED. + * + */ +class CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel : public ICpuKernel +{ +public: + CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel); + /** Initialise the kernel's input and output. + * + * @param[in] src Input tensor info. Data type supported: S32 + * @param[in] bias Biases tensor info. Only shared biases supported and it can be a nullptr if the biases addition is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. + * @param[out] dst Output tensor info. Data type supported: Data type supported: QASYMM8_SIGNED + * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add + * @param[in] result_shift Integer value used to round to nearest division by a power-of-two the result after the fixed point multiplication + * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8_SIGNED + * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED + * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED, + * Along with @p min, this value can be used to implement "rectified linear unit" activation functions + */ + void configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min = 0, int max = 0); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min = 0, int max = 0); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + /** Template function to run the CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel + * + * @param[in] src Input tensor info + * @param[in] bias Bias tensor info + * @param[out] dst Output tensor info + * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()). + */ + template <bool is_bounded_relu> + void run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window); + + /** Common signature for all the specialised CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel functions + * + * @param[in] src Input tensor info + * @param[in] bias Bias tensor info + * @param[out] dst Output tensor info + * @param[in] window Region on which to execute the kernel. + */ + using QuantizeDownFunctionPtr = void (CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::*)( + const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window); + + QuantizeDownFunctionPtr _func{ nullptr }; + int _result_fixedpoint_multiplier{ 0 }; + int _result_shift{ 0 }; + int _result_offset_after_shift{ 0 }; + int _min{ 0 }; + int _max{ 0 }; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOINT8_SCALEBYFIXEDPOINT_KERNEL_H */ diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp new file mode 100644 index 0000000000..e49fd29115 --- /dev/null +++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp @@ -0,0 +1,236 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min, int max) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(min > max); + + // Check biases if exist + if(bias != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); + ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); + } + + if(dst->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, src); + } + + return Status{}; +} +} // namespace + +template <bool is_bounded_relu> +void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window) +{ + const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(_result_offset_after_shift); + const uint8x16_t min_u8 = vdupq_n_u8(static_cast<uint8_t>(_min)); + const uint8x16_t max_u8 = vdupq_n_u8(static_cast<uint8_t>(_max)); + + ARM_COMPUTE_UNUSED(min_u8); + ARM_COMPUTE_UNUSED(max_u8); + + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator in(src, win_collapsed); + Iterator out(dst, win_collapsed); + if(bias != nullptr) + { + Window win_biases; + win_biases.set(Window::DimX, Window::Dimension(0, 1, 1)); + win_biases.set(Window::DimY, Window::Dimension(0, 1, 1)); + + Iterator bias_i(bias, win_biases); + execute_window_loop(win_collapsed, [&](const Coordinates &) + { + // Compute 16 elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + int32x4x4_t in_s32 = + { + { + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12) + } + }; + + const int32x4x4_t bias_s32 = + { + { + vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 0), + vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 4), + vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 8), + vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 12) + } + }; + + // Add the bias to GEMM's result + in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]); + in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]); + in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]); + in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]); + + vst1q_u8(out.ptr() + x, finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_u8, max_u8, is_bounded_relu)); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const int32_t bias_value = *(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x); + int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x); + + // Add bias + in_value += bias_value; + // Finalize and store the result + *(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, static_cast<uint8_t>(_min), static_cast<uint8_t>(_max), is_bounded_relu); + } + }, + in, out, bias_i); + } + else + { + execute_window_loop(win_collapsed, [&](const Coordinates &) + { + // Compute 16 elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + int32x4x4_t in_s32 = + { + { + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8), + vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12) + } + }; + + vst1q_u8(out.ptr() + x, finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_u8, max_u8, is_bounded_relu)); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x); + + // Finalize and store the result + *(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, static_cast<uint8_t>(_min), static_cast<uint8_t>(_max), is_bounded_relu); + } + }, + in, out); + } +} + +void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift, + int result_offset_after_shift, int min, int max) +{ + ARM_COMPUTE_UNUSED(bias); + // Perform validate step + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, min, max)); + + _result_fixedpoint_multiplier = result_fixedpoint_multiplier; + _result_shift = result_shift; + _result_offset_after_shift = result_offset_after_shift; + _min = min; + _max = max; + + // Output auto inizialitation if not yet initialized + auto_init_if_empty(*dst, src->clone()->set_data_type(DataType::QASYMM8)); + + // Configure kernel window + auto win_config = calculate_max_window(*src, Steps()); + ICpuKernel::configure(win_config); + + // Check if we need to clamp the result using min and max + const bool is_bounded_relu = !(min <= 0 && max >= 255); + _func = is_bounded_relu ? &CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal<true> : + &CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal<false>; +} + +Status CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min, int max) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, min, max)); + return Status{}; +} + +void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); + + auto src = tensors.get_const_tensor(TensorType::ACL_SRC); + auto bias = tensors.get_const_tensor(TensorType::ACL_BIAS); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + (this->*_func)(src, bias, dst, window); +} + +const char *CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::name() const +{ + return "CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute
\ No newline at end of file diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h new file mode 100644 index 0000000000..b62cac4818 --- /dev/null +++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOUINT8_SCALEBYFIXEDPOINT_KERNEL_H +#define ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOUINT8_SCALEBYFIXEDPOINT_KERNEL_H + +#include "arm_compute/core/KernelDescriptors.h" +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +// Forward declaration +class ITensor; +namespace cpu +{ +namespace kernels +{ +/** Kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8 + * + * This kernel takes a final int32 accumulator value (the output of @ref CpuGemmLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value. + * The following computations will be performed by the kernel: + * + * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier + * -# Add bias to final result if bias tensor is not a nullptr + * -# Round to nearest division by a power-of-two using result_shift + * -# Add offset to each result + * -# Clamp the value between the specified min and max bounds + * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8. + * + */ +class CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel : public ICpuKernel +{ +public: + CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel); + /** Initialise the kernel's input and output. + * + * @param[in] src Input tensor info. Data type supported: S32 + * @param[in] bias Biases tensor info. Only shared biases supported and it can be a nullptr if the biases addition is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. + * @param[out] dst Output tensor info. Data type supported: Data type supported: QASYMM8 + * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add + * @param[in] result_shift Integer value used to round to nearest division by a power-of-two the result after the fixed point multiplication + * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8 + * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8 + * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8, + * Along with @p min, this value can be used to implement "rectified linear unit" activation functions + */ + void configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min = 0, int max = 0); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min = 0, int max = 0); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + /** Template function to run the CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel + * + * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()). + */ + template <bool is_bounded_relu> + void run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window); + + /** Common signature for all the specialised CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel functions + * + * @param[in] window Region on which to execute the kernel. + */ + using QuantizeDownFunctionPtr = void (CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::*)( + const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window); + + QuantizeDownFunctionPtr _func{ nullptr }; + int _result_fixedpoint_multiplier{ 0 }; + int _result_shift{ 0 }; + int _result_offset_after_shift{ 0 }; + int _min{ 0 }; + int _max{ 0 }; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOUINT8_SCALEBYFIXEDPOINT_KERNEL_H */ diff --git a/src/cpu/kernels/CpuGemmMatrixAdditionKernel.cpp b/src/cpu/kernels/CpuGemmMatrixAdditionKernel.cpp new file mode 100644 index 0000000000..81376fb029 --- /dev/null +++ b/src/cpu/kernels/CpuGemmMatrixAdditionKernel.cpp @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2016-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuGemmMatrixAdditionKernel.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +void matrix_addition_f32(const ITensor *src, ITensor *dst, const Window &window, float beta) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + const float32x4_t beta_f32 = vdupq_n_f32(beta); + + constexpr int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + Window win = window.collapse_if_possible(window, Window::DimZ); + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator in(src, win); + Iterator out(dst, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto in_ptr = reinterpret_cast<const float *>(in.ptr()); + const auto out_ptr = reinterpret_cast<float *>(out.ptr()); + + int x = window_start_x; + for(; x < (window_end_x - window_step_x); x += window_step_x) + { + float32x4x4_t alpha_ab = vld4q_f32(out_ptr + x); + const float32x4x4_t c = vld4q_f32(in_ptr + x); + + // Multiply matrix C by its weight and accumulate + alpha_ab.val[0] = vmlaq_f32(alpha_ab.val[0], c.val[0], beta_f32); + alpha_ab.val[1] = vmlaq_f32(alpha_ab.val[1], c.val[1], beta_f32); + alpha_ab.val[2] = vmlaq_f32(alpha_ab.val[2], c.val[2], beta_f32); + alpha_ab.val[3] = vmlaq_f32(alpha_ab.val[3], c.val[3], beta_f32); + + vst4q_f32(out_ptr + x, alpha_ab); + } + + // Left-over loop + for(; x < window_end_x; ++x) + { + *(out_ptr + x) += *(in_ptr + x) * beta; + } + }, + in, out); +} + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +void matrix_addition_f16(const ITensor *src, ITensor *dst, const Window &window, float beta) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + const float16x8_t beta_f16 = vdupq_n_f16(beta); + + constexpr int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + Window win = window.collapse_if_possible(window, Window::DimZ); + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator in(src, win); + Iterator out(dst, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto in_ptr = reinterpret_cast<const float16_t *>(in.ptr()); + const auto out_ptr = reinterpret_cast<float16_t *>(out.ptr()); + + int x = window_start_x; + for(; x < (window_end_x - window_step_x); x += window_step_x) + { + float16x8x2_t alpha_ab = vld2q_f16(out_ptr + x); + const float16x8x2_t c = vld2q_f16(in_ptr + x); + // Multiply matrix C by its weight and accumulate + alpha_ab.val[0] = vaddq_f16(alpha_ab.val[0], vmulq_f16(c.val[0], beta_f16)); + alpha_ab.val[1] = vaddq_f16(alpha_ab.val[1], vmulq_f16(c.val[1], beta_f16)); + + vst2q_f16(out_ptr + x, alpha_ab); + } + + // Left-over loop + for(; x < window_end_x; ++x) + { + *(out_ptr + x) += *(in_ptr + x) * static_cast<float16_t>(beta); + } + }, + in, out); +} +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + +} // namespace + +void CpuGemmMatrixAdditionKernel::configure(const ITensorInfo *src, ITensorInfo *dst, float beta) +{ + ARM_COMPUTE_UNUSED(dst); + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(CpuGemmMatrixAdditionKernel::validate(src, dst, beta)); + + _beta = beta; + switch(src->data_type()) + { + case DataType::F32: + _func = &matrix_addition_f32; + break; + case DataType::F16: +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + _func = &matrix_addition_f16; + break; +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + default: + ARM_COMPUTE_ERROR("Data type not supported"); + break; + } + + // Configure kernel window + Window win = calculate_max_window(*src, Steps()); + ICPPKernel::configure(win); +} + +Status CpuGemmMatrixAdditionKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, float beta) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_UNUSED(beta); + + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); + + if(dst->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); + } + return Status{}; +} + +void CpuGemmMatrixAdditionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + ARM_COMPUTE_ERROR_ON(tensors.empty()); + + const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC); + ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); + + if(_beta != 0.0f) + { + (*_func)(src, dst, window, _beta); + } +} + +const char *CpuGemmMatrixAdditionKernel::name() const +{ + return "CpuGemmMatrixAdditionKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuGemmMatrixAdditionKernel.h b/src/cpu/kernels/CpuGemmMatrixAdditionKernel.h new file mode 100644 index 0000000000..c9798fc24c --- /dev/null +++ b/src/cpu/kernels/CpuGemmMatrixAdditionKernel.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2016-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_GEMM_MATRIX_ADDITION_KERNEL_H +#define ARM_COMPUTE_CPU_GEMM_MATRIX_ADDITION_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Kernel to perform the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta: + * + * @note [ MTX_OUT = MTX_0 + beta * MTX_1 ] with MTX_0 and MTX_1 of the same size + * + * @note This stage is used to finalize the GEMM result and it is computed if and only if beta != 0.0. In case this kernel is used for finalizing GEMM result, we have: + * - MTX_0 = A * B * alpha, where MTX_0 is the output of @ref CpuGemmMatrixMultiplyKernel + * - MTX_1 = C + */ +class CpuGemmMatrixAdditionKernel : public ICpuKernel +{ +public: + CpuGemmMatrixAdditionKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmMatrixAdditionKernel); + /** Initialise the kernel's input and output. + * + * @note The input and output tensor must have the same dimensions + * + * @param[in] src Input tensor info (Matrix C). Data types supported: F16/F32 + * @param[in, out] dst Output tensor info. If this kernel is used to finalize the GEMM result, output contains the result obtained by the kernel @ref CpuGemmMatrixMultiplyKernel. Data type supported: the same as @p src. + * @param[in] beta Weight of matrix C + */ + void configure(const ITensorInfo *src, ITensorInfo *dst, float beta); + /** Static function to check if given info will lead to a valid configuration of @ref CpuGemmMatrixAdditionKernel. + * + * @note The input and output tensor must have the same dimensions + * + * Similar to @ref CpuGemmMatrixAdditionKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, float beta); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + /** Common signature for all the matrix addition functions + * + * @param[in] src An input tensor. Data types supported: F16/F32 + * @param[out] dst The output tensor. Data type supported: same as @p src + * @param[in] window Region on which to execute the kernel. + * @param[in] beta Weight of matrix C + */ + using MatrixAdditionFunctionPtr = void (*)(const ITensor *src, ITensor *dst, const Window &window, float beta); + /** Matrix addition function to use for the particular tensor types passed to configure() */ + MatrixAdditionFunctionPtr _func{ nullptr }; + float _beta{ 0.f }; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_GEMM_MATRIX_ADDITION_KERNEL_H */ diff --git a/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.cpp b/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.cpp new file mode 100644 index 0000000000..93ae90436a --- /dev/null +++ b/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.cpp @@ -0,0 +1,1174 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/utils/helpers/float_ops.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +void vector_matrix_multiply_f16(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha) +{ + const auto width_matrix_b = static_cast<int>(dst->info()->dimension(0)); + const auto in_b_stride = static_cast<int>(rhs->info()->strides_in_bytes()[1] / rhs->info()->element_size()); + const auto num_elems_vec_a = static_cast<int>(lhs->info()->dimension(0)); + + // The implementation computes 32 elements per iteration + const int window_start_x = 32 * info.thread_id; + const int window_step_x = 32 * info.num_threads; + const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x; + ARM_COMPUTE_ERROR_ON_MSG((window_end_x - window_start_x) % window_step_x, " (window_end_x - window_start_x) must be multiple of window_step_x"); + + Window win_out(window); + win_out.set(Window::DimX, Window::Dimension(0, 1, 1)); + win_out.set(Window::DimY, Window::Dimension(0, 1, 1)); + + Window win_a(window); + win_a.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_a.set(Window::DimY, Window::Dimension(0, 0, 0)); + + Window win_b; + // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 + // This scenario can happen when the the matrix multiplication is used to perform a convolution operation + if(rhs->info()->num_dimensions() >= 3) + { + win_b = window; + } + win_b.set(Window::DimX, Window::Dimension(0, 1, 1)); + win_b.set(Window::DimY, Window::Dimension(0, 1, 1)); + + Iterator ina(lhs, win_a); + Iterator inb(rhs, win_b); + Iterator out(dst, win_out); + + const bool multiply_alpha = !(helpers::float_ops::is_one(alpha)); + + const float16x8_t alpha_f16 = vdupq_n_f16(alpha); + + execute_window_loop(win_out, [&](const Coordinates &) + { + int x = window_start_x; + // Here we don't check for x lower equal than (window_end_x - window_step_x) because of + // window_end_x is computed above which may cause out-of-bound writes to the dst. + for(; x < (window_end_x - window_step_x); x += window_step_x) + { + if(x > width_matrix_b) + { + return; + } + + auto matrix_b = reinterpret_cast<const float16_t *>(inb.ptr()) + x; + + float16x8_t acc0 = vdupq_n_f16(0.f); + float16x8_t acc1 = vdupq_n_f16(0.f); + float16x8_t acc2 = vdupq_n_f16(0.f); + float16x8_t acc3 = vdupq_n_f16(0.f); + + auto vec_a = reinterpret_cast<const float16_t *>(ina.ptr()); + const float16_t *vec_a_end_addr = vec_a + num_elems_vec_a; + for(; vec_a <= (vec_a_end_addr - 4);) + { + const float16x4_t a0l = vld1_f16(vec_a); + + float16x8_t b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride); + float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride); + float16x8_t b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride); + float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride); + float16x8_t b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride); + float16x8_t b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride); + float16x8_t b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride); + float16x8_t b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride); + + acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 0)); + acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 0)); + acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 0)); + acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 0)); + acc0 = vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 1)); + acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 1)); + acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 1)); + acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 1)); + + matrix_b += 2 * in_b_stride; + + b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride); + b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride); + b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride); + b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride); + b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride); + b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride); + b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride); + b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride); + + acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 2)); + acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 2)); + acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 2)); + acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 2)); + acc0 = vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 3)); + acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 3)); + acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 3)); + acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 3)); + + vec_a += 4; + matrix_b += 2 * in_b_stride; + } + + for(; vec_a < vec_a_end_addr; ++vec_a) + { + const float16_t a0 = *vec_a; + const float16x8_t b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride); + const float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride); + const float16x8_t b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride); + const float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride); + + acc0 = vaddq_f16(acc0, vmulq_n_f16(b00, a0)); + acc1 = vaddq_f16(acc1, vmulq_n_f16(b01, a0)); + acc2 = vaddq_f16(acc2, vmulq_n_f16(b02, a0)); + acc3 = vaddq_f16(acc3, vmulq_n_f16(b03, a0)); + + matrix_b += in_b_stride; + } + + // Multiply by the weight of matrix product (alpha) + if(multiply_alpha) + { + acc0 = vmulq_f16(acc0, alpha_f16); + acc1 = vmulq_f16(acc1, alpha_f16); + acc2 = vmulq_f16(acc2, alpha_f16); + acc3 = vmulq_f16(acc3, alpha_f16); + } + + auto vec_out = reinterpret_cast<float16_t *>(out.ptr()) + x; + + vst1q_f16(vec_out + 0, acc0); + vst1q_f16(vec_out + 8, acc1); + vst1q_f16(vec_out + 16, acc2); + vst1q_f16(vec_out + 24, acc3); + } + + for(; x < window_end_x; ++x) + { + if(x > width_matrix_b) + { + return; + } + + auto matrix_b = reinterpret_cast<const float16_t *>(inb.ptr()) + x; + + float16x4_t vacc = vdup_n_f16(0.f); + + auto vec_a = reinterpret_cast<const float16_t *>(ina.ptr()); + const float16_t *vec_a_end_addr = vec_a + num_elems_vec_a; + for(; vec_a <= (vec_a_end_addr - 4); vec_a += 4) + { + const float16x4_t a0l = vld1_f16(vec_a); + + const float16x4_t b_col = + { + *(matrix_b + 0 * in_b_stride), + *(matrix_b + 1 * in_b_stride), + *(matrix_b + 2 * in_b_stride), + *(matrix_b + 3 * in_b_stride), + }; + + vacc = vadd_f16(vacc, vmul_f16(a0l, b_col)); + + matrix_b += 4 * in_b_stride; + } + + float16_t acc = vget_lane_f16(vacc, 0) + vget_lane_f16(vacc, 1) + vget_lane_f16(vacc, 2) + vget_lane_f16(vacc, 3); + + for(; vec_a < vec_a_end_addr; ++vec_a) + { + const float16_t a0 = *vec_a; + const float16_t b00 = *matrix_b; + + acc += b00 * a0; + + matrix_b += in_b_stride; + } + + // Multiply by the weight of matrix product (alpha) + if(multiply_alpha) + { + acc *= static_cast<float16_t>(alpha); + } + + auto vec_out = reinterpret_cast<float16_t *>(out.ptr()) + x; + + *(vec_out) = acc; + } + }, + ina, inb, out); +} +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + +void vector_matrix_multiply_f32(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha) +{ + const auto width_matrix_b = static_cast<int>(dst->info()->dimension(0)); + const auto in_b_stride = static_cast<int>(rhs->info()->strides_in_bytes()[1] / data_size_from_type(rhs->info()->data_type())); + const auto num_elems_vec_a = static_cast<int>(lhs->info()->dimension(0)); + + // The implementation computes 16 elements per iteration + const int window_start_x = 16 * info.thread_id; + const int window_step_x = 16 * info.num_threads; + // Make sure (window_end_x - window_start_x) is a multiple of window_step_x + const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x; + + Window win_out(window); + win_out.set(Window::DimX, Window::Dimension(0, 1, 1)); + win_out.set(Window::DimY, Window::Dimension(0, 1, 1)); + + Window win_a(window); + win_a.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_a.set(Window::DimY, Window::Dimension(0, 0, 0)); + + Window win_b; + // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 + // This scenario can happen when the the matrix multiplication is used to perform a convolution operation + if(rhs->info()->num_dimensions() >= 3) + { + win_b = window; + } + win_b.set(Window::DimX, Window::Dimension(0, 1, 1)); + win_b.set(Window::DimY, Window::Dimension(0, 1, 1)); + + Iterator ina(lhs, win_a); + Iterator inb(rhs, win_b); + Iterator out(dst, win_out); + + const bool multiply_alpha = !(helpers::float_ops::is_one(alpha)); + + const float32x4_t alpha_f32 = vdupq_n_f32(alpha); + + execute_window_loop(win_out, [&](const Coordinates &) + { + int x = window_start_x; + // Here we don't check for x lower equal than (window_end_x - window_step_x) because of + // window_end_x is computed above which may cause out-of-bound writes to the dst. + for(; x < (window_end_x - window_step_x); x += window_step_x) + { + if(x > width_matrix_b) + { + return; + } + + float32x4_t acc0 = vdupq_n_f32(0.f); + float32x4_t acc1 = vdupq_n_f32(0.f); + float32x4_t acc2 = vdupq_n_f32(0.f); + float32x4_t acc3 = vdupq_n_f32(0.f); + + auto vec_a = reinterpret_cast<const float *>(ina.ptr()); + auto matrix_b = reinterpret_cast<const float *>(inb.ptr()) + x; + +#if __arm__ + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + in_b_stride))); +#endif /* __arm__ */ + + auto vec_a_end_addr = vec_a + num_elems_vec_a; + for(; vec_a <= (vec_a_end_addr - 4);) + { + float32x2_t a0l = vld1_f32(vec_a); + + float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride); + float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride); + float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride); + float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride); + + float32x4_t b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride); + float32x4_t b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride); + float32x4_t b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride); + float32x4_t b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride); + +#if __arm__ + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a))); + asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 1 * in_b_stride))); + asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 2 * in_b_stride))); + asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 3 * in_b_stride))); + asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 4 * in_b_stride))); +#endif /* __arm__ */ + + acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0); + acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0); + acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0); + acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0); + + acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1); + acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1); + acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1); + acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1); + + vec_a += 2; + matrix_b += 2 * in_b_stride; + + a0l = vld1_f32(vec_a); + + b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride); + b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride); + b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride); + b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride); + + b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride); + b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride); + b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride); + b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride); + + acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0); + acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0); + acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0); + acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0); + + acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1); + acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1); + acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1); + acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1); + + vec_a += 2; + matrix_b += 2 * in_b_stride; + } + + for(; vec_a < vec_a_end_addr; ++vec_a) + { + const float a0 = *vec_a; + + const float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride); + const float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride); + const float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride); + const float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride); + + acc0 = vmlaq_n_f32(acc0, b00, a0); + acc1 = vmlaq_n_f32(acc1, b01, a0); + acc2 = vmlaq_n_f32(acc2, b02, a0); + acc3 = vmlaq_n_f32(acc3, b03, a0); + + matrix_b += in_b_stride; + } + + // Multiply by the weight of matrix product (alpha) + if(multiply_alpha) + { + acc0 = vmulq_f32(acc0, alpha_f32); + acc1 = vmulq_f32(acc1, alpha_f32); + acc2 = vmulq_f32(acc2, alpha_f32); + acc3 = vmulq_f32(acc3, alpha_f32); + } + + const auto vec_out = reinterpret_cast<float *>(out.ptr()) + x; + + vst1q_f32(vec_out + 0, acc0); + vst1q_f32(vec_out + 4, acc1); + vst1q_f32(vec_out + 8, acc2); + vst1q_f32(vec_out + 12, acc3); + } + + // Left-over loop + for(; x < window_end_x; ++x) + { + if(x > width_matrix_b) + { + return; + } + + float32x4_t vacc = vdupq_n_f32(0.f); + + auto vec_a = reinterpret_cast<const float *>(ina.ptr()); + auto matrix_b = reinterpret_cast<const float *>(inb.ptr()) + x; + +#if __arm__ + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + in_b_stride))); +#endif /* __arm__ */ + + auto vec_a_end_addr = vec_a + num_elems_vec_a; + for(; vec_a <= (vec_a_end_addr - 4); vec_a += 4) + { + const float32x4_t a0l = vld1q_f32(vec_a); + + const float32x4_t b_col = + { + *(matrix_b + 0 * in_b_stride), + *(matrix_b + 1 * in_b_stride), + *(matrix_b + 2 * in_b_stride), + *(matrix_b + 3 * in_b_stride), + }; + +#if __arm__ + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a))); + asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 1 * in_b_stride))); + asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 2 * in_b_stride))); + asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 3 * in_b_stride))); + asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 4 * in_b_stride))); +#endif /* __arm__ */ + + vacc = vmlaq_f32(vacc, b_col, a0l); + + matrix_b += 4 * in_b_stride; + } + + float acc = vgetq_lane_f32(vacc, 0) + vgetq_lane_f32(vacc, 1) + vgetq_lane_f32(vacc, 2) + vgetq_lane_f32(vacc, 3); + + for(; vec_a < vec_a_end_addr; ++vec_a) + { + const float a0 = *vec_a; + + const float b00 = *matrix_b; + + acc += b00 * a0; + + matrix_b += in_b_stride; + } + + // Multiply by the weight of matrix product (alpha) + if(multiply_alpha) + { + acc *= alpha; + } + + const auto vec_out = reinterpret_cast<float *>(out.ptr()) + x; + + *vec_out = acc; + } + }, + ina, inb, out); +} + +void matrix_matrix_multiply_f32(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha) +{ + ARM_COMPUTE_UNUSED(info); + const int out_width = static_cast<int>(dst->info()->dimension(0)); + const int out_height = static_cast<int>(dst->info()->dimension(1)); + const size_t in_b_stride = rhs->info()->strides_in_bytes()[1] / data_size_from_type(rhs->info()->data_type()); + const size_t out_stride1 = dst->info()->strides_in_bytes()[1] / data_size_from_type(dst->info()->data_type()); + const size_t out_stride2 = out_stride1 * 2; + const size_t out_stride3 = out_stride1 * 3; + const int num_elems_matrix_b_x = rhs->info()->dimension(0); + + // Set step_x and step_y for matrix A. Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the dst matrix + Window win_a(window); + win_a.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, std::max(window.y().end() / 4, 1), 1)); + + Window win_b; + // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 + // This scenario can happen when the the matrix multiplication is used to perform a convolution operation + if(rhs->info()->num_dimensions() >= 3) + { + win_b = window; + } + // Set step_x and step_y for matrix B. Scale by a factor of 4 the X range as the input transposed matrix A has 4 times less the cols of the dst matrix + // The step along the x direction is 2 times the in_b_stride because for each iteration we compute 2 blocks of size 4x4 + win_b.set(Window::DimX, Window::Dimension(window.x().start() / 4, window.x().end() / 4, 2 * in_b_stride)); + win_b.set(Window::DimY, Window::Dimension(0, 0, 0)); + + Iterator ina(lhs, win_a); + Iterator inb(rhs, win_b); + Iterator out(dst, window); + + const bool multiply_alpha = !(helpers::float_ops::is_one(alpha)); + + const float32x4_t alpha_f32 = vdupq_n_f32(alpha); + + // The implementation assumes that the matrix A and Matrix B have been reshaped respectively with CpuGemmInterleave4x4 and CpuGemmTranspose1xW + // The reshaping of the matrices helps to have a cache friendly implementation and helps to avoid the data re-arrangements needed for computing 16x4 elements per iteration + // All the values needed for computing a single 4x4 block will be read from consecutive memory positions + execute_window_loop(window, [&](const Coordinates & id) + { + auto mtx_a0 = reinterpret_cast<const float *>(ina.ptr()); + auto mtx_b0 = reinterpret_cast<const float *>(inb.ptr()); + auto mtx_b1 = mtx_b0 + in_b_stride; + + float32x4_t acc00 = vdupq_n_f32(0.f); + float32x4_t acc10 = vdupq_n_f32(0.f); + float32x4_t acc20 = vdupq_n_f32(0.f); + float32x4_t acc30 = vdupq_n_f32(0.f); + + float32x4_t acc01 = vdupq_n_f32(0.f); + float32x4_t acc11 = vdupq_n_f32(0.f); + float32x4_t acc21 = vdupq_n_f32(0.f); + float32x4_t acc31 = vdupq_n_f32(0.f); + +#if __arm__ + asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0))); + asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0))); + asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1))); +#endif /* __arm__ */ + + auto mtx_b0_end_addr = mtx_b0 + num_elems_matrix_b_x; + for(; mtx_b0 <= (mtx_b0_end_addr - 32);) + { + float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0); + float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1); + float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2); + float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3); + + float32x4_t b00 = vld1q_f32(mtx_b0); + float32x4_t b10 = vld1q_f32(mtx_b1); + float32x4_t b01 = vld1q_f32(mtx_b0 + 4); + float32x4_t b11 = vld1q_f32(mtx_b1 + 4); + +#if __arm__ + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1))); +#endif /* __arm__ */ + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b00, a0); + acc10 = vmlaq_f32(acc10, b00, a1); + acc20 = vmlaq_f32(acc20, b00, a2); + acc30 = vmlaq_f32(acc30, b00, a3); + + float32x4_t a4 = vld1q_dup_f32(mtx_a0 + 4); + float32x4_t a5 = vld1q_dup_f32(mtx_a0 + 5); + float32x4_t a6 = vld1q_dup_f32(mtx_a0 + 6); + float32x4_t a7 = vld1q_dup_f32(mtx_a0 + 7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b10, a0); + acc11 = vmlaq_f32(acc11, b10, a1); + acc21 = vmlaq_f32(acc21, b10, a2); + acc31 = vmlaq_f32(acc31, b10, a3); + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b01, a4); + acc10 = vmlaq_f32(acc10, b01, a5); + acc20 = vmlaq_f32(acc20, b01, a6); + acc30 = vmlaq_f32(acc30, b01, a7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b11, a4); + acc11 = vmlaq_f32(acc11, b11, a5); + acc21 = vmlaq_f32(acc21, b11, a6); + acc31 = vmlaq_f32(acc31, b11, a7); + + mtx_a0 += 8; + mtx_b0 += 8; + mtx_b1 += 8; + + a0 = vld1q_dup_f32(mtx_a0 + 0); + a1 = vld1q_dup_f32(mtx_a0 + 1); + a2 = vld1q_dup_f32(mtx_a0 + 2); + a3 = vld1q_dup_f32(mtx_a0 + 3); + + b00 = vld1q_f32(mtx_b0); + b10 = vld1q_f32(mtx_b1); + b01 = vld1q_f32(mtx_b0 + 4); + b11 = vld1q_f32(mtx_b1 + 4); + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b00, a0); + acc10 = vmlaq_f32(acc10, b00, a1); + acc20 = vmlaq_f32(acc20, b00, a2); + acc30 = vmlaq_f32(acc30, b00, a3); + + a4 = vld1q_dup_f32(mtx_a0 + 4); + a5 = vld1q_dup_f32(mtx_a0 + 5); + a6 = vld1q_dup_f32(mtx_a0 + 6); + a7 = vld1q_dup_f32(mtx_a0 + 7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b10, a0); + acc11 = vmlaq_f32(acc11, b10, a1); + acc21 = vmlaq_f32(acc21, b10, a2); + acc31 = vmlaq_f32(acc31, b10, a3); + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b01, a4); + acc10 = vmlaq_f32(acc10, b01, a5); + acc20 = vmlaq_f32(acc20, b01, a6); + acc30 = vmlaq_f32(acc30, b01, a7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b11, a4); + acc11 = vmlaq_f32(acc11, b11, a5); + acc21 = vmlaq_f32(acc21, b11, a6); + acc31 = vmlaq_f32(acc31, b11, a7); + + mtx_a0 += 8; + mtx_b0 += 8; + mtx_b1 += 8; + + a0 = vld1q_dup_f32(mtx_a0 + 0); + a1 = vld1q_dup_f32(mtx_a0 + 1); + a2 = vld1q_dup_f32(mtx_a0 + 2); + a3 = vld1q_dup_f32(mtx_a0 + 3); + b00 = vld1q_f32(mtx_b0); + b10 = vld1q_f32(mtx_b1); + b01 = vld1q_f32(mtx_b0 + 4); + b11 = vld1q_f32(mtx_b1 + 4); + +#if __arm__ + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1))); +#endif /* __arm__ */ + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b00, a0); + acc10 = vmlaq_f32(acc10, b00, a1); + acc20 = vmlaq_f32(acc20, b00, a2); + acc30 = vmlaq_f32(acc30, b00, a3); + + a4 = vld1q_dup_f32(mtx_a0 + 4); + a5 = vld1q_dup_f32(mtx_a0 + 5); + a6 = vld1q_dup_f32(mtx_a0 + 6); + a7 = vld1q_dup_f32(mtx_a0 + 7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b10, a0); + acc11 = vmlaq_f32(acc11, b10, a1); + acc21 = vmlaq_f32(acc21, b10, a2); + acc31 = vmlaq_f32(acc31, b10, a3); + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b01, a4); + acc10 = vmlaq_f32(acc10, b01, a5); + acc20 = vmlaq_f32(acc20, b01, a6); + acc30 = vmlaq_f32(acc30, b01, a7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b11, a4); + acc11 = vmlaq_f32(acc11, b11, a5); + acc21 = vmlaq_f32(acc21, b11, a6); + acc31 = vmlaq_f32(acc31, b11, a7); + + mtx_a0 += 8; + mtx_b0 += 8; + mtx_b1 += 8; + + a0 = vld1q_dup_f32(mtx_a0 + 0); + a1 = vld1q_dup_f32(mtx_a0 + 1); + a2 = vld1q_dup_f32(mtx_a0 + 2); + a3 = vld1q_dup_f32(mtx_a0 + 3); + b00 = vld1q_f32(mtx_b0); + b10 = vld1q_f32(mtx_b1); + b01 = vld1q_f32(mtx_b0 + 4); + b11 = vld1q_f32(mtx_b1 + 4); + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b00, a0); + acc10 = vmlaq_f32(acc10, b00, a1); + acc20 = vmlaq_f32(acc20, b00, a2); + acc30 = vmlaq_f32(acc30, b00, a3); + + a4 = vld1q_dup_f32(mtx_a0 + 4); + a5 = vld1q_dup_f32(mtx_a0 + 5); + a6 = vld1q_dup_f32(mtx_a0 + 6); + a7 = vld1q_dup_f32(mtx_a0 + 7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b10, a0); + acc11 = vmlaq_f32(acc11, b10, a1); + acc21 = vmlaq_f32(acc21, b10, a2); + acc31 = vmlaq_f32(acc31, b10, a3); + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b01, a4); + acc10 = vmlaq_f32(acc10, b01, a5); + acc20 = vmlaq_f32(acc20, b01, a6); + acc30 = vmlaq_f32(acc30, b01, a7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b11, a4); + acc11 = vmlaq_f32(acc11, b11, a5); + acc21 = vmlaq_f32(acc21, b11, a6); + acc31 = vmlaq_f32(acc31, b11, a7); + + mtx_a0 += 8; + mtx_b0 += 8; + mtx_b1 += 8; + } + + for(; mtx_b0 < mtx_b0_end_addr;) + { + float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0); + float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1); + float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2); + float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3); + float32x4_t b00 = vld1q_f32(mtx_b0); + float32x4_t b10 = vld1q_f32(mtx_b1); + +#if __arm__ + asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0))); + asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0))); + asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1))); +#endif /* __arm__ */ + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b00, a0); + acc10 = vmlaq_f32(acc10, b00, a1); + acc20 = vmlaq_f32(acc20, b00, a2); + acc30 = vmlaq_f32(acc30, b00, a3); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b10, a0); + acc11 = vmlaq_f32(acc11, b10, a1); + acc21 = vmlaq_f32(acc21, b10, a2); + acc31 = vmlaq_f32(acc31, b10, a3); + + mtx_a0 += 4; + mtx_b0 += 4; + mtx_b1 += 4; + } + + // Multiply by the weight of matrix product (alpha) + if(multiply_alpha) + { + acc00 = vmulq_f32(acc00, alpha_f32); + acc10 = vmulq_f32(acc10, alpha_f32); + acc20 = vmulq_f32(acc20, alpha_f32); + acc30 = vmulq_f32(acc30, alpha_f32); + acc01 = vmulq_f32(acc01, alpha_f32); + acc11 = vmulq_f32(acc11, alpha_f32); + acc21 = vmulq_f32(acc21, alpha_f32); + acc31 = vmulq_f32(acc31, alpha_f32); + } + + const auto mtx_out0 = reinterpret_cast<float *>(out.ptr()); + const auto mtx_out1 = mtx_out0 + 4; + + if(id.x() < (out_width - 8)) + { + vst1q_f32(mtx_out0, acc00); + vst1q_f32(mtx_out1, acc01); + if(id.y() + 1 < out_height) + { + vst1q_f32(mtx_out0 + out_stride1, acc10); + vst1q_f32(mtx_out1 + out_stride1, acc11); + if(id.y() + 2 < out_height) + { + vst1q_f32(mtx_out0 + out_stride2, acc20); + vst1q_f32(mtx_out1 + out_stride2, acc21); + if(id.y() + 3 < out_height) + { + vst1q_f32(mtx_out0 + out_stride3, acc30); + vst1q_f32(mtx_out1 + out_stride3, acc31); + } + } + } + } + else if(id.x() < (out_width - 4)) + { + vst1q_f32(mtx_out0, acc00); + if(id.y() + 1 < out_height) + { + vst1q_f32(mtx_out0 + out_stride1, acc10); + if(id.y() + 2 < out_height) + { + vst1q_f32(mtx_out0 + out_stride2, acc20); + if(id.y() + 3 < out_height) + { + vst1q_f32(mtx_out0 + out_stride3, acc30); + } + } + } + // Left-over columns + const int columns_left = out_width - id.x() - 4; + for(auto x = 0; x < columns_left; ++x) + { + *(mtx_out1 + x) = acc01[x]; + if(id.y() + 1 < out_height) + { + *(mtx_out1 + x + out_stride1) = acc11[x]; + if(id.y() + 2 < out_height) + { + *(mtx_out1 + x + out_stride2) = acc21[x]; + if(id.y() + 3 < out_height) + { + *(mtx_out1 + x + out_stride3) = acc31[x]; + } + } + } + } + } + else + { + // Left-over columns + const int columns_left = out_width - id.x(); + for(int x = 0; x < columns_left; ++x) + { + *(mtx_out0 + x) = acc00[x]; + if(id.y() + 1 < out_height) + { + *(mtx_out0 + x + out_stride1) = acc10[x]; + if(id.y() + 2 < out_height) + { + *(mtx_out0 + x + out_stride2) = acc20[x]; + if(id.y() + 3 < out_height) + { + *(mtx_out0 + x + out_stride3) = acc30[x]; + } + } + } + } + } + }, + ina, inb, out); +} + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +void matrix_matrix_multiply_f16(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha) +{ + ARM_COMPUTE_UNUSED(info); + const int out_width = static_cast<int>(dst->info()->dimension(0)); + const int out_height = static_cast<int>(dst->info()->dimension(1)); + const size_t in_b_stride = rhs->info()->strides_in_bytes()[1] / data_size_from_type(rhs->info()->data_type()); + const size_t out_stride = dst->info()->strides_in_bytes()[1] / data_size_from_type(dst->info()->data_type()); + const int num_elems_matrix_b_x = rhs->info()->dimension(0); + + // Set step_x and step_y for matrix A. Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the dst matrix + Window win_a(window); + win_a.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, std::max(window.y().end() / 4, 1), 1)); + + Window win_b; + // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 + // This scenario can happen when the the matrix multiplication is used to perform a convolution operation + if(rhs->info()->num_dimensions() >= 3) + { + win_b = window; + } + // Set step_x and step_y for matrix B. Scale by a factor of 8 the X range as the input transposed matrix A has 8 times less the cols of the dst matrix + win_b.set(Window::DimX, Window::Dimension(window.x().start() / 8, window.x().end() / 8, in_b_stride)); + win_b.set(Window::DimY, Window::Dimension(0, 1, 0)); + + Iterator ina(lhs, win_a); + Iterator inb(rhs, win_b); + Iterator out(dst, window); + + const bool multiply_alpha = !(helpers::float_ops::is_one(alpha)); + + const float16x8_t alpha_f16 = vdupq_n_f16(alpha); + + execute_window_loop(window, [&](const Coordinates & id) + { + const auto *mtx_a0 = reinterpret_cast<const float16_t *>(ina.ptr()); + const auto *mtx_b0 = reinterpret_cast<const float16_t *>(inb.ptr()); + auto *mtx_out = reinterpret_cast<float16_t *>(out.ptr()); + float16x8x4_t c = + { + { + vdupq_n_f16(0.f), + vdupq_n_f16(0.f), + vdupq_n_f16(0.f), + vdupq_n_f16(0.f) + } + }; + + /* + This kernel puts the values in a 4x4 block of Matrix A on the same row (Interleaved values) + |a00 a01 a02 a03 | a04 a05 a06 a07| + |a10 a11 a12 a13 | a14 a15 a16 a17| + |a20 a21 a22 a23 | a24 a25 a26 a27| = | a00 a10 a20 a30 || a01 a11 a21 a31 || a02 a12 a22 a32 || a03 a13 a23 a33 | a40 a50 a60 a70 | ... + |a30 a31 a32 a33 | a34 a35 a36 a37| | a04 a14 a24 a34 || a05 a15 a25 a35 || a06 a15 a26 a36 || a07 a17 a27 a37 | a44 a54 a64 a74 | ... + |a40 a41 a42 a43 | a44 a45 a46 a47| + |a50 a51 a52 a53 | a54 a55 a56 a57| + |a60 a61 a62 a63 | a64 a65 a66 a67| + |a70 a71 a72 a73 | a74 a75 a76 a77| + + After this operation, the dst matrix will have the following shape: [ height * 4, width / 4 ] + + B Matrix has been transposed as shown below + + |b00 b01 b02 b03 b04 b05 b06 b07| + |b10 b11 b12 b13 b14 b15 b16 b17| + |b20 b21 b22 b23 b24 b25 b26 b27| + |b30 b31 b32 b33 b34 b35 b36 b37| + -------------------> + + |b00 b01 b02 b03 b04 b05 b06 b07||b10 b11 b12 b13 b14 b15 b16 b17||b20 b21 b22 b23 b24 b25 b26 b27||b30 b31 b32 b33 b34 b35 b36 b37| + + c.val[0][0] = a00*b00 + a01*b10 + a02*b20 + a03*b30 + c.val[0][1] = a00*b01 + a01*b11 + a02*b21 + a03*b31 + + The size of the dst tensor's XY-plane must be the following shape [ width * 8, height / 8 ]. All other dimensions must have the same size. + */ + const float16_t *mtx_b0_end_addr = mtx_b0 + num_elems_matrix_b_x; + + for(; mtx_b0 <= (mtx_b0_end_addr - 32);) + + { + const float16x8_t p00 = vld1q_f16(mtx_a0); + const float16x8_t p02 = vld1q_f16(mtx_a0 + 8); + + const float16x8_t q00 = vld1q_f16(mtx_b0); + const float16x8_t q02 = vld1q_f16(mtx_b0 + 8); + const float16x8_t q04 = vld1q_f16(mtx_b0 + 16); + const float16x8_t q06 = vld1q_f16(mtx_b0 + 24); + + c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q00, vgetq_lane_f16(p00, 0))); + c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q00, vgetq_lane_f16(p00, 1))); + c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q00, vgetq_lane_f16(p00, 2))); + c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q00, vgetq_lane_f16(p00, 3))); + + c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q02, vgetq_lane_f16(p00, 4))); + c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q02, vgetq_lane_f16(p00, 5))); + c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q02, vgetq_lane_f16(p00, 6))); + c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q02, vgetq_lane_f16(p00, 7))); + + c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q04, vgetq_lane_f16(p02, 0))); + c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q04, vgetq_lane_f16(p02, 1))); + c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q04, vgetq_lane_f16(p02, 2))); + c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q04, vgetq_lane_f16(p02, 3))); + + c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q06, vgetq_lane_f16(p02, 4))); + c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q06, vgetq_lane_f16(p02, 5))); + c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q06, vgetq_lane_f16(p02, 6))); + c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q06, vgetq_lane_f16(p02, 7))); + + mtx_a0 += 16; + mtx_b0 += 32; + } + + for(; mtx_b0 < mtx_b0_end_addr;) + + { + const float16x4_t p00 = vld1_f16(mtx_a0); + const float16x8_t q00 = vld1q_f16(mtx_b0); + + c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q00, vget_lane_f16(p00, 0))); + c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q00, vget_lane_f16(p00, 1))); + c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q00, vget_lane_f16(p00, 2))); + c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q00, vget_lane_f16(p00, 3))); + + mtx_a0 += 4; + mtx_b0 += 8; + } + + if(multiply_alpha) + { + c.val[0] = vmulq_f16(c.val[0], alpha_f16); + c.val[1] = vmulq_f16(c.val[1], alpha_f16); + c.val[2] = vmulq_f16(c.val[2], alpha_f16); + c.val[3] = vmulq_f16(c.val[3], alpha_f16); + } + + if(id.x() < (out_width - 8)) + { + vst1q_f16(mtx_out, c.val[0]); + if(id.y() + 1 < out_height) + { + vst1q_f16(mtx_out + 1 * out_stride, c.val[1]); + if(id.y() + 2 < out_height) + { + vst1q_f16(mtx_out + 2 * out_stride, c.val[2]); + if(id.y() + 3 < out_height) + { + vst1q_f16(mtx_out + 3 * out_stride, c.val[3]); + } + } + } + } + else + { + // Left-over columns + const int columns_left = out_width - id.x(); + for(int x = 0; x < columns_left; ++x) + { + *(mtx_out + x) = c.val[0][x]; + if(id.y() + 1 < out_height) + { + *(mtx_out + x + 1 * out_stride) = c.val[1][x]; + if(id.y() + 2 < out_height) + { + *(mtx_out + x + 2 * out_stride) = c.val[2][x]; + if(id.y() + 3 < out_height) + { + *(mtx_out + x + 3 * out_stride) = c.val[3][x]; + } + } + } + } + } + }, + ina, inb, out); +} +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + +inline Status validate_arguments(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info) +{ + ARM_COMPUTE_UNUSED(alpha); + + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(lhs); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs, dst); + + if(!is_interleaved) + { + ARM_COMPUTE_RETURN_ERROR_ON(lhs->dimension(0) != rhs->dimension(1)); + + if(dst->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON(rhs->dimension(0) != dst->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON(lhs->dimension(1) != dst->dimension(1)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst); + } + } + else + { + const int m = reshape_info.m(); + const int n = reshape_info.n(); + const int k = reshape_info.k(); + const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width(); + const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height(); + + /* Interleave */ + TensorShape tensor_shape0{ lhs->tensor_shape() }; + tensor_shape0.set(0, k); + tensor_shape0.set(1, m); + + const TensorInfo tensor_info0 = lhs->clone()->set_tensor_shape(tensor_shape0); + const TensorInfo tensor_info_reshaped0 = lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_interleaved_shape(tensor_info0, mult_interleave4x4_height)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lhs, &tensor_info_reshaped0); + + if(n != 0) /* Transpose */ + { + TensorShape tensor_shape1{ rhs->tensor_shape() }; + tensor_shape1.set(0, n); + tensor_shape1.set(1, k); + + const TensorInfo tensor_info1 = rhs->clone()->set_tensor_shape(tensor_shape1); + const TensorInfo tensor_info_reshaped1 = rhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transpose1xW_with_element_size_shape(tensor_info1, mult_transpose1xW_width)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(rhs, &tensor_info_reshaped1); + } + + if(dst->total_size() != 0) + { + if(n != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON(dst->dimension(0) != static_cast<size_t>(n)); + } + ARM_COMPUTE_RETURN_ERROR_ON(dst->dimension(1) != static_cast<size_t>(m)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst); + } + } + + return Status{}; +} +} // namespace + +void CpuGemmMatrixMultiplyKernel::configure(const ITensorInfo *lhs, const ITensorInfo *rhs, ITensorInfo *dst, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); + + // dst tensor auto inizialitation if not yet initialized + TensorShape tensor_shape{ lhs->tensor_shape() }; + tensor_shape.set(0, is_interleaved ? reshape_info.n() : rhs->dimension(0)); + tensor_shape.set(1, is_interleaved ? reshape_info.m() : lhs->dimension(1)); + + auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(tensor_shape)); + + // Perform validate step + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(lhs, rhs, dst, alpha, is_interleaved, reshape_info)); + + _alpha = alpha; + + // Configure kernel window + Window win{}; + + // Check if the dst tensor is a vector. If so,the kernel runs the vector-matrix multiplication + const bool is_dst_vector = (dst->dimension(1) == 1); + if(is_dst_vector) + { + const unsigned int num_elems_processed_per_iteration_x = (lhs->data_type() == DataType::F32) ? 16 : 32; + + win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x)); + } + else + { + constexpr unsigned int num_elems_processed_per_iteration_x = 8; + constexpr unsigned int num_elems_processed_per_iteration_y = 4; + + win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + } + + switch(lhs->data_type()) + { + case DataType::F32: + { + _func = (is_dst_vector) ? vector_matrix_multiply_f32 : matrix_matrix_multiply_f32; + break; + } +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + { + _func = (is_dst_vector) ? vector_matrix_multiply_f16 : matrix_matrix_multiply_f16; + break; + } +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + default: + { + ARM_COMPUTE_ERROR("Data type not supported"); + break; + } + } + ICPPKernel::configure(win); +} + +Status CpuGemmMatrixMultiplyKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, float alpha, bool is_interleaved, + const GEMMReshapeInfo &reshape_info) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(lhs, rhs, dst, alpha, is_interleaved, reshape_info)); + + return Status{}; +} + +void CpuGemmMatrixMultiplyKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + ARM_COMPUTE_ERROR_ON(tensors.empty()); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + + const ITensor *lhs = tensors.get_const_tensor(TensorType::ACL_SRC_0); + const ITensor *rhs = tensors.get_const_tensor(TensorType::ACL_SRC_1); + ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); + + (*_func)(lhs, rhs, dst, window, info, _alpha); +} + +const char *CpuGemmMatrixMultiplyKernel::name() const +{ + return "CpuGemmMatrixMultiplyKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h b/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h new file mode 100644 index 0000000000..0b4e01579c --- /dev/null +++ b/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_GEMM_MATRIX_MULTIPLY_KERNEL_H +#define ARM_COMPUTE_CPU_GEMM_MATRIX_MULTIPLY_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Kernel to multiply two input matrices "A" and "B". All elements of the output matrix/vector will be multiplied by alpha after the matrix multiplication + * + * @note If the output tensor is a matrix, the implementation assumes that the input tensors @p lhs and @p rhs are both matrices and reshaped respectively with @ref CpuGemmInterleave4x4Kernel" and @ref CpuGemmTranspose1xWKernel + * @note If the output tensor is a vector and the data type is F32, the implementation assumes that the first input tensor @p lhs is a vector and the second input tensor @p rhs a matrix. The implementation also assumes that both tensors have not been reshaped + * + */ +class CpuGemmMatrixMultiplyKernel : public ICpuKernel +{ +public: + CpuGemmMatrixMultiplyKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmMatrixMultiplyKernel); + /** Initialise the kernel's input and output. + * + * @note If the output tensor is a matrix, the input matrices @p lhs and @p rhs should be the output of the kernels: @ref CpuGemmInterleave4x4Kernel and @ref CpuGemmTranspose1xWKernel + * These two kernels change the layout of the original matrices to be more cache-friendly. + * + * @param[in] lhs Left-handside tensor info containing the interleaved Matrix A or the vector A. Data types supported: F16/F32 + * @param[in] rhs Right-handside tensor info containing the transposed Matrix B if the first input tensor A is not a vector. + * If the output tensor is a vector, rhs must contain the matrix B not reshaped. Data type supported: same as @p lhs + * @param[out] dst Output tensor to store the result of matrix multiplication. Data type supported: same as @p lhs. + * @param[in] alpha Weight of the matrix product + * @param[in] is_interleaved (Optional) True if lhs and rhs have been reshaped respectively using @ref CpuGemmInterleave4x4Kernel and @ref CpuGemmTranspose1xWKernel + * @param[in] reshape_info (Optional) GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how @p lhs and @p rhs have been reshaped + */ + void configure(const ITensorInfo *lhs, const ITensorInfo *rhs, ITensorInfo *dst, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref CpuGemmMatrixMultiplyKernel + * + * Similar to @ref CpuGemmMatrixMultiplyKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + /** Common signature for all the matrix multiply functions + * + * @param[in] lhs Left-handside input tensor. Data types supported: F16/F32 + * @param[in] rhs Right-handside input tensor. Data types supported: same as @p lhs + * @param[out] dst The output tensor. Data type supported: same as @p rhs + * @param[in] window Region on which to execute the kernel. + * @param[in] info Thread info metadata. + * @param[in] alpha Weight of the matrix product. + */ + using GemmFunctionPtr = void(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha); + /** Matrix multiply function to use for the particular tensor types passed to configure() */ + GemmFunctionPtr *_func{ nullptr }; + float _alpha{ 1.f }; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_GEMM_MATRIX_MULTIPLY_KERNEL_H */ diff --git a/src/cpu/kernels/CpuGemmTranspose1xWKernel.cpp b/src/cpu/kernels/CpuGemmTranspose1xWKernel.cpp new file mode 100644 index 0000000000..62d5d5f5e9 --- /dev/null +++ b/src/cpu/kernels/CpuGemmTranspose1xWKernel.cpp @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2016-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuGemmTranspose1xWKernel.h" + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +using namespace arm_compute::misc::shape_calculator; + +void CpuGemmTranspose1xWKernel::configure(const ITensorInfo *src, ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + + // Output tensor auto inizialitation if not yet initialized + auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*src))); + + // Perform validate step + ARM_COMPUTE_ERROR_THROW_ON(CpuGemmTranspose1xWKernel::validate(src, dst)); + + const size_t vector_size = 16 / src->element_size(); + + // Configure kernel window + Window win = calculate_max_window(*src, Steps(vector_size)); + ICPPKernel::configure(win); +} + +Status CpuGemmTranspose1xWKernel::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); + ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); + //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions. + + if(dst->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), compute_transpose1xW_with_element_size_shape(*src)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); + } + + return Status{}; +} + +void CpuGemmTranspose1xWKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + ARM_COMPUTE_ERROR_ON(tensors.empty()); + + /* + * Following an example of how the transposition1xW works when the src data type is F32 + * + * |a00 a01 a02 a03| + * |a10 a11 a12 a13| + * |a20 a21 a22 a23| = | a00 a01 a02 a03 || a10 a11 a12 a13 || a20 a21 a22 a23 || a30 a31 a32 a33 | + * |a30 a31 a32 a33| + * + * The dst matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor) + */ + + // Set window for dst tensor. Set to 0 the X and Y dimensions in order to allow multi-threading implementation and future batched matrix multiplications + Window win_out(window); + win_out.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_out.set(Window::DimY, Window::Dimension(0, 0, 0)); + + const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC); + ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); + + Iterator in(src, window); + Iterator out(dst, win_out); + + const size_t in_width = src->info()->dimension(0); + const size_t element_size = src->info()->element_size(); + const size_t out_stride = dst->info()->strides_in_bytes()[1]; + const size_t vector_size = 16 / element_size; + + execute_window_loop(window, [&](const Coordinates & id) + { + const uint8_t *in_ptr = in.ptr(); + uint8_t *const out_ptr = out.ptr() + (id.y() * vector_size) * element_size + (id.x() / vector_size) * out_stride; + + for(size_t k = 0; k < vector_size; ++k) + { + // If the src width is not multiple of W, we fill the reference with 0s + if((id.x() + k) >= in_width) + { + std::memset(out_ptr + k * element_size, 0, element_size); + } + else + { + std::memcpy(out_ptr + k * element_size, in_ptr + k * element_size, element_size); + } + } + }, + in, out); +} + +const char *CpuGemmTranspose1xWKernel::name() const +{ + return "CpuGemmTranspose1xWKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuGemmTranspose1xWKernel.h b/src/cpu/kernels/CpuGemmTranspose1xWKernel.h new file mode 100644 index 0000000000..de920b5ed7 --- /dev/null +++ b/src/cpu/kernels/CpuGemmTranspose1xWKernel.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2016-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_GEMM_TRANSPOSE1xW_KERNEL_H +#define ARM_COMPUTE_CPU_GEMM_TRANSPOSE1xW_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Kernel which transposes the elements of a matrix in chunks of 1xW, where W is equal to (16 / element size of the tensor) + * + * Following an example of how the transposition1xW works when the input data is F32 + * + * @f[ + * \left( \begin{array}{cccc} + * a00 & a01 & a02 & a03 \\ + * a10 & a11 & a12 & a13 \\ + * a20 & a21 & a22 & a23 \\ + * a30 & a31 & a32 & a33 \\ + * \end{array} \right) + * \rightarrow + * \left( \begin{array}{ccccccccccccccccc} + * a00 & a01 & a02 & a03 & a10 & a11 & a12 & a13 & a20 & a21 & a22 & a23 & a30 & a31 & a32 & a33 \\ + * \end{array} \right) + * @f] + * + * Following an example of how the transposition1xW works when the input data type is F16 + * + * @f[ + * \left( \begin{array}{cccccccc} + * a00 & a01 & a02 & a03 & a04 & a05 & a06 & a07 \\ + * a10 & a11 & a12 & a13 & a14 & a15 & a16 & a17 \\ + * a20 & a21 & a22 & a23 & a24 & a25 & a26 & a27 \\ + * a30 & a31 & a32 & a33 & a34 & a35 & a36 & a37 \\ + * \end{array} \right) + * \rightarrow + * \left( \begin{array}{cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc} + * a00 & a01 & a02 & a03 & a04 & a05 & a06 & a07 & a10 & a11 & a12 & a13 & a14 & a15 & a16 & a17 & a20 & a21 & a22 & a23 & a24 & a25 & a26 & a27 & a30 & a31 & a32 & a33 & a34 & a35 & a36 & a37\\ + * \end{array} \right) + * @f] + * + * @note The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor) + * + */ +class CpuGemmTranspose1xWKernel : public ICpuKernel +{ +public: + CpuGemmTranspose1xWKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmTranspose1xWKernel); + /** Configure kernel for a given list of arguments + * + * @param[in] src Input tensor info. Data types supported: All + * @param[out] dst Output tensor info. Data type supported: same as @p src. + */ + void configure(const ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration of @ref CpuGemmTranspose1xWKernel + * + * Similar to @ref CpuGemmTranspose1xWKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_GEMM_TRANSPOSE1xW_KERNEL_H */ diff --git a/src/cpu/kernels/CpuIm2ColKernel.cpp b/src/cpu/kernels/CpuIm2ColKernel.cpp new file mode 100644 index 0000000000..13764c49d1 --- /dev/null +++ b/src/cpu/kernels/CpuIm2ColKernel.cpp @@ -0,0 +1,448 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuIm2ColKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Size2D.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include "arm_compute/core/utils/misc/ShapeCalculator.h" + +#include <arm_neon.h> +#include <cstddef> +#include <cstdint> +#include <cstring> +#include <tuple> + +namespace arm_compute +{ +using namespace misc::shape_calculator; +namespace cpu +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, + bool has_bias, const Size2D &dilation, unsigned int num_groups) +{ + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(input->data_type()) && has_bias); + ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups > 1, "Number of groups greater than one are not supported on Neon"); + + // Since there's no implicit padding added, check the total input spatial dimensions (with conv paddings) are big enough for the kernel dimensions + const unsigned int width_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); + const unsigned int height_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); + const unsigned total_width = input->dimension(width_idx) + conv_info.pad_left() + conv_info.pad_right(); + const unsigned total_height = input->dimension(height_idx) + conv_info.pad_top() + conv_info.pad_bottom(); + ARM_COMPUTE_RETURN_ERROR_ON((total_width < kernel_dims.width) || (total_height < kernel_dims.height)); + + if(output->total_size() > 0) + { + TensorInfo expected_output = output->clone()->set_tensor_shape(compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, false)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); + } + + return Status{}; +} + +template <typename T, bool has_pads> +inline void linearize_volume_nchw(const uint8_t *const in_ptr, + T *out_ptr, + bool has_bias, + int top_left_x, + int top_left_y, + int kernel_width, + int kernel_height, + int kernel_depth, + int input_w, + int input_h, + int input_stride_x, + int input_stride_y, + int input_stride_z, + int pad_value, + int dilation_x, + int dilation_y) +{ + const int kernel_size2 = kernel_width * kernel_height; + const int x_e = top_left_x + kernel_width * dilation_x; + const int y_e = top_left_y + kernel_height * dilation_y; + + // Linearize volume + int d = 0; + // This for loop linearize a volume with 3 slices. This allows: + // 1) to reduce the iterations of the outer for loop "d" + // 2) to have an optimized im2col for the first convolution layer where usually we have 3 IFMs + for(; d <= (kernel_depth - 3); d += 3) + { + for(int y = top_left_y; y < y_e; y += dilation_y) + { + if((y < 0 || y >= input_h) && has_pads) + { + // All the values will be the offset (will be zeros when not quantized) + for(int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr) + { + *(out_ptr + 0 * kernel_size2) = pad_value; + *(out_ptr + 1 * kernel_size2) = pad_value; + *(out_ptr + 2 * kernel_size2) = pad_value; + } + } + else + { + for(int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr) + { + if((x < 0 || x >= input_w) && has_pads) + { + *(out_ptr + 0 * kernel_size2) = pad_value; + *(out_ptr + 1 * kernel_size2) = pad_value; + *(out_ptr + 2 * kernel_size2) = pad_value; + } + else + { + *(out_ptr + 0 * kernel_size2) = *(reinterpret_cast<const T *>(in_ptr + ((d + 0) * input_stride_z + y * input_stride_y + x * input_stride_x))); + *(out_ptr + 1 * kernel_size2) = *(reinterpret_cast<const T *>(in_ptr + ((d + 1) * input_stride_z + y * input_stride_y + x * input_stride_x))); + *(out_ptr + 2 * kernel_size2) = *(reinterpret_cast<const T *>(in_ptr + ((d + 2) * input_stride_z + y * input_stride_y + x * input_stride_x))); + } + } + } + } + out_ptr += 2 * kernel_size2; + } + + // Left over + for(; d < kernel_depth; d++) + { + for(int y = top_left_y; y < y_e; y += dilation_y) + { + if((y < 0 || y >= input_h) && has_pads) + { + // All the values will be the offset (will be zeros when not quantized) + memset(static_cast<void *>(out_ptr), pad_value, kernel_width * sizeof(T)); + out_ptr += kernel_width; + } + else + { + for(int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr) + { + if((x < 0 || x >= input_w) && has_pads) + { + *out_ptr = pad_value; + } + else + { + *out_ptr = *(reinterpret_cast<const T *>(in_ptr + (d * input_stride_z + y * input_stride_y + x * input_stride_x))); + } + } + } + } + } + + // Append 1 if the convolution layer has biases + if(has_bias) + { + *out_ptr = static_cast<T>(1); + } +} + +template <typename T, bool has_pads> +inline void linearize_volume_nhwc(const uint8_t *const in_ptr, + T *out_ptr, + bool has_bias, + int start_x, + int start_y, + int kernel_width, + int kernel_height, + int input_w, + int input_h, + int input_c, + int input_stride_y, + int input_stride_z, + int pad_value, + int dilation_x, + int dilation_y) +{ + const int end_x = start_x + kernel_width * dilation_x; + const int end_y = start_y + kernel_height * dilation_y; + const int pad_quant = kernel_width * input_c; + const int element_size = static_cast<int>(sizeof(T)); + if((start_y >= 0) && (end_y < input_h) && (start_x >= 0) && (end_x < input_w) && (dilation_x == 1) && (input_stride_y == input_c * element_size)) + { + for(int y = start_y; y < end_y; y += dilation_y) + { + //optimized for no dilation and no boundary pixels + memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + start_x * input_stride_y)), input_c * kernel_width * element_size); + out_ptr += input_c * kernel_width; + } + } + else + { + for(int y = start_y; y < end_y; y += dilation_y) + { + if(y < 0 || y >= input_h) + { + memset(static_cast<void *>(out_ptr), pad_value, pad_quant * element_size); + out_ptr += pad_quant; + } + else if(dilation_x > 1 || start_x < 0 || end_x >= input_w || input_stride_y != input_c * element_size) + { + for(int x = start_x; x < end_x; x += dilation_x) + { + if(x < 0 || x >= input_w) + { + memset(static_cast<void *>(out_ptr), pad_value, input_c * element_size); + out_ptr += input_c; + } + else + { + memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + x * input_stride_y)), input_c * element_size); + out_ptr += input_c; + } + } + } + else + { + //optimized for no dilation and no boundary pixels + memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + start_x * input_stride_y)), input_c * kernel_width * element_size); + out_ptr += input_c * kernel_width; + } + } + } + // Append 1 if the convolution layer has biases + if(has_bias) + { + *out_ptr = static_cast<T>(1); + } +} +} // namespace + +template <typename T, bool has_pads, bool is_nchw> +void CpuIm2ColKernel::run_im2col(const ITensor *src, ITensor *dst, const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + const unsigned int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); + const unsigned int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); + const unsigned int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); + + const int input_w = src->info()->dimension(width_idx); + const int input_h = src->info()->dimension(height_idx); + const int input_c = src->info()->dimension(channel_idx); + const int input_stride_x = src->info()->strides_in_bytes().x(); + const int input_stride_y = src->info()->strides_in_bytes().y(); + const int input_stride_z = src->info()->strides_in_bytes().z(); + const int pad_left = _conv_info.pad_left(); + const int pad_top = _conv_info.pad_top(); + const int stride_x = _conv_info.stride().first; + const int stride_y = _conv_info.stride().second; + const int pad_value = is_data_type_quantized(src->info()->data_type()) ? src->info()->quantization_info().uniform().offset : 0; + + Window window_in_out(window); + // The first three dimensions of the input and output are increased by the inner loops + window_in_out.set(Window::DimX, Window::Dimension(0, 0, 0)); + window_in_out.set(Window::DimY, Window::Dimension(0, 0, 0)); + window_in_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + // Create iterators + Iterator in(src, window_in_out); + Iterator out(dst, window_in_out); + + execute_window_loop(window, [&](const Coordinates & id) + { + const int start_w = id[width_idx] * stride_x - pad_left; + const int start_h = id[height_idx] * stride_y - pad_top; + + // Get pointers + const uint8_t *const input_ptr = in.ptr(); + auto output_ptr = reinterpret_cast<T *>(out.ptr() + (id[width_idx] + id[height_idx] * _convolved_dims.first) * dst->info()->strides_in_bytes().y()); + + // Linearize volume + if(is_nchw) + { + linearize_volume_nchw<T, has_pads>(input_ptr, + output_ptr, + _has_bias, + start_w, + start_h, + _kernel_width, + _kernel_height, + input_c, + input_w, + input_h, + input_stride_x, + input_stride_y, + input_stride_z, + pad_value, + _dilation.x(), + _dilation.y()); + } + else + { + linearize_volume_nhwc<T, has_pads>(input_ptr, + output_ptr, + _has_bias, + start_w, + start_h, + _kernel_width, + _kernel_height, + input_w, + input_h, + input_c, + input_stride_y, + input_stride_z, + pad_value, + _dilation.x(), + _dilation.y()); + } + }, + in, out); +} + +void CpuIm2ColKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, + bool has_bias, const Size2D &dilation, unsigned int num_groups) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups)); + ARM_COMPUTE_UNUSED(num_groups); + + _data_layout = src->data_layout(); + const unsigned int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); + const unsigned int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); + const unsigned int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); + + _conv_info = conv_info; + _kernel_width = kernel_dims.width; + _kernel_height = kernel_dims.height; + _dilation = dilation; + _convolved_dims = scaled_dimensions(src->dimension(width_idx), dst->dimension(height_idx), + _kernel_width, _kernel_height, + _conv_info, _dilation); + _has_bias = has_bias; + + if(_data_layout == DataLayout::NCHW) + { + switch(src->data_type()) + { + case DataType::F32: + _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<float, false, true> : &CpuIm2ColKernel::run_im2col<float, true, true>; + break; +#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) + case DataType::BFLOAT16: + _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<bfloat16, false, true> : &CpuIm2ColKernel::run_im2col<bfloat16, true, true>; + break; +#endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */ +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<float16_t, false, true> : &CpuIm2ColKernel::run_im2col<float16_t, true, true>; + break; +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + case DataType::QASYMM8_SIGNED: + case DataType::QASYMM8: + _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<qasymm8_t, false, true> : &CpuIm2ColKernel::run_im2col<qasymm8_t, true, true>; + break; + default: + ARM_COMPUTE_ERROR("Data type not supported"); + break; + } + } + else + { + switch(src->data_type()) + { + case DataType::F32: + _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<float, false, false> : &CpuIm2ColKernel::run_im2col<float, true, false>; + break; +#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) + case DataType::BFLOAT16: + _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<bfloat16, false, false> : &CpuIm2ColKernel::run_im2col<bfloat16, true, false>; + break; +#endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */ +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<float16_t, false, false> : &CpuIm2ColKernel::run_im2col<float16_t, true, false>; + break; +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + case DataType::QASYMM8: + _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<uint8_t, false, false> : &CpuIm2ColKernel::run_im2col<qasymm8_t, true, false>; + break; + case DataType::QASYMM8_SIGNED: + _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<int8_t, false, false> : &CpuIm2ColKernel::run_im2col<qasymm8_t, true, false>; + break; + default: + ARM_COMPUTE_ERROR("Data type not supported"); + break; + } + } + + // Output tensor auto initialization if not yet initialized + auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation, false))); + + std::pair<unsigned int, unsigned int> convolved_dims = scaled_dimensions(src->dimension(width_idx), src->dimension(height_idx), + kernel_dims.width, kernel_dims.height, + conv_info, dilation); + + Window win = calculate_max_window(*src, Steps()); + win.set(width_idx, Window::Dimension(0, convolved_dims.first, 1)); + win.set(height_idx, Window::Dimension(0, convolved_dims.second, 1)); + win.set(channel_idx, Window::Dimension(0, 1, 1)); + // Configure kernel window + ICpuKernel::configure(win); +} + +Status CpuIm2ColKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, + bool has_bias, const Size2D &dilation, unsigned int num_groups) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups)); + return Status{}; +} + +void CpuIm2ColKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + auto src = tensors.get_const_tensor(TensorType::ACL_SRC); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + (this->*_func)(src, dst, window); +} +const char *CpuIm2ColKernel::name() const +{ + return "CpuIm2ColKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute
\ No newline at end of file diff --git a/src/cpu/kernels/CpuIm2ColKernel.h b/src/cpu/kernels/CpuIm2ColKernel.h new file mode 100644 index 0000000000..fc8ae056bb --- /dev/null +++ b/src/cpu/kernels/CpuIm2ColKernel.h @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_IM2COL_KERNEL_H +#define ARM_COMPUTE_CPU_IM2COL_KERNEL_H + +#include "arm_compute/core/Size2D.h" +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +class ITensor; +namespace cpu +{ +namespace kernels +{ +/** Interface for the im2col reshape kernel. + * + * Rearranges image blocks into columns. It is used to strip out each convolution block to a single column. + * It is used to transform a convolution to a plain matrix multiplication. + * + * For example taking into account the image below and assuming 3x3 image blocks with stride of 1 we have: + * + * @f[ + * \left( \begin{array}{cccc} + * a00 & a01 & a02 & a03 \\ + * a10 & a11 & a12 & a13 \\ + * a20 & a21 & a22 & a23 \\ + * a30 & a31 & a32 & a33 \\ + * \end{array} \right) + * \rightarrow + * \left( \begin{array}{ccccccccc} + * a00 & a01 & a02 & a10 & a11 & a12 & a20 & a21 & a22 \\ + * a01 & a02 & a03 & a11 & a12 & a13 & a21 & a22 & a23 \\ + * a10 & a11 & a12 & a20 & a21 & a22 & a30 & a31 & a32 \\ + * a11 & a12 & a13 & a21 & a22 & a23 & a31 & a32 & a33 \\ + * \end{array} \right) + * @f] + */ +class CpuIm2ColKernel : public ICpuKernel +{ +public: + /** Default constructor */ + CpuIm2ColKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuIm2ColKernel); + /** Set the input and output of the kernel. + * + * @param[in] src The input tensor info to convert. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. + * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32 + * Note: QASYMM8/QASYMM8_SIGNED works only for has_bias = false + * @param[out] dst The output tensor info. Data types supported: Same as @p input + * @param[in] kernel_dims The kernel dimensions (width and height). + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + * @param[in] has_bias In case biases are provided expands the matrix with 1. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported + */ + void configure(const ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, + bool has_bias, const Size2D &dilation = Size2D(1U, 1U), unsigned int num_groups = 1); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuIm2ColKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, + bool has_bias, const Size2D &dilation = Size2D(1U, 1U), unsigned int num_groups = 1); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + /** Template function to run im2col + * + * @param[in] src The input tensor info + * @param[out] dst The output tensor info + * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()). + */ + template <typename T, bool has_pads, bool is_nchw> + void run_im2col(const ITensor *src, ITensor *dst, const Window &window); + + /** Common signature for all the specialised im2col functions + * + * @param[in] window Region on which to execute the kernel. + */ + using Im2ColFunctionPtr = void (CpuIm2ColKernel::*)(const ITensor *src, ITensor *dst, const Window &window); + + Im2ColFunctionPtr _func{ nullptr }; + std::pair<unsigned int, unsigned int> _convolved_dims{}; + PadStrideInfo _conv_info{}; + unsigned int _kernel_width{ 0 }; + unsigned int _kernel_height{ 0 }; + bool _has_bias{ false }; + Size2D _dilation{ 1U, 1U }; + DataLayout _data_layout{ DataLayout::UNKNOWN }; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /*ARM_COMPUTE_CPU_IM2COL_KERNEL_H */ diff --git a/src/cpu/kernels/CpuMulKernel.cpp b/src/cpu/kernels/CpuMulKernel.cpp new file mode 100644 index 0000000000..da7b6d7d66 --- /dev/null +++ b/src/cpu/kernels/CpuMulKernel.cpp @@ -0,0 +1,1729 @@ +/* + * Copyright (c) 2016-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuMulKernel.h" + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/NESymm.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +const float scale255_constant = 1.f / 255.f; +const float32x4_t scale255_constant_f32q = vdupq_n_f32(scale255_constant); +const float32x4_t positive_round_f32q = vdupq_n_f32(0.5f); + +inline Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy) +{ + ARM_COMPUTE_UNUSED(overflow_policy); + ARM_COMPUTE_UNUSED(rounding_policy); + + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src1); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, DataType::QSYMM16, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, DataType::QSYMM16, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::S16, DataType::QSYMM16, + DataType::S32, DataType::F16, DataType::F32); + if(is_data_type_quantized(src1->data_type()) || is_data_type_quantized(src2->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(overflow_policy == ConvertPolicy::WRAP, "ConvertPolicy cannot be WRAP if datatype is quantized"); + } + + if(dst->total_size() > 0) + { + const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); + // clang-format off + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !(src1->data_type() == src2->data_type() && src2->data_type() == dst->data_type()) && + !(src1->data_type() == DataType::U8 && src2->data_type() == DataType::U8 && dst->data_type() == DataType::S16) && + !(src1->data_type() == DataType::U8 && src2->data_type() == DataType::S16 && dst->data_type() == DataType::S16) && + !(src1->data_type() == DataType::S16 && src2->data_type() == DataType::U8 && dst->data_type() == DataType::S16) && + !(src1->data_type() == DataType::S16 && src2->data_type() == DataType::U8 && dst->data_type() == DataType::S16) && + !(src1->data_type() == DataType::QSYMM16 && src2->data_type() == DataType::QSYMM16 && dst->data_type() == DataType::S32) + , "Invalid data type combination"); + // clang-format on + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S16 && dst->data_type() == DataType::S32 && scale != 1.f, "Unsupported scale for QSYMM16 inputs and S32 dst"); + } + + if(std::abs(scale - scale255_constant) < 0.00001f) + { + ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_NEAREST_UP && rounding_policy != RoundingPolicy::TO_NEAREST_EVEN); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S32 && src2->data_type() == DataType::S32 && dst->data_type() == DataType::S32, + "Scale == 1/255 is not supported if input and dst are of data type S32"); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_ZERO); + + int exponent = 0; + const float normalized_mantissa = std::frexp(scale, &exponent); + + // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15 + // frexp returns 0.5 as mantissa which means that the exponent will be in the range of -1 <= e <= 14 + // Moreover, it will be negative as we deal with 1/2^n + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)), "Scale value not supported (Should be 1/(2^n) or 1/255"); + } + + return Status{}; +} + +/* Scales a given vector by 1/255. + * + * @note This does not work for all cases. e.g. for float of 0.49999999999999994 and large floats. + * + * @param in Input vector to scale. + * @return Scaled dst rounded to nearest (round half up). + */ +inline int32x4_t scale255_S32_S32(int32x4_t in) +{ + // Scale + const float32x4_t tmp = vmulq_f32(vcvtq_f32_s32(in), scale255_constant_f32q); + // Round to nearest (round half up) + // Add +0.5 for all values + // Afterwards vcvt rounds toward zero + return vcvtq_s32_f32(vaddq_f32(tmp, positive_round_f32q)); +} + +inline uint16x8_t scale255_U16_U16(uint16x8_t in) +{ + const int32x4_t tmp_s1 = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(in)))); + const int32x4_t tmp_s2 = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(in)))); + return vreinterpretq_u16_s16(vcombine_s16(vmovn_s32(tmp_s2), vmovn_s32(tmp_s1))); +} + +template <typename T> +inline typename std::enable_if<std::is_same<T, int8_t>::value, int8x16_t>::type +vquantize(float32x4x4_t val, const UniformQuantizationInfo &info) +{ + return vquantize_signed(val, info); +} + +template <typename T> +inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8x16_t>::type +vquantize(float32x4x4_t val, const UniformQuantizationInfo &info) +{ + return vquantize(val, info); +} + +template <typename T> +void mul_saturate_quantized_8(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale) +{ + // Create input windows + Window win = window; + Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const int window_step_x = 16 / sizeof(T); + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x(); + + const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform(); + const UniformQuantizationInfo tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset }; + + if(is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1; + const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); + const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator dst(out, win); + + using ExactTagType = typename wrapper::traits::neon_vector<T, window_step_x>::tag_type; + + execute_window_loop(win, [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<T *>(dst.ptr()); + + const auto broadcast_value = *reinterpret_cast<const T *>(broadcast_input.ptr()); + const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); + + // Dequantize inputs + const float32x4x4_t in1_f32x4x4 = vdequantize(non_broadcast_v, non_broadcast_qinfo); + const float32x4x4_t in2_f32x4x4 = vdequantize(broadcast_value_vec, broadcast_qinfo); + + const float32x4x4_t out_f32x4x4 = + { + vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]), + vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]), + vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]), + vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]), + }; + + // Quantize dst + const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info); + wrapper::vstore(output_ptr + x, result); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + // Dequantize inputs + const T src1 = *(non_broadcast_input_ptr + x); + const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(src1, non_broadcast_qinfo); + const float tmp_in2 = Qasymm8QuantizationHelper<T>::dequantize(broadcast_value, broadcast_qinfo); + const float tmp_f = tmp_in1 * tmp_in2; + + // Quantize dst + const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info); + *(output_ptr + x) = tmp_qua; + } + }, + broadcast_input, non_broadcast_input, dst); + } + else + { + const UniformQuantizationInfo input1_qua_info = src1->info()->quantization_info().uniform(); + const UniformQuantizationInfo input2_qua_info = src2->info()->quantization_info().uniform(); + + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src1, input1_win); + Iterator input2(src2, input2_win); + Iterator dst(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<T *>(dst.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto input1_q = wrapper::vloadq(input1_ptr + x); + const auto input2_q = wrapper::vloadq(input2_ptr + x); + + // Dequantize inputs + const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info); + const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info); + + const float32x4x4_t out_f32x4x4 = + { + vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]), + vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]), + vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]), + vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]), + }; + + // Quantize dst + const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info); + wrapper::vstore(output_ptr + x, result); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + // Dequantize inputs + const T src1 = *(input1_ptr + x); + const T src2 = *(input2_ptr + x); + const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(src1, input1_qua_info); + const float tmp_in2 = Qasymm8QuantizationHelper<T>::dequantize(src2, input2_qua_info); + const float tmp_f = tmp_in1 * tmp_in2; + + // Quantize dst + const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info); + *(output_ptr + x) = tmp_qua; + } + }, + input1, input2, dst); + } +} + +void mul_saturate_QSYMM16_QSYMM16_QSYMM16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale) +{ + const UniformQuantizationInfo input1_qua_info = src1->info()->quantization_info().uniform(); + const UniformQuantizationInfo input2_qua_info = src2->info()->quantization_info().uniform(); + const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform(); + + // Create input windows + Window win = window; + Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src1, input1_win); + Iterator input2(src2, input2_win); + Iterator dst(out, win); + + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + const UniformQuantizationInfo tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset }; + + execute_window_loop(win, [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<qsymm16_t *>(dst.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const qsymm16x8x2_t input1_q = + { + { + vld1q_s16(input1_ptr + x), + vld1q_s16(input1_ptr + x + 8), + } + }; + const qsymm16x8x2_t input2_q = + { + { + vld1q_s16(input2_ptr + x), + vld1q_s16(input2_ptr + x + 8), + } + }; + + // Dequantize inputs + const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info); + const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info); + + const float32x4x4_t out_f32x4x4 = + { + vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]), + vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]), + vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]), + vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]), + }; + + const qsymm16x8x2_t result = vquantize_qsymm16(out_f32x4x4, tmp_qua_info); + vst1q_s16(output_ptr + x, result.val[0]); + vst1q_s16(output_ptr + x + 8, result.val[1]); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + // Dequantize inputs + float tmp_in1 = static_cast<float>(*(input1_ptr + x)) * input1_qua_info.scale; + float tmp_in2 = static_cast<float>(*(input2_ptr + x)) * input2_qua_info.scale; + float tmp_f = tmp_in1 * tmp_in2; + + // Quantize dst, lrintf() has same rounding mode as vcombine_s16 + int32_t tmp = lrintf(tmp_f / tmp_qua_info.scale); + qsymm16_t tmp_qua = static_cast<qsymm16_t>(tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp); + *(output_ptr + x) = tmp_qua; + } + }, + input1, input2, dst); +} + +void mul_QSYMM16_QSYMM16_S32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int scale) +{ + ARM_COMPUTE_UNUSED(scale); + + // Create input windows + Window win = window; + Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src1, input1_win); + Iterator input2(src2, input2_win); + Iterator dst(out, win); + + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const qsymm16x8x2_t input1_q = + { + { + vld1q_s16(input1_ptr + x), + vld1q_s16(input1_ptr + x + 8), + } + }; + const qsymm16x8x2_t input2_q = + { + { + vld1q_s16(input2_ptr + x), + vld1q_s16(input2_ptr + x + 8), + } + }; + + const int32x4x4_t in1_s32 = + { + { + vmovl_s16(vget_low_s16(input1_q.val[0])), + vmovl_s16(vget_high_s16(input1_q.val[0])), + vmovl_s16(vget_low_s16(input1_q.val[1])), + vmovl_s16(vget_high_s16(input1_q.val[1])), + } + }; + const int32x4x4_t in2_s32 = + { + { + vmovl_s16(vget_low_s16(input2_q.val[0])), + vmovl_s16(vget_high_s16(input2_q.val[0])), + vmovl_s16(vget_low_s16(input2_q.val[1])), + vmovl_s16(vget_high_s16(input2_q.val[1])), + } + }; + + const int32x4x4_t result = + { + { + vmulq_s32(in1_s32.val[0], in2_s32.val[0]), + vmulq_s32(in1_s32.val[1], in2_s32.val[1]), + vmulq_s32(in1_s32.val[2], in2_s32.val[2]), + vmulq_s32(in1_s32.val[3], in2_s32.val[3]), + } + }; + + vst1q_s32(output_ptr + x, result.val[0]); + vst1q_s32(output_ptr + x + 4, result.val[1]); + vst1q_s32(output_ptr + x + 8, result.val[2]); + vst1q_s32(output_ptr + x + 12, result.val[3]); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x)); + *(output_ptr + x) = tmp; + } + }, + input1, input2, dst); +} + +template <bool is_scale255, bool is_sat> +void mul_U8_U8_U8(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n) +{ + // Create input windows + Window win = window; + Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src1, input1_win); + Iterator input2(src2, input2_win); + Iterator dst(out, win); + + const int window_step_x = 16 / sizeof(uint8_t); + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<uint8_t *>(dst.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t ta1 = wrapper::vloadq(input1_ptr + x); + const uint8x16_t ta2 = wrapper::vloadq(input2_ptr + x); + + uint16x8_t tmp1_high = vmovl_u8(vget_high_u8(ta1)); + const uint16x8_t tmp2_high = vmovl_u8(vget_high_u8(ta2)); + uint16x8_t tmp1_low = vmovl_u8(vget_low_u8(ta1)); + const uint16x8_t tmp2_low = vmovl_u8(vget_low_u8(ta2)); + + tmp1_high = vmulq_u16(tmp1_high, tmp2_high); + tmp1_low = vmulq_u16(tmp1_low, tmp2_low); + + if(is_scale255) + { + tmp1_high = scale255_U16_U16(tmp1_high); + tmp1_low = scale255_U16_U16(tmp1_low); + } + else + { + const int16x8_t vn = vdupq_n_s16(-n); + + if(is_sat) + { + tmp1_high = vqshlq_u16(tmp1_high, vn); + tmp1_low = vqshlq_u16(tmp1_low, vn); + } + else + { + tmp1_high = vshlq_u16(tmp1_high, vn); + tmp1_low = vshlq_u16(tmp1_low, vn); + } + } + if(is_sat) + { + vst1q_u8(output_ptr, vcombine_u8(vqmovn_u16(tmp1_low), vqmovn_u16(tmp1_high))); + } + else + { + vst1q_u8(output_ptr, vcombine_u8(vmovn_u16(tmp1_low), vmovn_u16(tmp1_high))); + } + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + uint16_t tmp = static_cast<uint16_t>(*(input1_ptr + x)) * static_cast<uint16_t>(*(input2_ptr + x)); + + if(is_scale255) + { + float tmp_f = static_cast<float>(tmp) * scale255_constant; + tmp = static_cast<uint16_t>(tmp_f + 0.5f); + } + else + { + tmp >>= n; + } + if(is_sat && tmp > 255) + { + tmp = 255; + } + *(output_ptr + x) = static_cast<uint8_t>(tmp); + } + }, + input1, input2, dst); +} + +template <bool is_scale255, bool is_sat> +inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &src1, const int16x8_t &src2, int n) +{ + int32x4_t tmp1_high = vmovl_s16(vget_high_s16(src1)); + const int32x4_t tmp2_high = vmovl_s16(vget_high_s16(src2)); + int32x4_t tmp1_low = vmovl_s16(vget_low_s16(src1)); + const int32x4_t tmp2_low = vmovl_s16(vget_low_s16(src2)); + + tmp1_high = vmulq_s32(tmp1_high, tmp2_high); + tmp1_low = vmulq_s32(tmp1_low, tmp2_low); + + if(is_scale255) + { + tmp1_high = scale255_S32_S32(tmp1_high); + tmp1_low = scale255_S32_S32(tmp1_low); + } + else + { + // Right shift amount + const int32x4_t vn = vdupq_n_s32(-n); + // Left shift amount + const int32x4_t vnl = vdupq_n_s32(n); + // Calculate conversion bit + const uint32x4_t tmp1_high_u = vreinterpretq_u32_s32(tmp1_high); + const uint32x4_t tmp1_low_u = vreinterpretq_u32_s32(tmp1_low); + const uint32x4_t sign_high = vshrq_n_u32(tmp1_high_u, 31); + const uint32x4_t sign_low = vshrq_n_u32(tmp1_low_u, 31); + const int32x4_t sign_high_s = vreinterpretq_s32_u32(sign_high); + const int32x4_t sign_low_s = vreinterpretq_s32_u32(sign_low); + const int32x4_t convert_high = vsubq_s32(vshlq_s32(sign_high_s, vnl), sign_high_s); + const int32x4_t convert_low = vsubq_s32(vshlq_s32(sign_low_s, vnl), sign_low_s); + if(is_sat) + { + tmp1_high = vqshlq_s32(vaddq_s32(tmp1_high, convert_high), vn); + tmp1_low = vqshlq_s32(vaddq_s32(tmp1_low, convert_low), vn); + } + else + { + tmp1_high = vshlq_s32(vaddq_s32(tmp1_high, convert_high), vn); + tmp1_low = vshlq_s32(vaddq_s32(tmp1_low, convert_low), vn); + } + } + + if(is_sat) + { + return vcombine_s16(vqmovn_s32(tmp1_low), vqmovn_s32(tmp1_high)); + } + else + { + return vcombine_s16(vmovn_s32(tmp1_low), vmovn_s32(tmp1_high)); + } +} + +template <bool is_scale255, bool is_sat> +inline int16x8x2_t mul_S16_S16_S16_n_k(const int16x8x2_t &src1, const int16x8x2_t &src2, int n) +{ + const int16x8x2_t result = + { + { + // First 8 elements + mul_S16_S16_S16_n_loop<is_scale255, is_sat>(src1.val[0], src2.val[0], n), + // Second 8 elements + mul_S16_S16_S16_n_loop<is_scale255, is_sat>(src1.val[1], src2.val[1], n) + } + }; + + return result; +} + +template <bool is_scale255, bool is_sat> +void mul_S16_S16_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n) +{ + // Create input windows + Window win = window; + Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src1, input1_win); + Iterator input2(src2, input2_win); + Iterator dst(out, win); + + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int16x8x2_t ta1 = + { + { + vld1q_s16(input1_ptr + x), + vld1q_s16(input1_ptr + x + 8), + } + }; + const int16x8x2_t ta2 = + { + { + vld1q_s16(input2_ptr + x), + vld1q_s16(input2_ptr + x + 8), + } + }; + const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n); + + vst1q_s16(output_ptr + x, result.val[0]); + vst1q_s16(output_ptr + x + 8, result.val[1]); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x)); + + if(is_scale255) + { + float tmp_f = static_cast<float>(tmp) * scale255_constant; + + tmp = static_cast<int32_t>(tmp_f + 0.5f); + } + else + { + if(tmp >= 0) + { + tmp >>= n; + } + else + { + uint32_t mask = (1u << n) - 1; + tmp = (tmp + static_cast<int32_t>(mask)) >> n; + } + } + if(is_sat) + { + tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp); + } + *(output_ptr + x) = static_cast<int16_t>(tmp); + } + }, + input1, input2, dst); +} + +template <bool is_sat> +inline int32x4_t mul_S32_S32_S32_n_loop(const int32x4_t &src1, const int32x4_t &src2, int n) +{ + const int32x2_t input1_1 = vget_low_s32(src1); + const int32x2_t input2_1 = vget_low_s32(src2); + const int32x2_t input1_2 = vget_high_s32(src1); + const int32x2_t input2_2 = vget_high_s32(src2); + + int64x2_t tmp_1 = vmull_s32(input1_1, input2_1); + int64x2_t tmp_2 = vmull_s32(input1_2, input2_2); + + // Apply scaling, conversion and rounding (round to zero) + // Right shift amount + const int64x2_t vn = vdupq_n_s64(-n); + // Left shift amount + const int64x2_t vnl = vdupq_n_s64(n); + // Calculate conversion bit + const uint64x2_t tmp_1_u = vreinterpretq_u64_s64(tmp_1); + const uint64x2_t sign_1 = vshrq_n_u64(tmp_1_u, 63); + const int64x2_t sign_1_s = vreinterpretq_s64_u64(sign_1); + const int64x2_t convert_1 = vsubq_s64(vshlq_s64(sign_1_s, vnl), sign_1_s); + + const uint64x2_t tmp_2_u = vreinterpretq_u64_s64(tmp_2); + const uint64x2_t sign_2 = vshrq_n_u64(tmp_2_u, 63); + const int64x2_t sign_2_s = vreinterpretq_s64_u64(sign_2); + const int64x2_t convert_2 = vsubq_s64(vshlq_s64(sign_2_s, vnl), sign_2_s); + if(is_sat) + { + tmp_1 = vqshlq_s64(vaddq_s64(tmp_1, convert_1), vn); + tmp_2 = vqshlq_s64(vaddq_s64(tmp_2, convert_2), vn); + return vcombine_s32(vqmovn_s64(tmp_1), vqmovn_s64(tmp_2)); + } + else + { + tmp_1 = vshlq_s64(vaddq_s64(tmp_1, convert_1), vn); + tmp_2 = vshlq_s64(vaddq_s64(tmp_2, convert_2), vn); + return vcombine_s32(vmovn_s64(tmp_1), vmovn_s64(tmp_2)); + } +} + +template <bool is_sat> +inline int32x4x2_t mul_S32_S32_S32_n_k(const int32x4x2_t &src1, const int32x4x2_t &src2, int n) +{ + const int32x4x2_t result = + { + { + // First 4 elements + mul_S32_S32_S32_n_loop<is_sat>(src1.val[0], src2.val[0], n), + // Second 4 elements + mul_S32_S32_S32_n_loop<is_sat>(src1.val[1], src2.val[1], n) + } + }; + + return result; +} + +template <bool is_sat> +void mul_S32_S32_S32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n) +{ + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const int window_step_x = 8; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x(); + + if(is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1; + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator dst(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast<const int32_t *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr()); + + const int32_t broadcast_value = *reinterpret_cast<const int32_t *>(broadcast_input.ptr()); + const auto broadcast_value_vec = vdupq_n_s32(broadcast_value); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int32x4x2_t broadcast_v = + { + { + broadcast_value_vec, + broadcast_value_vec, + } + }; + const int32x4x2_t non_broadcast_v = + { + { + vld1q_s32(non_broadcast_input_ptr + x), + vld1q_s32(non_broadcast_input_ptr + x + 4), + } + }; + const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(broadcast_v, non_broadcast_v, n); + + vst1q_s32(output_ptr + x, result.val[0]); + vst1q_s32(output_ptr + x + 4, result.val[1]); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + int64_t tmp = static_cast<int64_t>(broadcast_value) * static_cast<int64_t>(*(non_broadcast_input_ptr + x)); + + if(tmp >= 0) + { + tmp >>= n; + } + else + { + uint64_t mask = ((uint64_t)1u << n) - 1; + tmp = (tmp + static_cast<int64_t>(mask)) >> n; + } + if(is_sat) + { + tmp = utility::clamp<int64_t, int32_t>(tmp); + } + *(output_ptr + x) = static_cast<int32_t>(tmp); + } + }, + broadcast_input, non_broadcast_input, dst); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src1, input1_win); + Iterator input2(src2, input2_win); + Iterator dst(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast<const int32_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const int32_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int32x4x2_t ta1 = + { + { + vld1q_s32(input1_ptr + x), + vld1q_s32(input1_ptr + x + 4), + } + }; + const int32x4x2_t ta2 = + { + { + vld1q_s32(input2_ptr + x), + vld1q_s32(input2_ptr + x + 4), + } + }; + const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(ta1, ta2, n); + + vst1q_s32(output_ptr + x, result.val[0]); + vst1q_s32(output_ptr + x + 4, result.val[1]); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + int64_t tmp = static_cast<int64_t>(*(input1_ptr + x)) * static_cast<int64_t>(*(input2_ptr + x)); + + if(tmp >= 0) + { + tmp >>= n; + } + else + { + uint64_t mask = ((uint64_t)1u << n) - 1; + tmp = (tmp + static_cast<int64_t>(mask)) >> n; + } + if(is_sat) + { + tmp = utility::clamp<int64_t, int32_t>(tmp); + } + *(output_ptr + x) = static_cast<int32_t>(tmp); + } + }, + input1, input2, dst); + } +} + +void mul_F32_F32_F32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale) +{ + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + constexpr int window_step_x = 16 / sizeof(float); + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x(); + + using ExactTagType = typename wrapper::traits::neon_vector<float, window_step_x>::tag_type; + + if(is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1; + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator dst(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<float *>(dst.ptr()); + + const float broadcast_value = *reinterpret_cast<const float *>(broadcast_input.ptr()); + const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); + const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{}); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); + auto res = wrapper::vmul(wrapper::vmul(broadcast_value_vec, non_broadcast_v), scale_vec); + wrapper::vstore(output_ptr + x, res); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const auto non_broadcast_v = *(non_broadcast_input_ptr + x); + *(output_ptr + x) = broadcast_value * non_broadcast_v * scale; + } + }, + broadcast_input, non_broadcast_input, dst); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src1, input1_win); + Iterator input2(src2, input2_win); + Iterator dst(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<float *>(dst.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto ta1 = wrapper::vloadq(input1_ptr + x); + const auto ta2 = wrapper::vloadq(input2_ptr + x); + const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{}); + const auto res = wrapper::vmul(wrapper::vmul(ta1, ta2), scale_vec); + wrapper::vstore(output_ptr + x, res); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const auto ta1 = *(input1_ptr + x); + const auto ta2 = *(input2_ptr + x); + *(output_ptr + x) = ta1 * ta2 * scale; + } + }, + input1, input2, dst); + } +} + +void c_mul_F32_F32_F32_n(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window) +{ + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + constexpr int window_step_x = 8 / sizeof(float); + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x(); + + using ExactTagType = typename wrapper::traits::neon_vector<float, 2>::tag_type; + + if(is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1; + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator dst(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<float *>(dst.ptr()); + + const float broadcast_value = *reinterpret_cast<const float *>(broadcast_input.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = wrapper::vloadq(non_broadcast_input_ptr + 2 * x); + float32x4_t b = vdupq_n_f32(broadcast_value); + + const float32x4_t mask = { -1.0f, 1.0f, -1.0f, 1.0f }; + const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{}); + const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{}); + const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{}); + const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{}); + + const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10); + const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11); + + float32x4_t res = wrapper::vmul(tmp0, b); + b = wrapper::vmul(b, mask); + + res = wrapper::vmla(res, tmp1, b); + wrapper::vstore(output_ptr + 2 * x, res); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const auto non_broadcast_value0 = *(non_broadcast_input_ptr + 2 * x); + const auto non_broadcast_value1 = *(non_broadcast_input_ptr + 2 * x + 1); + auto res1 = broadcast_value * (non_broadcast_value0 - non_broadcast_value1); + auto res2 = broadcast_value * (non_broadcast_value1 + non_broadcast_value0); + *(output_ptr + 2 * x) = res1; + *(output_ptr + 2 * x + 1) = res2; + } + }, + broadcast_input, non_broadcast_input, dst); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src1, input1_win); + Iterator input2(src2, input2_win); + Iterator dst(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<float *>(dst.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float32x4_t a = wrapper::vloadq(input1_ptr + 2 * x); + float32x4_t b = wrapper::vloadq(input2_ptr + 2 * x); + + const float32x4_t mask = { -1.0f, 1.0f, -1.0f, 1.0f }; + const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{}); + const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{}); + const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{}); + const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{}); + + const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10); + const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11); + + float32x4_t res = wrapper::vmul(tmp0, b); + + b = wrapper::vrev64(b); + b = wrapper::vmul(b, mask); + + res = wrapper::vmla(res, tmp1, b); + wrapper::vstore(output_ptr + 2 * x, res); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const auto a0 = *(input1_ptr + 2 * x); + const auto a1 = *(input1_ptr + 2 * x + 1); + const auto b0 = *(input2_ptr + 2 * x); + const auto b1 = *(input2_ptr + 2 * x + 1); + auto res1 = a0 * b0 - a1 * b1; + auto res2 = a0 * b1 + a1 * b0; + *(output_ptr + 2 * x) = res1; + *(output_ptr + 2 * x + 1) = res2; + } + }, + input1, input2, dst); + } +} + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +void mul_F16_F16_F16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale) +{ + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + constexpr int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x(); + if(is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1; + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator dst(out, win); + execute_window_loop(win, [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast<const float16_t *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<float16_t *>(dst.ptr()); + const auto broadcast_value = *reinterpret_cast<const float16_t *>(broadcast_input.ptr()); + const float16x8x2_t broadcast_value_vec = + { + { + vdupq_n_f16(broadcast_value), + vdupq_n_f16(broadcast_value), + } + }; + const auto scale_vec = vdupq_n_f16(scale); + // Compute window_step_x elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float16x8x2_t non_broadcast_v = + { + { + vld1q_f16(non_broadcast_input_ptr + x), + vld1q_f16(non_broadcast_input_ptr + x + 8), + } + }; + const float16x8x2_t result = + { + { + vmulq_f16(vmulq_f16(broadcast_value_vec.val[0], non_broadcast_v.val[0]), scale_vec), + vmulq_f16(vmulq_f16(broadcast_value_vec.val[1], non_broadcast_v.val[1]), scale_vec), + } + }; + vst1q_f16(output_ptr + x, result.val[0]); + vst1q_f16(output_ptr + x + 8, result.val[1]); + } + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const auto non_broadcast_v = *(non_broadcast_input_ptr + x); + *(output_ptr + x) = broadcast_value * non_broadcast_v * scale; + } + }, + broadcast_input, non_broadcast_input, dst); + } + else + { + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + Iterator input1(src1, input1_win); + Iterator input2(src2, input2_win); + Iterator dst(out, win); + execute_window_loop(win, [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast<const float16_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const float16_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<float16_t *>(dst.ptr()); + // Compute window_step_x elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float16x8x2_t ta1 = + { + { + vld1q_f16(input1_ptr + x), + vld1q_f16(input1_ptr + x + 8), + } + }; + const float16x8x2_t ta2 = + { + { + vld1q_f16(input2_ptr + x), + vld1q_f16(input2_ptr + x + 8), + } + }; + const float16x8_t scale_vec = vdupq_n_f16(scale); + const float16x8x2_t result = + { + { + vmulq_f16(vmulq_f16(ta1.val[0], ta2.val[0]), scale_vec), + vmulq_f16(vmulq_f16(ta1.val[1], ta2.val[1]), scale_vec), + } + }; + vst1q_f16(output_ptr + x, result.val[0]); + vst1q_f16(output_ptr + x + 8, result.val[1]); + } + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const auto ta1 = *(input1_ptr + x); + const auto ta2 = *(input2_ptr + x); + *(output_ptr + x) = ta1 * ta2 * scale; + } + }, + input1, input2, dst); + } +} +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + +template <bool is_scale255, bool is_sat> +void mul_U8_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n) +{ + // Create input windows + Window win = window; + Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src1, input1_win); + Iterator input2(src2, input2_win); + Iterator dst(out, win); + + const int window_step_x = 16 / sizeof(uint8_t); + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t bv = wrapper::vloadq(input2_ptr + x); + const uint8x16_t av = wrapper::vloadq(input1_ptr + x); + + uint16x8_t tmp_low = vmovl_u8(vget_low_u8(av)); + uint16x8_t tmp_high = vmovl_u8(vget_high_u8(av)); + tmp_low = vmulq_u16(tmp_low, vmovl_u8(vget_low_u8(bv))); + tmp_high = vmulq_u16(tmp_high, vmovl_u8(vget_high_u8(bv))); + + if(is_scale255) + { + tmp_low = scale255_U16_U16(tmp_low); + tmp_high = scale255_U16_U16(tmp_high); + } + else + { + const int16x8_t vn = vdupq_n_s16(-n); + + if(is_sat) + { + tmp_low = vqshlq_u16(tmp_low, vn); + tmp_high = vqshlq_u16(tmp_high, vn); + } + else + { + tmp_low = vshlq_u16(tmp_low, vn); + tmp_high = vshlq_u16(tmp_high, vn); + } + } + + if(is_sat) + { + static const uint16x8_t max = vdupq_n_u16(SHRT_MAX); + + tmp_low = vminq_u16(tmp_low, max); + tmp_high = vminq_u16(tmp_high, max); + } + + vst1q_s16(output_ptr + x, vreinterpretq_s16_u16(tmp_low)); + vst1q_s16(output_ptr + x + 8, vreinterpretq_s16_u16(tmp_high)); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x)); + + if(is_scale255) + { + float tmp_f = static_cast<float>(tmp) * scale255_constant; + tmp = static_cast<int32_t>(tmp_f + 0.5f); + } + else + { + tmp >>= n; + } + + if(is_sat) + { + tmp = (tmp > SHRT_MAX) ? SHRT_MAX : tmp; + } + + *(output_ptr + x) = static_cast<int16_t>(tmp); + } + }, + input1, input2, dst); +} + +template <bool is_scale255, bool is_sat> +void mul_S16_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n) +{ + // Create input windows + Window win = window; + Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src1, input1_win); + Iterator input2(src2, input2_win); + Iterator dst(out, win); + + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int16x8x2_t ta1 = + { + { + vld1q_s16(input1_ptr + x), + vld1q_s16(input1_ptr + x + 8), + } + }; + const uint8x8x2_t ta2u = + { + { + vld1_u8(input2_ptr + x), + vld1_u8(input2_ptr + x + 8), + } + }; + const int16x8x2_t ta2 = + { + { + vreinterpretq_s16_u16(vmovl_u8(ta2u.val[0])), + vreinterpretq_s16_u16(vmovl_u8(ta2u.val[1])) + } + }; + + const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n); + + vst1q_s16(output_ptr + x, result.val[0]); + vst1q_s16(output_ptr + x + 8, result.val[1]); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x)); + + if(is_scale255) + { + float tmp_f = static_cast<float>(tmp) * scale255_constant; + + tmp = static_cast<int32_t>(tmp_f + 0.5f); + } + else + { + if(tmp >= 0) + { + tmp >>= n; + } + else + { + uint32_t mask = (1u << n) - 1; + tmp = (tmp + static_cast<int32_t>(mask)) >> n; + } + } + if(is_sat) + { + tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp); + } + *(output_ptr + x) = static_cast<int16_t>(tmp); + } + }, + input1, input2, dst); +} + +template <bool is_scale255, bool is_sat> +void mul_U8_S16_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n) +{ + // Simply swap the two input buffers + mul_S16_U8_S16<is_scale255, is_sat>(src2, src1, out, window, n); +} +} // namespace + +void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy) +{ + ARM_COMPUTE_UNUSED(rounding_policy); + ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); + + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy)); + + const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); + + // Auto initialize dst if not initialized + set_shape_if_empty(*dst, out_shape); + + _scale = scale; + _scale_exponent = 0; + _func_quantized = nullptr; + _func_int = nullptr; + _func_float = nullptr; + + bool is_scale_255 = false; + // Check and validate scaling factor + if(std::abs(scale - scale255_constant) < 0.00001f) + { + is_scale_255 = true; + } + else + { + int exponent = 0; + + std::frexp(scale, &exponent); + + // Store the positive exponent. We know that we compute 1/2^n + // Additionally we need to subtract 1 to compensate that frexp used a mantissa of 0.5 + _scale_exponent = std::abs(exponent - 1); + } + + const DataType dt_input1 = src1->data_type(); + const DataType dt_input2 = src2->data_type(); + const DataType dt_output = dst->data_type(); + const bool is_sat = (overflow_policy == ConvertPolicy::SATURATE); + + switch(dt_input1) + { + case DataType::QASYMM8: + if(dt_input2 == DataType::QASYMM8 && dt_output == DataType::QASYMM8) + { + _func_quantized = &mul_saturate_quantized_8<uint8_t>; + } + break; + case DataType::QASYMM8_SIGNED: + if(dt_input2 == DataType::QASYMM8_SIGNED) + { + _func_quantized = &mul_saturate_quantized_8<int8_t>; + ; + } + break; + case DataType::QSYMM16: + if(dt_input2 == DataType::QSYMM16 && dt_output == DataType::QSYMM16) + { + _func_quantized = &mul_saturate_QSYMM16_QSYMM16_QSYMM16; + } + else if(dt_input2 == DataType::QSYMM16 && dt_output == DataType::S32) + { + _func_int = &mul_QSYMM16_QSYMM16_S32; + } + break; + case DataType::S16: + if(DataType::U8 == dt_input2 && DataType::S16 == dt_output) + { + if(is_scale_255) + { + _func_int = is_sat ? &mul_S16_U8_S16<true, true> : &mul_S16_U8_S16<true, false>; + } + else + { + _func_int = is_sat ? &mul_S16_U8_S16<false, true> : &mul_S16_U8_S16<false, false>; + } + } + if(DataType::S16 == dt_input2 && DataType::S16 == dt_output) + { + if(is_scale_255) + { + _func_int = is_sat ? &mul_S16_S16_S16<true, true> : &mul_S16_S16_S16<true, false>; + } + else + { + _func_int = is_sat ? &mul_S16_S16_S16<false, true> : &mul_S16_S16_S16<false, false>; + } + } + break; + case DataType::S32: + if(DataType::S32 == dt_input2 && DataType::S32 == dt_output) + { + _func_int = is_sat ? &mul_S32_S32_S32<true> : &mul_S32_S32_S32<false>; + } + break; + case DataType::U8: + if(DataType::U8 == dt_input2 && DataType::U8 == dt_output) + { + if(is_scale_255) + { + _func_int = is_sat ? &mul_U8_U8_U8<true, true> : &mul_U8_U8_U8<true, false>; + } + else + { + _func_int = is_sat ? &mul_U8_U8_U8<false, true> : &mul_U8_U8_U8<false, false>; + } + } + else if(DataType::U8 == dt_input2 && DataType::S16 == dt_output) + { + if(is_scale_255) + { + _func_int = is_sat ? &mul_U8_U8_S16<true, true> : &mul_U8_U8_S16<true, false>; + } + else + { + _func_int = is_sat ? &mul_U8_U8_S16<false, true> : &mul_U8_U8_S16<false, false>; + } + } + else if(DataType::S16 == dt_input2 && DataType::S16 == dt_output) + { + if(is_scale_255) + { + _func_int = is_sat ? &mul_U8_S16_S16<true, true> : &mul_U8_S16_S16<true, false>; + } + else + { + _func_int = is_sat ? &mul_U8_S16_S16<false, true> : &mul_U8_S16_S16<false, false>; + } + } + break; +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + _func_float = &mul_F16_F16_F16; + break; +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + case DataType::F32: + _func_float = &mul_F32_F32_F32; + break; + default: + ARM_COMPUTE_ERROR("You called with the wrong img formats"); + } + + // Configure kernel window + Window win = calculate_max_window(out_shape); + + ICpuKernel::configure(win); +} + +Status CpuMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy)); + + return Status{}; +} + +void CpuMulKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_0); + auto src2 = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + if(_func_quantized != nullptr) + { + (*_func_quantized)(src1, src2, dst, window, _scale); + } + else if(_func_int != nullptr) + { + (*_func_int)(src1, src2, dst, window, _scale_exponent); + } + else + { + ARM_COMPUTE_ERROR_ON(_func_float == nullptr); + (*_func_float)(src1, src2, dst, window, _scale); + } +} +const char *CpuMulKernel::name() const +{ + return "CpuMulKernel"; +} +namespace +{ +Status validate_arguments_complex(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 2, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 2, DataType::F32); + + const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); + + // Validate in case of configured dst + if(dst->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 2, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst"); + } + + return Status{}; +} +} // namespace + +void CpuComplexMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(src1, src2, dst)); + + const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); + + // Auto initialize dst if not initialized + const TensorInfo out_info(out_shape, src1->num_channels(), src1->data_type()); + auto_init_if_empty(*dst, out_info); + + // Configure kernel window + Window win = calculate_max_window(out_shape); + + ICpuKernel::configure(win); +} + +Status CpuComplexMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_complex(src1, src2, dst)); + + return Status{}; +} + +void CpuComplexMulKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_0); + auto src2 = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + c_mul_F32_F32_F32_n(src1, src2, dst, window); +} + +const char *CpuComplexMulKernel::name() const +{ + return "CpuComplexMulKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuMulKernel.h b/src/cpu/kernels/CpuMulKernel.h new file mode 100644 index 0000000000..b65ec20044 --- /dev/null +++ b/src/cpu/kernels/CpuMulKernel.h @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2016-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_MUL_KERNEL_H +#define ARM_COMPUTE_CPU_MUL_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Interface for the kernel to perform multiplication between two tensors */ +class CpuMulKernel : public ICpuKernel +{ +public: + CpuMulKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuMulKernel); + /** Initialise the kernel's input, dst and border mode. + * + * Valid configurations (Src1,Src2) -> Dst : + * + * Support: Broadcast? Scale=1/255? + * - (U8,U8) -> U8, S16 N Y + * - (U8,S16) -> S16 N Y + * - (S16,U8) -> S16 N Y + * - (S16,S16) -> S16 N Y + * - (S32,S32) -> S32 Y N + * - (F16,F16) -> F16 N Y + * - (F32,F32) -> F32 Y Y + * - (QASYMM8,QASYMM8) -> QASYMM8 Y Y + * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED Y Y + * - (QSYMM16,QSYMM16) -> QSYMM16, S32 N Y + * + * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported. + * For all other scale values only round to zero (implemented as round towards minus infinity) is supported. + * + * @param[in] src1 First input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32 + * @param[in] src2 Second input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32 + * @param[out] dst Dst tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32 + * @param[in] scale Scale to apply after multiplication. + * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. + * If both @p src1, @p src2 and @p dst are of datatype S32, scale cannot be 1/255 + * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if any of the inputs is of quantized datatype + * @param[in] rounding_policy Rounding policy. + */ + void configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuMulKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy); + + // Inherited methods overridden + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + /** Common signature for all the specialised multiplication functions with integer scaling factor + * + * @param[in] src1 Src1 tensor object. + * @param[in] src2 Src2 tensor object. + * @param[out] dst Dst tensor object. + * @param[in] window Region on which to execute the kernel + * @param[in] scale Integer scale factor. + */ + using MulFunctionInt = void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, int scale); + /** Common signature for all the specialised multiplication functions with float scaling factor + * + * @param[in] src1 Src1 tensor object. + * @param[in] src2 Src2 tensor object. + * @param[out] dst Dst tensor object. + * @param[in] window Region on which to execute the kernel + * @param[in] scale Float scale factor. + */ + using MulFunctionFloat = void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, float scale); + /** Common signature for all the specialised QASYMM8 multiplication functions with float scaling factor + * + * @param[in] src1 Src1 tensor object. + * @param[in] src2 Src2 tensor object. + * @param[out] dst Dst tensor object. + * @param[in] window Region on which to execute the kernel + * @param[in] scale Float scale factor. + * + */ + using MulFunctionQuantized = void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, float scale); + + MulFunctionFloat *_func_float{ nullptr }; + MulFunctionInt *_func_int{ nullptr }; + MulFunctionQuantized *_func_quantized{ nullptr }; + float _scale{ 0 }; + int _scale_exponent{ 0 }; +}; + +/** Interface for the complex pixelwise multiplication kernel. */ +class CpuComplexMulKernel : public ICpuKernel +{ +public: + CpuComplexMulKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuComplexMulKernel); + /** Initialise the kernel's src, dst and border mode. + * + * @param[in] src1 An src tensor. Data types supported: F32. Number of channels supported: 2 (complex tensor). + * @param[in] src2 An src tensor. Data types supported: same as @p src1. Number of channels supported: same as @p src1. + * @param[out] dst The dst tensor, Data types supported: same as @p src1. Number of channels supported: same as @p src1. + */ + void configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuComplexMulKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_MUL_KERNEL_H */ diff --git a/src/cpu/kernels/CpuPermuteKernel.cpp b/src/cpu/kernels/CpuPermuteKernel.cpp new file mode 100644 index 0000000000..d65e011032 --- /dev/null +++ b/src/cpu/kernels/CpuPermuteKernel.cpp @@ -0,0 +1,301 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuPermuteKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +namespace +{ +#include "src/core/NEON/kernels/convolution/common/shims.hpp" +} // namespace + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +inline bool is_permutation_supported(const PermutationVector &v) +{ + static const std::array<PermutationVector, 2> permutations2 = + { + { + PermutationVector(0U, 1U), + PermutationVector(1U, 0U), + } + }; + static const std::array<PermutationVector, 6> permutations3 = + { + { + PermutationVector(2U, 0U, 1U), + PermutationVector(1U, 2U, 0U), + PermutationVector(0U, 1U, 2U), + PermutationVector(0U, 2U, 1U), + PermutationVector(1U, 0U, 2U), + PermutationVector(2U, 1U, 0U), + } + }; + static const std::array<PermutationVector, 24> permutations4 = + { + { + PermutationVector(0U, 1U, 2U, 3U), + PermutationVector(1U, 0U, 2U, 3U), + PermutationVector(2U, 0U, 1U, 3U), + PermutationVector(0U, 2U, 1U, 3U), + PermutationVector(1U, 2U, 0U, 3U), + PermutationVector(2U, 1U, 0U, 3U), + PermutationVector(2U, 1U, 3U, 0U), + PermutationVector(1U, 2U, 3U, 0U), + PermutationVector(3U, 2U, 1U, 0U), + PermutationVector(2U, 3U, 1U, 0U), + PermutationVector(1U, 3U, 2U, 0U), + PermutationVector(3U, 1U, 2U, 0U), + PermutationVector(3U, 0U, 2U, 1U), + PermutationVector(0U, 3U, 2U, 1U), + PermutationVector(2U, 3U, 0U, 1U), + PermutationVector(3U, 2U, 0U, 1U), + PermutationVector(0U, 2U, 3U, 1U), + PermutationVector(2U, 0U, 3U, 1U), + PermutationVector(1U, 0U, 3U, 2U), + PermutationVector(0U, 1U, 3U, 2U), + PermutationVector(3U, 1U, 0U, 2U), + PermutationVector(1U, 3U, 0U, 2U), + PermutationVector(0U, 3U, 1U, 2U), + PermutationVector(3U, 0U, 1U, 2U) + } + }; + + return (permutations2.end() != std::find(permutations2.begin(), permutations2.end(), v)) || (permutations3.end() != std::find(permutations3.begin(), permutations3.end(), v)) + || (permutations4.end() != std::find(permutations4.begin(), permutations4.end(), v)); +} + +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm) +{ + ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!is_permutation_supported(perm), "PermutationVector not supported."); + + const TensorShape dst_shape = misc::shape_calculator::compute_permutation_output_shape(*src, perm); + + // Validate configured destination + if(dst->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + } + + return Status{}; +} + +template <typename T> +void run_permute(const Window &window, const ITensor *src, const ITensor *dst, const PermutationVector &perm) +{ + const DataLayout src_layout = src->info()->data_layout(); + + // Source window + Window window_src = window; + + // we only support these two configs in src/core/NEON/kernels/convolution/common/shims.hpp, for all others + // we have to fall back to C++ + if((src_layout == DataLayout::NCHW && perm == PermutationVector{ 2U, 0U, 1U }) || (src_layout == DataLayout::NHWC && perm == PermutationVector{ 1U, 2U, 0U })) + { + window_src.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), window.x().end() - window.x().start())); + window_src.set(Window::DimY, Window::Dimension(window.y().start(), window.y().end(), window.y().end() - window.y().start())); + window_src.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), window.z().end() - window.z().start())); + window_src.set(3, Window::Dimension(window[3].start(), window[3].end(), window[3].end() - window[3].start())); + } + + // Destination window + Window window_dst(window); + const Window::Dimension zero_window = Window::Dimension(0, 0, 0); + for(size_t d = 0; d <= dst->info()->num_dimensions(); ++d) + { + window_dst.set(d, zero_window); + } + + // Create iterators + Iterator src_it(src, window_src); + Iterator dst_it(dst, window_dst); + + int in_row_stride = 0; + int in_col_stride = 0; + int in_channel_stride = 0; + int in_batch_stride = 0; + int n_cols = 0; + int n_rows = 0; + int n_channels = 0; + int n_batches = 0; + + switch(src_layout) + { + case DataLayout::NCHW: + { + in_row_stride = src->info()->strides_in_bytes().y() / sizeof(T); + in_channel_stride = src->info()->strides_in_bytes().z() / sizeof(T); + in_batch_stride = src->info()->strides_in_bytes()[3] / sizeof(T); + n_cols = src->info()->tensor_shape().x(); + n_rows = window_src.y().step(); + n_channels = src->info()->tensor_shape().z(); + n_batches = src->info()->tensor_shape()[3]; + break; + } + case DataLayout::NHWC: + { + in_col_stride = src->info()->strides_in_bytes().y() / sizeof(T); + in_row_stride = src->info()->strides_in_bytes().z() / sizeof(T); + in_batch_stride = src->info()->strides_in_bytes()[3] / sizeof(T); + n_channels = src->info()->tensor_shape().x(); + n_cols = window_src.y().step(); + n_rows = src->info()->tensor_shape().z(); + n_batches = src->info()->tensor_shape()[3]; + break; + } + default: + { + ARM_COMPUTE_ERROR("Invalid source data layout."); + break; + } + } + + // CHW -> HWC + if(src_layout == DataLayout::NCHW && perm == PermutationVector{ 2U, 0U, 1U }) + { + const int out_channel_stride = dst->info()->strides_in_bytes().x() / sizeof(T); + const int out_col_stride = dst->info()->strides_in_bytes().y() / sizeof(T); + const int out_row_stride = dst->info()->strides_in_bytes().z() / sizeof(T); + const int out_batch_stride = dst->info()->strides_in_bytes()[3] / sizeof(T); + execute_window_loop(window_src, [&](const Coordinates & id) + { + const int idx = id[0] * out_col_stride + id[1] * out_row_stride + id[2] * out_channel_stride; + reorder::nchw_to_nhwc(reinterpret_cast<const T *>(src_it.ptr()), reinterpret_cast<T *>(dst_it.ptr()) + idx, + n_batches, n_channels, n_rows, n_cols, + in_batch_stride, in_channel_stride, in_row_stride, + out_batch_stride, out_row_stride, out_col_stride); + }, + src_it, dst_it); + } + // HWC -> CHW + else if(src_layout == DataLayout::NHWC && perm == PermutationVector{ 1U, 2U, 0U }) + { + const int out_col_stride = dst->info()->strides_in_bytes().x() / sizeof(T); + const int out_row_stride = dst->info()->strides_in_bytes().y() / sizeof(T); + const int out_channel_stride = dst->info()->strides_in_bytes().z() / sizeof(T); + const int out_batch_stride = dst->info()->strides_in_bytes()[3] / sizeof(T); + execute_window_loop(window_src, [&](const Coordinates & id) + { + const int idx = id[0] * out_channel_stride + id[1] * out_col_stride + id[2] * out_row_stride; + reorder::nhwc_to_nchw(reinterpret_cast<const T *>(src_it.ptr()), reinterpret_cast<T *>(dst_it.ptr()) + idx, + n_batches, n_rows, n_cols, n_channels, + in_batch_stride, in_row_stride, in_col_stride, + out_batch_stride, out_channel_stride, out_row_stride); + }, + src_it, dst_it); + } + else + { + // All other cases fall back to C++ + // Permute strides + Strides strides = dst->info()->strides_in_bytes(); + Strides perm_strides = strides; + permute_strides(perm_strides, perm); + const int perm_stride_3 = src->info()->num_dimensions() >= 4 ? perm_strides[3] : 0; + execute_window_loop(window, [&](const Coordinates & id) + { + const int idx = id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2] + id[3] * perm_stride_3; + *(reinterpret_cast<T *>(dst_it.ptr() + idx)) = *(reinterpret_cast<const T *>(src_it.ptr())); + }, + src_it, dst_it); + } +} +} // namespace + +void CpuPermuteKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + const TensorShape dst_shape = misc::shape_calculator::compute_permutation_output_shape(*src, perm); + // Destination auto inizialitation if not yet initialized + auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape)); + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, perm)); + + _perm = perm; + + // Configure kernel window + Window win = calculate_max_window(*src, Steps()); + + // This kernel doesn't need padding so update_window_and_padding() can be skipped + + ICpuKernel::configure(win); +} + +Status CpuPermuteKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, perm)); + return Status{}; +} + +void CpuPermuteKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + switch(src->info()->element_size()) + { + case 1: + run_permute<uint8_t>(window, src, dst, _perm); + break; + case 2: + run_permute<uint16_t>(window, src, dst, _perm); + break; + case 4: + run_permute<uint32_t>(window, src, dst, _perm); + break; + default: + ARM_COMPUTE_ERROR("Element size not supported"); + break; + } +} + +const char *CpuPermuteKernel::name() const +{ + return "CpuPermuteKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuPermuteKernel.h b/src/cpu/kernels/CpuPermuteKernel.h new file mode 100644 index 0000000000..1b2672b5b9 --- /dev/null +++ b/src/cpu/kernels/CpuPermuteKernel.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_PERMUTE_KERNEL_H +#define ARM_COMPUTE_CPU_PERMUTE_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Kernel to perform tensor permutation given a permutation vector */ +class CpuPermuteKernel : public ICpuKernel +{ +public: + CpuPermuteKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPermuteKernel); + /** Configure kernel for a given list of arguments + * + * @note Arbitrary permutation vectors are supported with rank not greater than 4 + * + * @param[in] src Srouce tensor to permute. Data types supported: All + * @param[out] dst Destination tensor. Data types supported: Same as @p src + * @param[in] perm Permutation vector + */ + void configure(const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuPermuteKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + PermutationVector _perm{}; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_PERMUTE_KERNEL_H */ diff --git a/src/cpu/kernels/CpuPool2dKernel.cpp b/src/cpu/kernels/CpuPool2dKernel.cpp new file mode 100644 index 0000000000..d7fb75ee60 --- /dev/null +++ b/src/cpu/kernels/CpuPool2dKernel.cpp @@ -0,0 +1,516 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuPool2dKernel.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/common/Registrars.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/cpu/kernels/pool2d/neon/list.h" +#include "support/ToolchainSupport.h" + +#include "src/core/NEON/wrapper/wrapper.h" +#include <arm_neon.h> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +using namespace misc::shape_calculator; + +struct PoolingSelectorData +{ + DataType dt; + DataLayout dl; + int pool_stride_x; + Size2D pool_size; +}; + +using PoolingSelectorPtr = std::add_pointer<bool(const PoolingSelectorData &data)>::type; +using PoolingKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, ITensor *, PoolingLayerInfo &, const Window &, const Window &)>::type; +struct PoolingKernel +{ + const char *name; + const PoolingSelectorPtr is_selected; + PoolingKernelPtr ukernel; +}; + +static const PoolingKernel available_kernels[] = +{ + { + "neon_qu8_nhwc_poolMxN", + [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::QASYMM8)); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::poolingMxN_qasymm8_neon_nhwc) + }, + { + "neon_qs8_nhwc_poolMxN", + [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::QASYMM8_SIGNED)); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::poolingMxN_qasymm8_signed_neon_nhwc) + }, +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + { + "neon_f16_nhwc_poolMxN", + [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::F16)); }, + REGISTER_FP16_NEON(arm_compute::cpu::poolingMxN_fp16_neon_nhwc) + }, +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ + { + "neon_fp32_nhwc_poolMxN", + [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::F32)); }, + REGISTER_FP32_NEON(arm_compute::cpu::poolingMxN_fp32_neon_nhwc) + }, +#if defined(ENABLE_NCHW_KERNELS) + { + "neon_qu8_nchw_pool2", + [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2) && (data.pool_stride_x < 3)); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::pooling2_quantized_neon_nchw<uint8_t>) + }, + { + "neon_qu8_nchw_pool3", + [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3) && (data.pool_stride_x < 3)); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::pooling3_quantized_neon_nchw<uint8_t>) + }, + { + "neon_qu8_nchw_poolMxN", + [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8)); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::poolingMxN_quantized_neon_nchw<uint8_t>) + }, + { + "neon_qs8_nchw_pool2", + [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2) && (data.pool_stride_x < 3)); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::pooling2_quantized_neon_nchw<int8_t>) + }, + { + "neon_qs8_nchw_pool3", + [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3) && (data.pool_stride_x < 3)); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::pooling3_quantized_neon_nchw<int8_t>) + }, + { + "neon_qs8_nchw_poolMxN", + [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED)); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::poolingMxN_quantized_neon_nchw<int8_t>) + }, +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + { + "neon_fp16_nchw_pool2", + [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2)); }, + REGISTER_FP16_NEON(arm_compute::cpu::pooling2_fp16_neon_nchw) + }, + { + "neon_fp16_nchw_pool3", + [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3)); }, + REGISTER_FP16_NEON(arm_compute::cpu::pooling3_fp16_neon_nchw) + }, + { + "neon_fp16_nchw_poolMxN", + [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16)); }, + REGISTER_FP16_NEON(arm_compute::cpu::poolingMxN_fp16_neon_nchw) + }, +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ + { + "neon_fp32_nchw_pool2", + [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2)); }, + REGISTER_FP32_NEON(arm_compute::cpu::pooling2_fp32_neon_nchw) + }, + { + "neon_fp32_nchw_pool3", + [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3)); }, + REGISTER_FP32_NEON(arm_compute::cpu::pooling3_fp32_neon_nchw) + }, + { + "neon_fp32_nchw_pool7", + [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 7)); }, + REGISTER_FP32_NEON(arm_compute::cpu::pooling7_fp32_neon_nchw) + }, + { + "neon_fp32_nchw_poolMxN", + [](const PoolingSelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32)); }, + REGISTER_FP32_NEON(arm_compute::cpu::poolingMxN_fp32_neon_nchw) + }, +#endif /* defined(ENABLE_NCHW_KERNELS) */ +}; + +/** Micro-kernel selector + * + * @param[in] data Selection data passed to help pick the appropriate micro-kernel + * + * @return A matching micro-kernel else nullptr + */ +const PoolingKernel *get_implementation(DataType dt, DataLayout dl, int pool_stride_x, Size2D pool_size) +{ + for(const auto &uk : available_kernels) + { + if(uk.is_selected({ dt, dl, pool_stride_x, pool_size })) + { + return &uk; + } + } + return nullptr; +} + +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, + const ITensorInfo *indices, Size2D pool_size) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON(pool_size.x() == 0); + ARM_COMPUTE_RETURN_ERROR_ON(pool_size.y() == 0); + + int pool_stride_x = 0; + int pool_stride_y = 0; + int output_width = 0; + int output_height = 0; + PoolingType pool_type = pool_info.pool_type; + const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; + const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + + std::tie(output_width, output_height) = scaled_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height], + pool_size.x(), pool_size.y(), pool_info.pad_stride_info); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1), "Calculated output dimension size is invalid"); + + TensorInfo out_info(TensorInfo(compute_pool_shape(*src, pool_info), 1, dst->data_type())); + std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); + + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); + if(indices) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32, DataType::F16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method"); + } + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_quantized(src->data_type())); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(src->data_type()) && !pool_info.exclude_padding && (pool_info.pool_type == PoolingType::AVG) && pool_info.pad_stride_info.has_padding() + && (src->data_layout() == DataLayout::NHWC), + "exclude_padding equal false is not supported for AVG Pooling with padding on quantized types"); + + if(dst->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &out_info); + if(indices) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2"); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(indices, &out_info); + } + } + + const auto *uk = get_implementation(src->data_type(), src->data_layout(), pool_stride_x, pool_size); + ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); + + return Status{}; +} + +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, ITensorInfo *indices, const PoolingLayerInfo &pool_info, + unsigned int &num_elems_processed_per_iteration, + BorderSize &border_size, + int pool_size_x, int pool_size_y) +{ + // dst auto inizialitation if not yet initialized + auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_pool_shape(*src, pool_info))); + if(indices) + { + // Indices auto inizialitation if not yet initialized + auto_init_if_empty(*indices, (src->clone()->set_tensor_shape(compute_pool_shape(*src, + pool_info))) + .set_data_type(DataType::U32) /* we store the offset to the element */); + } + const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; + unsigned int num_elems_read_per_iteration = 0; + unsigned int num_elems_horizontal_window = 0; + int pool_stride_x = 0; + int pool_stride_y = 0; + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const int src_width = src->dimension(idx_width); + const int src_height = src->dimension(idx_height); + const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; + std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); + const int pool_pad_right = pad_stride_info.pad_right(); + const int pool_pad_top = pad_stride_info.pad_top(); + const int pool_pad_left = pad_stride_info.pad_left(); + const int pool_pad_bottom = pad_stride_info.pad_bottom(); + const bool is_square = pool_size_x == pool_size_y; + const unsigned int pooled_w = dst->dimension(idx_width); + const unsigned int pooled_h = dst->dimension(idx_height); + + //If it's not squared and optimized will be executed the MxN + num_elems_read_per_iteration = 1; + num_elems_processed_per_iteration = 1; + num_elems_horizontal_window = 1; + + if(is_square) + { + switch(src->data_type()) + { + case DataType::QASYMM8: + case DataType::QASYMM8_SIGNED: + switch(pool_size_x) + { + case 2: + num_elems_read_per_iteration = 16; + num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15; + num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16; + break; + case 3: + num_elems_read_per_iteration = 16; + num_elems_processed_per_iteration = (pool_stride_x == 2) ? 7 : 14; + num_elems_horizontal_window = (pool_stride_x == 2) ? 8 : 16; + break; + default: + break; + } + break; +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + switch(pool_size_x) + { + case 2: + case 3: + num_elems_read_per_iteration = 4; + num_elems_processed_per_iteration = 1; + num_elems_horizontal_window = 1; + break; + default: + break; + } + break; +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + case DataType::F32: + switch(pool_size_x) + { + case 2: + num_elems_read_per_iteration = 2; + break; + case 3: + num_elems_read_per_iteration = 4; // We use vload4 for pooling3 + break; + case 7: + num_elems_read_per_iteration = 8; // We use vload8 for pooling7 + break; + default: + break; + } + num_elems_processed_per_iteration = 1; + num_elems_horizontal_window = 1; + break; + default: + ARM_COMPUTE_ERROR("Element size not supported"); + break; + } + } + + bool window_changed = false; + Window win{}; + if(data_layout == DataLayout::NCHW) + { + // Number of iterations in X dimension + const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration; + // Upper limit for the number of right/bottom border elements that are accessed + const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_left + num_elems_read_per_iteration) - src_width; + const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_top + pool_size_y) - src_height; + border_size = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left); + border_size.right = std::max(upper_bound_w, pool_pad_right); + border_size.bottom = std::max(upper_bound_h, pool_pad_bottom); + TensorShape dst_shape{ src->tensor_shape() }; + dst_shape.set(0, pooled_w); + dst_shape.set(1, pooled_h); + TensorInfo dst_info(src->clone()->set_tensor_shape(dst_shape)); + win = calculate_max_window(dst_info, Steps(num_elems_processed_per_iteration)); + AccessWindowStatic src_access(src, -pool_pad_left, -pool_pad_top, ceil_to_multiple(src_width + border_size.right, pool_size_x), src_height + border_size.bottom); + AccessWindowHorizontal dst_access(dst, 0, num_elems_horizontal_window); + if(indices) + { + AccessWindowHorizontal indices_access(indices, 0, num_elems_horizontal_window); + window_changed = update_window_and_padding(win, src_access, dst_access, indices_access); + } + else + { + window_changed = update_window_and_padding(win, src_access, dst_access); + } + dst_access.set_valid_region(win, ValidRegion(Coordinates(), dst->tensor_shape())); + + border_size = src->padding(); + } + + Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, win); +} +} // namespace + +BorderSize CpuPool2dKernel::border_size() const +{ + return _border_size; +} + +void CpuPool2dKernel::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; + const bool is_global_pooling = pool_info.is_global_pooling; + + // Get data layout + const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + + // Update pool size in case of global pooling + const Size2D pool_size( + is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width, + is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height); + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, pool_info, indices, pool_size)); + + const auto *uk = get_implementation(src->data_type(), src->data_layout(), pad_stride_info.stride().first, pool_size); + ARM_COMPUTE_ERROR_ON(uk == nullptr); + + // Set instance variables + _pool_info = pool_info; + _data_layout = src->data_layout(); + _pool_size = pool_size; + _pool_stride_x = pad_stride_info.stride().first; + _run_method = uk->ukernel; + _name = std::string("CpuPool2dKernel").append("/").append(uk->name); + + if(_data_layout == DataLayout::NHWC) + { + // Configure kernel window + Window win = calculate_max_window(*dst, Steps()); + ICpuKernel::configure(win); + } + else + { + // Configure kernel window + auto win_config = validate_and_configure_window(src, dst, indices, pool_info, _num_elems_processed_per_iteration, + _border_size, pool_size.x(), pool_size.y()); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICpuKernel::configure(win_config.second); + } +} + +Status CpuPool2dKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); + + unsigned int num_elems_processed_per_iteration = 0; + BorderSize border_size(0); + + const bool is_global_pooling = pool_info.is_global_pooling; + + // Get data layout + const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + + unsigned int pool_size_x = is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width; + unsigned int pool_size_y = is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height; + + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, pool_info, indices, Size2D(pool_size_x, pool_size_y))); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), + (indices) ? indices->clone().get() : nullptr, pool_info, num_elems_processed_per_iteration, border_size, + pool_size_x, pool_size_y) + .first); + + return Status{}; +} + +void CpuPool2dKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_run_method == nullptr); + + const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC_0); + ITensor *dst = tensors.get_tensor(TensorType::ACL_DST_0); + ITensor *indices = tensors.get_tensor(TensorType::ACL_DST_1); + + const unsigned int pool_stride_x = _pool_info.pad_stride_info.stride().first; + const unsigned int pool_stride_y = _pool_info.pad_stride_info.stride().second; + const unsigned int pool_size = _pool_info.pool_size.width; + + Window window_src(window); + if(_data_layout == DataLayout::NCHW) + { + // Set step for src in x and y direction for the src + unsigned int window_x_inc = 0; + switch(src->info()->data_type()) + { + case DataType::QASYMM8: + case DataType::QASYMM8_SIGNED: + { + window_x_inc = pool_stride_x; + if((pool_size == 2 || pool_size == 3) && pool_stride_x < 3) + { + window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration; + } + break; + } + + case DataType::F16: + case DataType::F32: + { + window_x_inc = pool_stride_x; + break; + } + default: + { + ARM_COMPUTE_ERROR("Not supported"); + } + } + window_src.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc)); + window_src.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y)); + } + else + { + window_src.set(Window::DimX, Window::Dimension(0, 1, 1)); + window_src.set(Window::DimY, Window::Dimension(0, src->info()->dimension(1), pool_stride_x)); + window_src.set(Window::DimZ, Window::Dimension(0, src->info()->dimension(2), pool_stride_y)); + } + _run_method(src, dst, indices, _pool_info, window_src, window); +} + +const char *CpuPool2dKernel::name() const +{ + return _name.c_str(); +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuPool2dKernel.h b/src/cpu/kernels/CpuPool2dKernel.h new file mode 100644 index 0000000000..70fe52d29c --- /dev/null +++ b/src/cpu/kernels/CpuPool2dKernel.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_POOL2D_KERNEL_H +#define ARM_COMPUTE_CPU_POOL2D_KERNEL_H + +#include "arm_compute/core/Types.h" +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Interface for the pooling layer kernel */ +class CpuPool2dKernel : public ICpuKernel +{ +public: + CpuPool2dKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPool2dKernel); + /** Configure kernel for a given list of arguments + * + * @note F16 are supported for pool sizes 2 and 3 only + * + * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[out] dst Destination tensor info. Data types supported: Same as @p src. + * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. + * @param[out] indices (optional) The indices of the maximal values. Data type supported: U32. + */ + void configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuPool2dKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + BorderSize border_size() const override; + const char *name() const override; + +private: + using PoolingKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, ITensor *, PoolingLayerInfo &, const Window &, const Window &)>::type; + +private: + PoolingLayerInfo _pool_info{}; + DataLayout _data_layout{ DataLayout::UNKNOWN }; + unsigned int _num_elems_processed_per_iteration{ 0 }; + BorderSize _border_size{ 0 }; + Size2D _pool_size{}; + int _pool_stride_x{}; + PoolingKernelPtr _run_method{ nullptr }; + std::string _name{}; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_POOL2D_KERNEL_H */ diff --git a/src/cpu/kernels/CpuQuantizeKernel.cpp b/src/cpu/kernels/CpuQuantizeKernel.cpp new file mode 100644 index 0000000000..ecae5e7b4e --- /dev/null +++ b/src/cpu/kernels/CpuQuantizeKernel.cpp @@ -0,0 +1,266 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuQuantizeKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include "src/core/CPP/Validate.h" + +#include <arm_neon.h> +#include <map> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +constexpr auto window_step = 16; + +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QASYMM16); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); + + return Status{}; +} + +template <typename T> +inline float32x4x4_t load_value(const T *input_ptr) +{ + using Tx16_t = typename wrapper::traits::neon_vector<T, 16>::type; + return arm_compute::convert_to_float32x4x4<Tx16_t>(wrapper::vloadq(input_ptr)); +} + +template <> +inline float32x4x4_t load_value(const float *input_ptr) +{ + return { wrapper::vloadq(input_ptr), + wrapper::vloadq(input_ptr + 4), + wrapper::vloadq(input_ptr + 8), + wrapper::vloadq(input_ptr + 12) }; +} +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +template <> +inline float32x4x4_t load_value(const float16_t *input_ptr) +{ + return { vcvt_f32_f16(wrapper::vload(input_ptr)), + vcvt_f32_f16(wrapper::vload(input_ptr + 4)), + vcvt_f32_f16(wrapper::vload(input_ptr + 8)), + vcvt_f32_f16(wrapper::vload(input_ptr + 12)) }; +} + +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +template <typename element_type> +using vector_type = wrapper::traits::neon_vector_t<element_type, window_step>; + +template <typename quantized_type> +vector_type<quantized_type> vquantize_qasymm8(const float32x4x4_t &qv, const UniformQuantizationInfo &qi); + +template <> +vector_type<uint8_t> vquantize_qasymm8<uint8_t>(const float32x4x4_t &qv, const UniformQuantizationInfo &qi) +{ + return vquantize(qv, qi); +} + +template <> +vector_type<int8_t> vquantize_qasymm8<int8_t>(const float32x4x4_t &qv, const UniformQuantizationInfo &qi) +{ + return vquantize_signed(qv, qi); +} + +} // namespace + +void CpuQuantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); + + static const std::map<std::string, QuantizeFunctionExecutorPtr> quant_map = + { + { "op_QASYMM8_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<uint8_t, uint8_t> }, + { "op_QASYMM8_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<uint8_t, int8_t> }, + { "op_QASYMM8_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<uint8_t> }, + + { "op_QASYMM8_SIGNED_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<int8_t, uint8_t> }, + { "op_QASYMM8_SIGNED_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<int8_t, int8_t> }, + { "op_QASYMM8_SIGNED_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<int8_t> }, + + { "op_F32_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<float, uint8_t> }, + { "op_F32_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<float, int8_t> }, + { "op_F32_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<float> }, + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + { "op_F16_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<float16_t, uint8_t> }, + { "op_F16_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<float16_t, int8_t> }, + { "op_F16_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<float16_t> }, +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC*/ + }; + + std::string function_to_call("op_"); + function_to_call += string_from_data_type(src->data_type()) + "_"; + function_to_call += string_from_data_type(dst->data_type()); + + auto it = quant_map.find(function_to_call); + + if(it == quant_map.end()) + { + ARM_COMPUTE_ERROR("Unsupported combination of input and output data types"); + } + _func = it->second; + + // Configure kernel window + Window win_config = calculate_max_window(*src, Steps()); + ICpuKernel::configure(win_config); +} + +Status CpuQuantizeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst)); + return Status{}; +} + +template <typename TIn, typename TOut> +void CpuQuantizeKernel::run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window) +{ + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform(); + UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform(); + if(is_data_type_quantized_asymmetric(src->info()->data_type())) + { + uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo); + } +#ifdef __aarch64__ + constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN; +#else //__aarch64__ + constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO; +#endif //__aarch64__ + + // Collapse window and reset first dimension to handle tail calculations manually + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + execute_window_loop(win_collapsed, [&](const Coordinates &) + { + auto input_ptr = reinterpret_cast<const TIn *>(input.ptr()); + auto output_ptr = reinterpret_cast<TOut *>(output.ptr()); + + int x = window_start_x; + for(; x <= (window_end_x - window_step); x += window_step) + { + wrapper::vstore(&output_ptr[x], vquantize_qasymm8<TOut>(load_value(&input_ptr[x]), uqinfo)); + } + // Compute left-over elements + for(; x < window_end_x; ++x) + { + output_ptr[x] = Qasymm8QuantizationHelper<TOut>::quantize(input_ptr[x], uqinfo, rounding_policy); + } + }, + input, output); +} + +template <typename T> +void CpuQuantizeKernel::run_quantize_qasymm16(const ITensor *src, ITensor *dst, const Window &window) +{ + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform(); + UniformQuantizationInfo uqinfo = dst->info()->quantization_info().uniform(); + if(is_data_type_quantized_asymmetric(src->info()->data_type())) + { + uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo); + } +#ifdef __aarch64__ + constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN; +#else //__aarch64__ + constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO; +#endif //__aarch64__ + + // Collapse window and reset first dimension to handle tail calculations manually + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + execute_window_loop(win_collapsed, [&](const Coordinates &) + { + auto input_ptr = reinterpret_cast<const T *>(input.ptr()); + auto output_ptr = reinterpret_cast<uint16_t *>(output.ptr()); + + int x = window_start_x; + for(; x <= (window_end_x - window_step); x += window_step) + { + uint16x8x2_t tmp = vquantize_qasymm16(load_value(&input_ptr[x]), uqinfo); + vst1q_u16(&output_ptr[x], tmp.val[0]); + vst1q_u16(&output_ptr[x + 8], tmp.val[1]); + } + // Compute left-over elements + for(; x < window_end_x; ++x) + { + output_ptr[x] = quantize_qasymm16(input_ptr[x], uqinfo, rounding_policy); + } + }, + input, output); +} + +void CpuQuantizeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + + const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + (this->*_func)(src, dst, window); +} + +const char *CpuQuantizeKernel::name() const +{ + return "CpuQuantizeKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute
\ No newline at end of file diff --git a/src/cpu/kernels/CpuQuantizeKernel.h b/src/cpu/kernels/CpuQuantizeKernel.h new file mode 100644 index 0000000000..eb0814926d --- /dev/null +++ b/src/cpu/kernels/CpuQuantizeKernel.h @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_QUANTIZE_KERNEL_H +#define ARM_COMPUTE_CPU_QUANTIZE_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Interface for the quantization layer kernel. + * + * @note The implementation supports only 3D input tensors + */ +class CpuQuantizeKernel : public ICpuKernel +{ +public: + CpuQuantizeKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuQuantizeKernel); + /** Set the input, output. + * + * @param[in] src Source tensor info. The dimensions over the third will be interpreted as batches. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16. + * @param[out] dst Destination tensor info with the same dimensions of input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16. + * + * @note Output auto initialization is not supported by this kernel + */ + void configure(const ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuQuantizeKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + /** Common signature for all the specialised @ref CpuQuantizeKernel functions + * + * @param[in] window Region on which to execute the kernel. + */ + using QuantizeFunctionExecutorPtr = void (CpuQuantizeKernel::*)(const ITensor *src, ITensor *dst, const Window &window); + /** Function to apply QASYMM8 or QASYMM8_SIGNED quantization on a tensor. + * + * @param[in] window Region on which to execute the kernel. + */ + template <typename TIn, typename TOut> + void run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window); + /** Function to apply QASYMM16 quantization on a tensor. + * + * @param[in] window Region on which to execute the kernel. + */ + template <typename T> + void run_quantize_qasymm16(const ITensor *src, ITensor *dst, const Window &window); + + QuantizeFunctionExecutorPtr _func{ nullptr }; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_QUANTIZE_KERNEL_H */ diff --git a/src/cpu/kernels/CpuReshapeKernel.cpp b/src/cpu/kernels/CpuReshapeKernel.cpp new file mode 100644 index 0000000000..3bbcc09cc5 --- /dev/null +++ b/src/cpu/kernels/CpuReshapeKernel.cpp @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuReshapeKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/INEKernel.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include <cstdint> + +/** [NEReshapeLayerKernel Kernel] **/ +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions. + ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); + + if(dst->tensor_shape().total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() != dst->tensor_shape().total_size()); + } + + return Status{}; +} + +template <typename T> +inline void reshape_tensor(const Window &window, const ITensor *src, ITensor *dst) +{ + const TensorShape &src_shape = src->info()->tensor_shape(); + const TensorShape &dst_shape = dst->info()->tensor_shape(); + Coordinates dst_coord{}; + + Iterator src_it(src, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + dst_coord = index2coords(dst_shape, coords2index(src_shape, id)); + *reinterpret_cast<T *>(dst->ptr_to_element(dst_coord)) = *reinterpret_cast<T *>(src_it.ptr()); + }, + src_it); +} +} // namespace + +void CpuReshapeKernel::configure(const ITensorInfo *src, ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); + ARM_COMPUTE_UNUSED(dst); + + // Configure kernel window + Window win = calculate_max_window(*src); + + ICpuKernel::configure(win); +} + +Status CpuReshapeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst)); + + return Status{}; +} + +void CpuReshapeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + switch(src->info()->data_type()) + { + case DataType::U8: + case DataType::S8: + case DataType::QASYMM8: + case DataType::QASYMM8_SIGNED: + reshape_tensor<uint8_t>(window, src, dst); + break; + case DataType::U16: + case DataType::S16: + case DataType::F16: + reshape_tensor<uint16_t>(window, src, dst); + break; + case DataType::U32: + case DataType::S32: + case DataType::F32: + reshape_tensor<uint32_t>(window, src, dst); + break; + default: + ARM_COMPUTE_ERROR("Unsupported data type!"); + } +} + +const char *CpuReshapeKernel::name() const +{ + return "CpuReshapeKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +/** [NEReshapeLayerKernel Kernel] **/ diff --git a/src/cpu/kernels/CpuReshapeKernel.h b/src/cpu/kernels/CpuReshapeKernel.h new file mode 100644 index 0000000000..9fe4350445 --- /dev/null +++ b/src/cpu/kernels/CpuReshapeKernel.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_RESHAPE_KERNEL_H +#define ARM_COMPUTE_CPU_RESHAPE_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Interface for the kernel to perform tensor reshaping */ +class CpuReshapeKernel : public ICpuKernel +{ +public: + CpuReshapeKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuReshapeKernel); + /** Configure kernel for a given list of arguments + * + * @param[in] src Source tensor info. Data type supported: All + * @param[out] dst Destination tensor info. Data type supported: Same as @p input + */ + void configure(const ITensorInfo *src, ITensorInfo *dst); + + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuReshapeKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_RESHAPE_KERNEL_H */ diff --git a/src/cpu/kernels/CpuScaleKernel.cpp b/src/cpu/kernels/CpuScaleKernel.cpp new file mode 100644 index 0000000000..1108c7a78e --- /dev/null +++ b/src/cpu/kernels/CpuScaleKernel.cpp @@ -0,0 +1,623 @@ +/* + * Copyright (c) 2016-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuScaleKernel.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/core/utils/misc/Utility.h" +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/common/Registrars.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/ScaleHelpers.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/utils/ScaleUtils.h" +#include "src/cpu/kernels/scale/neon/list.h" +#include "src/cpu/kernels/scale/sve/list.h" +#include "support/Rounding.h" + +#include <arm_neon.h> +#include <map> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +struct ScaleSelectorData +{ + DataType dt; + const CPUInfo &ci; +}; +using ScaleSelectorPtr = std::add_pointer<bool(const ScaleSelectorData &data)>::type; +using ScaleKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const ITensor *, const ITensor *, const ITensor *, + InterpolationPolicy, BorderMode, PixelValue, float, bool, const Window &)>::type; +struct ScaleKernel +{ + const char *name; + const ScaleSelectorPtr is_selected; + ScaleKernelPtr ukernel; +}; + +static const ScaleKernel available_kernels[] = +{ +#if defined(ARM_COMPUTE_ENABLE_SVE) + { + "sve_fp16_scale", + [](const ScaleSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_sve(); }, + REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_scale) + }, + { + "sve_fp32_scale", + [](const ScaleSelectorData & data) { return data.dt == DataType::F32 && data.ci.has_sve(); }, + REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_scale) + }, + { + "sve_qu8_scale", + [](const ScaleSelectorData & data) { return data.dt == DataType::QASYMM8 && data.ci.has_sve(); }, + REGISTER_QASYMM8_SVE(arm_compute::cpu::qasymm8_sve_scale) + }, + { + "sve_qs8_scale", + [](const ScaleSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED && data.ci.has_sve(); }, + REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::qasymm8_signed_sve_scale) + }, + { + "sve_u8_scale", + [](const ScaleSelectorData & data) { return data.dt == DataType::U8 && data.ci.has_sve(); }, + REGISTER_INTEGER_SVE(arm_compute::cpu::u8_sve_scale) + }, + { + "sve_s16_scale", + [](const ScaleSelectorData & data) { return data.dt == DataType::S16 && data.ci.has_sve(); }, + REGISTER_INTEGER_SVE(arm_compute::cpu::s16_sve_scale) + }, +#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ +#if defined(ARM_COMPUTE_ENABLE_NEON) +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + { + "neon_fp16_scale", + [](const ScaleSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_fp16(); }, + REGISTER_FP16_NEON(arm_compute::cpu::common_neon_scale<float16_t>) + }, +#endif /* !defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ + { + "neon_fp32_scale", + [](const ScaleSelectorData & data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::common_neon_scale<float>) + }, + { + "neon_qu8_scale", + [](const ScaleSelectorData & data) { return data.dt == DataType::QASYMM8; }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::qasymm8_neon_scale) + }, + { + "neon_qs8_scale", + [](const ScaleSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::qasymm8_signed_neon_scale) + }, + { + "neon_u8_scale", + [](const ScaleSelectorData & data) { return data.dt == DataType::U8; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::common_neon_scale<uint8_t>) + }, + { + "neon_s16_scale", + [](const ScaleSelectorData & data) { return data.dt == DataType::S16; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::common_neon_scale<int16_t>) + }, +#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */ +}; + +/** Micro-kernel selector + * + * @param[in] data Selection data passed to help pick the appropriate micro-kernel + * + * @return A matching micro-kernel else nullptr + */ +const ScaleKernel *get_implementation(const ScaleSelectorData &data) +{ + for(const auto &uk : available_kernels) + { + if(uk.is_selected(data)) + { + return &uk; + } + } + return nullptr; +} + +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dx, const ITensorInfo *dy, + const ITensorInfo *offsets, ITensorInfo *dst, const ScaleKernelInfo &info) +{ + const auto *uk = get_implementation(ScaleSelectorData{ src->data_type(), CPUInfo::get() }); + ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); + + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON(dst == src); + ARM_COMPUTE_RETURN_ERROR_ON(info.sampling_policy != SamplingPolicy::CENTER && info.sampling_policy != SamplingPolicy::TOP_LEFT); + ARM_COMPUTE_UNUSED(info.constant_border_value); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.use_padding, "Padding is not supported"); + + const DataLayout data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout; + const auto width_index = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const auto height_index = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const auto output_width = dst->dimension(width_index); + const auto output_height = dst->dimension(height_index); + ARM_COMPUTE_RETURN_ERROR_ON(output_width == 0); + ARM_COMPUTE_RETURN_ERROR_ON(output_height == 0); + + if(info.interpolation_policy == InterpolationPolicy::NEAREST_NEIGHBOR) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(offsets, 1, DataType::S32); + } + + if(info.interpolation_policy == InterpolationPolicy::BILINEAR) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(offsets, 1, DataType::S32); + if(dx != nullptr && dy != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dx, 1, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dy, 1, DataType::F32); + } + } + + ARM_COMPUTE_RETURN_ERROR_ON(info.align_corners && !scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy)); + + if(info.interpolation_policy == InterpolationPolicy::AREA) + { + ARM_COMPUTE_RETURN_ERROR_ON(data_layout != DataLayout::NCHW); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::U8); + } + + return Status{}; +} +} // namespace + +void CpuScaleKernel::configure(const ITensorInfo *src, const ITensorInfo *dx, const ITensorInfo *dy, const ITensorInfo *offsets, + ITensorInfo *dst, const ScaleKernelInfo &info) +{ + ARM_COMPUTE_UNUSED(dx, dy, offsets); + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, + dx, + dy, + offsets, + dst, + info)); + + const auto *uk = get_implementation(ScaleSelectorData{ src->data_type(), CPUInfo::get() }); + ARM_COMPUTE_ERROR_ON_NULLPTR(uk); + + _run_method = uk->ukernel; + _name = std::string("CpuScaleKernel").append("/").append(uk->name).append("_").append(string_from_interpolation_policy(info.interpolation_policy)); + + // Get data layout and width/height indices + _data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout; + const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); + + _policy = info.interpolation_policy; + _border_mode = info.border_mode; + _constant_border_value = info.constant_border_value; + _align_corners = info.align_corners; + + if(info.sampling_policy == SamplingPolicy::CENTER) + { + _sampling_offset = 0.5f; + } + + // Compute the ratio between source width/height and destination width/height + const auto wr = scale_utils::calculate_resize_ratio(src->dimension(idx_width), dst->dimension(idx_width), _align_corners); + const auto hr = scale_utils::calculate_resize_ratio(src->dimension(idx_height), dst->dimension(idx_height), _align_corners); + + // Area interpolation behaves as Nearest Neighbour in case of up-sampling + _policy = (_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ? InterpolationPolicy::NEAREST_NEIGHBOR : _policy; + + if(_border_mode == BorderMode::UNDEFINED) + { + _border_mode = BorderMode::CONSTANT; + _constant_border_value = PixelValue(); + } + +#ifdef ENABLE_NCHW_KERNELS + // Configure scale function to run + if(_data_layout == DataLayout::NCHW) + { + std::string function_to_call("scale_"); + function_to_call += string_from_data_type(src->data_type()) + "_"; + function_to_call += string_from_data_layout(_data_layout) + "_"; + function_to_call += string_from_interpolation_policy(_policy); + + static std::map<std::string, ScaleFunctionPtr> map_function = + { + { "scale_U8_NCHW_AREA_CONSTANT", &CpuScaleKernel::scale_area_nchw_u8 }, + + { "scale_U8_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw<uint8_t> }, + { "scale_U8_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<uint8_t> }, + + { "scale_QASYMM8_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_qasymm<uint8_t> }, + { "scale_QASYMM8_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<uint8_t> }, + + { "scale_QASYMM8_SIGNED_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_qasymm<int8_t> }, + { "scale_QASYMM8_SIGNED_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<int8_t> }, + + { "scale_S16_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw<int16_t> }, + { "scale_S16_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<int16_t> }, + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + { "scale_F16_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw<float16_t> }, + { "scale_F16_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<float16_t> }, +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + + { "scale_F32_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw<float> }, + { "scale_F32_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<float> }, + }; + auto it = map_function.find(function_to_call); + if(it != map_function.end()) + { + _func = it->second; + } + } +#endif // ENABLE_NCHW_KERNELS + + // Configure window + Window win = calculate_max_window(*dst, Steps()); + ICpuKernel::configure(win); +} + +#ifdef ENABLE_NCHW_KERNELS +template <typename T> +void CpuScaleKernel::scale_nearest_nchw(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window) +{ + ARM_COMPUTE_UNUSED(dx, dy); + const size_t in_stride_x = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; + + // Compute the ratio between source height and destination height + const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), _align_corners); + + // Don't increment in X and Y direction for the input tensor + // A pointer to the start of this plane is needed as base for the precomputed offsets + Window win_in(window); + win_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + + // Set offsets window + Window win_off; + win_off.set(Window::DimX, window[Window::DimX]); + win_off.set(Window::DimY, window[Window::DimY]); + for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) + { + win_off.set(d, Window::Dimension(0, 0, 0)); + } + + // Create iterators + Iterator src_i(src, win_in); + Iterator dst_i(dst, window); + Iterator offsets_i(offsets, win_off); + execute_window_loop(window, [&](const Coordinates & id) + { + const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets_i.ptr()); + const auto in_yi = static_cast<int32_t>(_align_corners ? utils::rounding::round_half_away_from_zero((id.y() + _sampling_offset) * hr) : std::floor(( + id.y() + _sampling_offset) + * hr)); + const int32_t offset_row = in_yi * in_stride_x; + *reinterpret_cast<T *>(dst_i.ptr()) = *(reinterpret_cast<const T *>(src_i.ptr()) + offsets_ptr[0] + offset_row); + }, + src_i, offsets_i, dst_i); +} + +template <typename T> +void CpuScaleKernel::scale_bilinear_nchw(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window) +{ + // Compute the ratio between source height and destination height + const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), _align_corners); + Window win_off; + win_off.set(Window::DimX, window.x()); + win_off.set(Window::DimY, window.y()); + + // Don't increment in X and Y direction for the input tensor + // A pointer to the start of this plane is needed as base for the precomputed offsets + Window win_in(window); + win_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + + for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) + { + win_off.set(d, Window::Dimension(0, 0, 0)); + } + + Iterator src_i(src, win_in); + Iterator dst_i(dst, window); + Iterator offsets_i(offsets, win_off); + Iterator dx_i(dx, win_off); + Iterator dy_i(dy, win_off); + + const int32_t in_dim_w = src->info()->dimension(0); + const int32_t in_dim_h = src->info()->dimension(1); + const int32_t in_stride_w = in_dim_w + src->info()->padding().left + src->info()->padding().right; + + if(_border_mode == BorderMode::CONSTANT) + { +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + using ConstType = typename std::conditional<std::is_same<T, float16_t>::value, half, T>::type; +#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + using ConstType = T; +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + const T const_border_value = static_cast<T>(_constant_border_value.get<ConstType>()); + execute_window_loop(window, [&](const Coordinates & id) + { + const int32_t index_h = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset); + const auto index_w = *(reinterpret_cast<const int32_t *>(offsets_i.ptr())); + const auto dx_val = *(reinterpret_cast<const float *>(dx_i.ptr())); + const auto dy_val = *(reinterpret_cast<const float *>(dy_i.ptr())); + const auto pixel_row_ptr = reinterpret_cast<const T *>(src_i.ptr()); + + const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) ? (*(pixel_row_ptr + index_w + index_h * in_stride_w)) : const_border_value; + const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) ? (*(pixel_row_ptr + index_w + 1 + index_h * in_stride_w)) : const_border_value; + const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h + && index_h < in_dim_h - 1) ? + (*(pixel_row_ptr + index_w + index_h * in_stride_w + in_stride_w)) : + const_border_value; + const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h + && index_h < in_dim_h - 1) ? + (*(pixel_row_ptr + index_w + 1 + index_h * in_stride_w + in_stride_w)) : + const_border_value; + + *reinterpret_cast<T *>(dst_i.ptr()) = static_cast<T>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); + }, + src_i, offsets_i, dx_i, dy_i, dst_i); + } + else if(_border_mode == BorderMode::REPLICATE) + { + execute_window_loop(window, [&](const Coordinates & id) + { + const int index_h = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset); + const auto index_w = *(reinterpret_cast<const int32_t *>(offsets_i.ptr())); + const auto dx_val = *(reinterpret_cast<const float *>(dx_i.ptr())); + const auto dy_val = *(reinterpret_cast<const float *>(dy_i.ptr())); + const auto pixel_row_ptr = reinterpret_cast<const T *>(src_i.ptr()); + + auto clamped_x = utility::clamp<int>(index_w, 0, in_dim_w - 1); + auto clamped_x1 = utility::clamp<int>(index_w + 1, 0, in_dim_w - 1); + auto clamped_y = utility::clamp<int>(index_h, 0, in_dim_h - 1); + auto clamped_y1 = utility::clamp<int>(index_h + 1, 0, in_dim_h - 1); + + const auto a00 = *(pixel_row_ptr + clamped_x + clamped_y * in_stride_w); + const auto a01 = *(pixel_row_ptr + clamped_x1 + clamped_y * in_stride_w); + const auto a10 = *(pixel_row_ptr + clamped_x + clamped_y1 * in_stride_w); + const auto a11 = *(pixel_row_ptr + clamped_x1 + clamped_y1 * in_stride_w); + + *reinterpret_cast<T *>(dst_i.ptr()) = static_cast<T>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); + }, + src_i, offsets_i, dx_i, dy_i, dst_i); + } + else + { + ARM_COMPUTE_ERROR("Not implemented"); + } +} + +void CpuScaleKernel::scale_area_nchw_u8(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window) +{ + ARM_COMPUTE_UNUSED(dx, dy, offsets); + using namespace scale_helpers; + + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::U8); + + // Don't increment in width/height/channels for the input tensor + // A pointer to the start of this plane is needed as base for the precomputed offsets + Window win_in(window); + win_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + win_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + Iterator src_i(src, win_in); + Iterator dst_i(dst, window); + + const auto wr = scale_utils::calculate_resize_ratio(src->info()->dimension(0), dst->info()->dimension(0), _align_corners); + const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), _align_corners); + const auto w = src->info()->dimension(0); + const auto h = src->info()->dimension(1); + const size_t in_stride = src->info()->strides_in_bytes()[1]; + + execute_window_loop(window, [&](const Coordinates & id) + { + const auto in_ptr = reinterpret_cast<const uint8_t *>(src_i.ptr()); + + uint8x8_t tmp0 = vdup_n_u8(0); + tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x(), id.y()), tmp0, 0); + tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 1, id.y()), tmp0, 1); + tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 2, id.y()), tmp0, 2); + tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 3, id.y()), tmp0, 3); + tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 4, id.y()), tmp0, 4); + tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 5, id.y()), tmp0, 5); + tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 6, id.y()), tmp0, 6); + tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 7, id.y()), tmp0, 7); + + uint8x8_t tmp1 = vdup_n_u8(0); + tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 8, id.y()), tmp1, 0); + tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 9, id.y()), tmp1, 1); + tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 10, id.y()), tmp1, 2); + tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 11, id.y()), tmp1, 3); + tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 12, id.y()), tmp1, 4); + tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 13, id.y()), tmp1, 5); + tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 14, id.y()), tmp1, 6); + tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 15, id.y()), tmp1, 7); + + vst1q_u8(dst_i.ptr(), vcombine_u8(tmp0, tmp1)); + }, + src_i, dst_i); +} + +template <typename T> +void CpuScaleKernel::scale_bilinear_qasymm(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window) +{ + // Get data layout and width/height indices + const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); + + // Compute the ratio between source height and destination height + const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(idx_height), dst->info()->dimension(idx_height), _align_corners); + Window win_off; + win_off.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_off.set(Window::DimY, Window::Dimension(0, 0, 0)); + + // Don't increment in X and Y direction for the input tensor + // A pointer to the start of this plane is needed as base for the precomputed offsets + Window win_in(window); + win_in.set(idx_width, Window::Dimension(0, 0, 0)); + win_in.set(idx_height, Window::Dimension(0, 0, 0)); + + for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) + { + win_off.set(d, Window::Dimension(0, 0, 0)); + } + + Iterator src_i(src, win_in); + Iterator dst_i(dst, window); + + const int32_t in_dim_w = src->info()->dimension(idx_width); + const int32_t in_dim_h = src->info()->dimension(idx_height); + const int32_t stride_w = src->info()->strides_in_bytes()[idx_width]; + const int32_t stride_h = src->info()->strides_in_bytes()[idx_height]; + + const UniformQuantizationInfo iq_info = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); + + if(_border_mode == BorderMode::CONSTANT) + { +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + using ConstType = typename std::conditional<std::is_same<T, float16_t>::value, half, T>::type; +#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + using ConstType = T; +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + const T const_border_value = static_cast<T>(_constant_border_value.get<ConstType>()); + execute_window_loop(window, [&](const Coordinates & id) + { + const int32_t index_h = std::floor((id[idx_height] + _sampling_offset) * hr - _sampling_offset); + const int32_t index_w = *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); + const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); + const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); + const auto pixel_row_ptr = reinterpret_cast<const T *>(src_i.ptr()); + + const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) ? + (*(pixel_row_ptr + index_w * stride_w + index_h * stride_h)) : + const_border_value; + const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) ? + (*(pixel_row_ptr + (index_w + 1) * stride_w + index_h * stride_h)) : + const_border_value; + const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1) ? + (*(pixel_row_ptr + index_w * stride_w + (index_h + 1) * stride_h)) : + const_border_value; + const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1) ? + (*(pixel_row_ptr + (index_w + 1) * stride_w + (index_h + 1) * stride_h)) : + const_border_value; + + const float inp00 = Qasymm8QuantizationHelper<T>::dequantize(a00, iq_info); + const float inp01 = Qasymm8QuantizationHelper<T>::dequantize(a01, iq_info); + const float inp10 = Qasymm8QuantizationHelper<T>::dequantize(a10, iq_info); + const float inp11 = Qasymm8QuantizationHelper<T>::dequantize(a11, iq_info); + *reinterpret_cast<T *>(dst_i.ptr()) = Qasymm8QuantizationHelper<T>::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); + }, + src_i, dst_i); + } + else if(_border_mode == BorderMode::REPLICATE) + { + execute_window_loop(window, [&](const Coordinates & id) + { + const int index_h = std::floor((id[idx_height] + _sampling_offset) * hr - _sampling_offset); + const int32_t index_w = *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); + const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); + const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); + const auto pixel_row_ptr = reinterpret_cast<const T *>(src_i.ptr()); + + auto clamped_w = utility::clamp<int>(index_w, 0, in_dim_w - 1); + auto clamped_w1 = utility::clamp<int>(index_w + 1, 0, in_dim_w - 1); + auto clamped_h = utility::clamp<int>(index_h, 0, in_dim_h - 1); + auto clamped_h1 = utility::clamp<int>(index_h + 1, 0, in_dim_h - 1); + + const auto a00 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h * stride_h); + const auto a01 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h * stride_h); + const auto a10 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h1 * stride_h); + const auto a11 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h1 * stride_h); + + const float inp00 = Qasymm8QuantizationHelper<T>::dequantize(a00, iq_info); + const float inp01 = Qasymm8QuantizationHelper<T>::dequantize(a01, iq_info); + const float inp10 = Qasymm8QuantizationHelper<T>::dequantize(a10, iq_info); + const float inp11 = Qasymm8QuantizationHelper<T>::dequantize(a11, iq_info); + *reinterpret_cast<T *>(dst_i.ptr()) = Qasymm8QuantizationHelper<T>::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); + }, + src_i, dst_i); + } + else + { + ARM_COMPUTE_ERROR("Not implemented"); + } +} +#endif // ENABLE_NCHW_KERNELS + +Status CpuScaleKernel::validate(const ITensorInfo *input, const ITensorInfo *dx, const ITensorInfo *dy, + const ITensorInfo *offsets, ITensorInfo *output, const ScaleKernelInfo &info) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, dx, dy, offsets, output, info)); + return Status{}; +} + +void CpuScaleKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_func == nullptr && _data_layout == DataLayout::NCHW); + ARM_COMPUTE_ERROR_ON(_run_method == nullptr && _data_layout == DataLayout::NHWC); + + const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + const auto dx = tensors.get_const_tensor(TensorType::ACL_INT_0); + const auto dy = tensors.get_const_tensor(TensorType::ACL_INT_1); + const auto offsets = tensors.get_const_tensor(TensorType::ACL_INT_2); + + if(_data_layout == DataLayout::NCHW) + { + (this->*_func)(src, dst, dx, dy, offsets, window); + } + else + { + _run_method(src, dst, offsets, dx, dy, _policy, _border_mode, _constant_border_value, _sampling_offset, _align_corners, window); + } +} + +const char *CpuScaleKernel::name() const +{ + return _name.c_str(); +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuScaleKernel.h b/src/cpu/kernels/CpuScaleKernel.h new file mode 100644 index 0000000000..913b5a5593 --- /dev/null +++ b/src/cpu/kernels/CpuScaleKernel.h @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2016-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_SCALEKERNEL_H +#define ARM_COMPUTE_CPU_SCALEKERNEL_H + +#include "arm_compute/core/KernelDescriptors.h" +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Arm(R) Neon(TM) kernel to perform scaling on a tensor */ +class CpuScaleKernel : public ICpuKernel +{ +public: + CpuScaleKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuScaleKernel); + /** Initialise the kernel's inputs, output and interpolation policy + * + * @note dx, dy and offsets have the same dimensions (width and height) of the output tensor + * @note Using @p policy Area only supports data layout NCHW and input data type U8. + * + * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32. + * @param[in] dx Distance x tensor info. Pixel's distance between the X real coordinate and the smallest X following integer. Data type supported: F32 + * @param[in] dy Distance y tensor info. Pixel's distance between the Y real coordinate and the smallest Y following integer. Data type supported: F32 + * @param[in] offsets Offset tensor info. Offset to access the pixel with NEAREST interpolation or the top-left pixel with BILINEAR interpolation in the input tensor. Data type supported: S32. + * @param[out] dst Destination tensor info. Data types supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane. + * @param[in] info @ref ScaleKernelInfo to use for configuration + */ + void configure(const ITensorInfo *src, const ITensorInfo *dx, const ITensorInfo *dy, const ITensorInfo *offsets, ITensorInfo *dst, + const ScaleKernelInfo &info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuScaleKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dx, const ITensorInfo *dy, const ITensorInfo *offsets, ITensorInfo *dst, + const ScaleKernelInfo &info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: +#ifdef ENABLE_NCHW_KERNELS + /** function to perform scale using area interpolation on the given window + * + * @note Used only in case down-sampling. + */ + void scale_area_nchw_u8(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window); + + /** function to perform scale using bilinear interpolation on the given window */ + template <typename T> + void scale_bilinear_nchw(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window); + /** function to perform scale using bilinear interpolation on the given window */ + template <typename T> + void scale_bilinear_qasymm(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window); + + /** function to perform scale using nearest neighbour on the given window */ + template <typename T> + void scale_nearest_nchw(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window); +#endif // ENABLE_NCHW_KERNELS + + /** Scale function to use for the particular function to use */ + using ScaleFunctionPtr = void (CpuScaleKernel::*)(const ITensor *, ITensor *, const ITensor *, const ITensor *, const ITensor *, const Window &window); + using ScaleKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const ITensor *, const ITensor *, const ITensor *, + InterpolationPolicy, BorderMode, PixelValue, float, bool, const Window &)>::type; + + ScaleFunctionPtr _func{ nullptr }; + InterpolationPolicy _policy{}; + BorderMode _border_mode{}; + PixelValue _constant_border_value{}; + float _sampling_offset{ 0 }; + bool _align_corners{ false }; + DataLayout _data_layout{ DataLayout::UNKNOWN }; + ScaleKernelPtr _run_method{ nullptr }; + std::string _name{}; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_SCALEKERNEL_H */ diff --git a/src/cpu/kernels/CpuSoftmaxKernel.cpp b/src/cpu/kernels/CpuSoftmaxKernel.cpp new file mode 100644 index 0000000000..cbf3773ddc --- /dev/null +++ b/src/cpu/kernels/CpuSoftmaxKernel.cpp @@ -0,0 +1,378 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuSoftmaxKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include "src/core/common/Registrars.h" +#include "src/cpu/kernels/softmax/impl/neon/list.h" +#include "src/cpu/kernels/softmax/impl/sve/list.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +struct SoftmaxSelectorData +{ + DataType dt; + const CPUInfo &ci; +}; +using SoftmaxSelectorPtr = std::add_pointer<bool(const SoftmaxSelectorData &data)>::type; +using SoftmaxLogits1DMaxKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const Window &)>::type; +using SoftmaxLogits1DKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, void *const, ITensor *, float, bool, const Window &)>::type; + +struct SoftmaxLogits1DKernel +{ + const char *name; + const SoftmaxSelectorPtr is_selected; + SoftmaxLogits1DKernelPtr ukernel; +}; + +struct SoftmaxLogits1DMaxKernel +{ + const char *name; + const SoftmaxSelectorPtr is_selected; + SoftmaxLogits1DMaxKernelPtr ukernel; +}; + +static const SoftmaxLogits1DKernel available_logits_1d_kernels[] = +{ +#if defined(ARM_COMPUTE_ENABLE_SVE) + { + "sve_fp32_softmax_logits_1d", + [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32) && data.ci.has_sve(); }, + REGISTER_FP32_SVE(arm_compute::cpu::sve_softmax_logits_1d_float<float>) + }, + { + "sve_fp16_softmax_logits_1d", + [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16) && data.ci.has_sve(); }, + REGISTER_FP16_SVE(arm_compute::cpu::sve_softmax_logits_1d_float<float16_t>) + }, +#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ + +#if defined(ARM_COMPUTE_ENABLE_NEON) + { + "neon_fp32_softmax_logits_1d", + [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32); }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_softmax_logits_1d_float<float>) + }, +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + { + "neon_fp16_softmax_logits_1d", + [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16); }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_softmax_logits_1d_float<float16_t>) + }, +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ +#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */ + +#if defined(ARM_COMPUTE_ENABLE_SVE2) + { + "sve2_qu8_softmax_logits_1d", + [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8) && data.ci.has_sve2(); }, + REGISTER_QASYMM8_SVE(arm_compute::cpu::sve_softmax_logits_1d_quantized<qasymm8_t>) + }, + { + "sve2_qs8_softmax_logits_1d", + [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.ci.has_sve2(); }, + REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::sve_softmax_logits_1d_quantized<qasymm8_signed_t>) + }, +#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ + { + "neon_qu8_softmax_logits_1d", + [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_softmax_logits_1d_quantized<qasymm8_t>) + }, + { + "neon_qs8_softmax_logits_1d", + [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_softmax_logits_1d_quantized<qasymm8_signed_t>) + }, +}; + +static const SoftmaxLogits1DMaxKernel available_logits_1d_max_kernels[] = +{ +#if defined(ARM_COMPUTE_ENABLE_SVE) + { + "sve_fp32_logits_1d_max", + [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32) && data.ci.has_sve(); }, + REGISTER_FP32_SVE(arm_compute::cpu::sve_logits_1d_max<float>) + }, + { + "sve_fp16_logits_1d_max", + [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16) && data.ci.has_sve(); }, + REGISTER_FP16_SVE(arm_compute::cpu::sve_logits_1d_max<float16_t>) + }, + { + "sve_qu8_logits_1d_max", + [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8) && data.ci.has_sve(); }, + REGISTER_QASYMM8_SVE(arm_compute::cpu::sve_logits_1d_max<qasymm8_t>) + }, + { + "sve_qs8_logits_1d_max", + [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.ci.has_sve(); }, + REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::sve_logits_1d_max<qasymm8_signed_t>) + }, +#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ +#if defined(ARM_COMPUTE_ENABLE_NEON) + { + "neon_fp32_logits_1d_max", + [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32); }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_logits_1d_max<float>) + }, +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + { + "neon_fp16_logits_1d_max", + [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16); }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_logits_1d_max<float16_t>) + }, +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ + { + "neon_qu8_logits_1d_max", + [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_logits_1d_max<qasymm8_t>) + }, + { + "neon_qs8_logits_1d_max", + [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_logits_1d_max<qasymm8_signed_t>) + }, +#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */ +}; + +const SoftmaxLogits1DKernel *get_implementation_logits(const SoftmaxSelectorData &data) +{ + for(const auto &uk : available_logits_1d_kernels) + { + if(uk.is_selected({ data.dt, CPUInfo::get() })) + { + return &uk; + } + } + return nullptr; +} + +const SoftmaxLogits1DMaxKernel *get_implementation_logits_max(const SoftmaxSelectorData &data) +{ + for(const auto &uk : available_logits_1d_max_kernels) + { + if(uk.is_selected({ data.dt, CPUInfo::get() })) + { + return &uk; + } + } + return nullptr; +} + +Status validate_arguments_logits_1d_max(const ITensorInfo &input, const ITensorInfo &output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + + // Validate in case of configured output + if(output.total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input, &output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output.tensor_shape(), TensorShape(input.tensor_shape()).set(0, 1)); + } + + return Status{}; +} + +} // namespace + +void CpuLogits1DMaxKernel::configure(const ITensorInfo *src, ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_1d_max(*src, *dst)); + + // Softmax across the x dimension + const TensorShape output_shape = TensorShape(src->tensor_shape()).set(0, 1); + // Output auto initialization if not yet initialized + auto_init_if_empty(*dst, output_shape, 1, src->data_type(), src->quantization_info()); + + const auto *uk = get_implementation_logits_max(SoftmaxSelectorData{ src->data_type(), CPUInfo::get() }); + ARM_COMPUTE_ERROR_ON_NULLPTR(uk); + + _run_method = uk->ukernel; + _name = std::string("CpuLogits1DMaxKernel").append("/").append(uk->name); + + Window win = calculate_max_window(*src, Steps()); + ICpuKernel::configure(win); +} + +Status CpuLogits1DMaxKernel::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_1d_max(*src, *dst)); + + return Status{}; +} + +void CpuLogits1DMaxKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_run_method == nullptr); + + const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + _run_method(src, dst, window); +} + +const char *CpuLogits1DMaxKernel::name() const +{ + return _name.c_str(); +} + +namespace +{ +Status validate_arguments_logits_softmax(const ITensorInfo &src, const ITensorInfo &max, + const ITensorInfo &dst, const float beta, const ITensorInfo &tmp, bool is_log) +{ + ARM_COMPUTE_UNUSED(beta); + // Check input + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + + const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src.data_type()); + + // Check max + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &max); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(TensorShape(src.tensor_shape()).set(0, 1), max.tensor_shape()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&src, &max); + + // Check output if configured + if(dst.total_size() != 0) + { + const QuantizationInfo output_quantization = is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src.data_type(), is_log) : dst.quantization_info(); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &dst); + ARM_COMPUTE_RETURN_ERROR_ON(dst.quantization_info() != output_quantization); + } + + // Check tmp if configured + if(tmp.total_size() != 0) + { + const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : src.data_type(); + ARM_COMPUTE_RETURN_ERROR_ON(tmp.data_type() != tmp_data_type); + // We could potentially reduce tmp memory if we could predict or make an assumption + // on the maximum number of threads that will run in parallel. + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &tmp); + } + + return Status{}; +} +} // namespace + +template <bool IS_LOG> +void CpuLogits1DSoftmaxKernel<IS_LOG>::configure(const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_softmax(*src, *max, *dst, beta, *tmp, IS_LOG)); + + // Configure kernel window + const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src->data_type()); + + // Output auto initialization if not yet initialized + const QuantizationInfo output_quantization = is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src->data_type(), IS_LOG) : dst->quantization_info(); + auto_init_if_empty(*dst, TensorInfo(*src).set_quantization_info(output_quantization).reset_padding()); + + // Tmp auto initialization if not yet initialized + const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : src->data_type(); + auto_init_if_empty(*tmp, TensorInfo(*src).set_data_type(tmp_data_type).reset_padding()); + + const auto *uk = get_implementation_logits(SoftmaxSelectorData{ src->data_type(), CPUInfo::get() }); + ARM_COMPUTE_ERROR_ON_NULLPTR(uk); + + std::string kernel_name = IS_LOG ? std::string("CpuLogits1DLogSoftmaxKernel") : std::string("CpuLogits1DSoftmaxKernel"); + + _beta = beta; + _run_method = uk->ukernel; + _name = kernel_name.append("/").append(uk->name); + + // Configure kernel window + Window win = calculate_max_window(*max, Steps()); + + ICpuKernel::configure(win); +} + +template <bool IS_LOG> +Status CpuLogits1DSoftmaxKernel<IS_LOG>::validate(const ITensorInfo *src, const ITensorInfo *max, + const ITensorInfo *dst, const float beta, const ITensorInfo *tmp) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_softmax(*src, *max, *dst, beta, *tmp, IS_LOG)); + + return Status{}; +} + +template <bool IS_LOG> +void CpuLogits1DSoftmaxKernel<IS_LOG>::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_run_method == nullptr); + + const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); + auto max = tensors.get_tensor(TensorType::ACL_SRC_1); + auto dst = tensors.get_tensor(TensorType::ACL_DST_0); + auto tmp = tensors.get_tensor(TensorType::ACL_DST_1); + + const unsigned int num_elems_processed_per_iteration = src->info()->valid_region().shape.x(); + const unsigned int tmp_size_for_thread = tmp->info()->element_size() * num_elems_processed_per_iteration; + + ARM_COMPUTE_ERROR_ON(tmp->info()->total_size() < (info.num_threads * tmp_size_for_thread)); + + void *tmp_for_thread = tmp->buffer() + (info.thread_id * tmp_size_for_thread); + _run_method(src, max, tmp_for_thread, dst, _beta, IS_LOG, window); +} + +template <bool IS_LOG> +const char *CpuLogits1DSoftmaxKernel<IS_LOG>::name() const +{ + return _name.c_str(); +} + +template class CpuLogits1DSoftmaxKernel<true>; +template class CpuLogits1DSoftmaxKernel<false>; + +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuSoftmaxKernel.h b/src/cpu/kernels/CpuSoftmaxKernel.h new file mode 100644 index 0000000000..8073a677d9 --- /dev/null +++ b/src/cpu/kernels/CpuSoftmaxKernel.h @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_SOFTMAX_KERNEL_H +#define ARM_COMPUTE_CPU_SOFTMAX_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Interface for the identifying the max value of 1D Logits */ +class CpuLogits1DMaxKernel : public ICpuKernel +{ +public: + CpuLogits1DMaxKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuLogits1DMaxKernel); + /** Set the input and output tensors. + * + * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[out] dst Destination tensor info. Data types supported: same as @p input + */ + void configure(const ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuLogits1DMaxKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + using SoftmaxLogits1DMaxKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const Window &)>::type; + +private: + SoftmaxLogits1DMaxKernelPtr _run_method{ nullptr }; + std::string _name{}; +}; + +/** Interface for softmax computation for QASYMM8 with pre-computed max. */ +template <bool IS_LOG = false> +class CpuLogits1DSoftmaxKernel : public ICpuKernel +{ +public: + CpuLogits1DSoftmaxKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuLogits1DSoftmaxKernel); + + /** Set the input and output tensors. + * + * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] max Max values tensor info. Same shape as input with dimension 0 set to 1. + * Data types supported: same as @p input. + * @param[out] dst Destination tensor info. Data types supported: same as @p input. + * @param[in] beta A scaling factor for the exponent. + * + * @param tmp Auxiliary tensor info. Must be type F32 and same shape as the input. + */ + void configure(const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuLogits1DSoftmaxKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *max, + const ITensorInfo *dst, const float beta, const ITensorInfo *tmp); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + using SoftmaxLogits1DKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, void *const, ITensor *, float, bool, const Window &)>::type; + +private: + float _beta{ 1.0f }; + SoftmaxLogits1DKernelPtr _run_method{ nullptr }; + std::string _name{}; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_SOFTMAX_KERNEL_H */ diff --git a/src/cpu/kernels/CpuSubKernel.cpp b/src/cpu/kernels/CpuSubKernel.cpp new file mode 100644 index 0000000000..ec65f12dfc --- /dev/null +++ b/src/cpu/kernels/CpuSubKernel.cpp @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuSubKernel.h" + +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "src/core/CPP/Validate.h" +#include "src/core/common/Registrars.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/cpu/kernels/sub/neon/list.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +struct SubSelectorData +{ + DataType dt; +}; + +using SubSelectorPtr = std::add_pointer<bool(const SubSelectorData &data)>::type; +using SubKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, ITensor *, const ConvertPolicy &, const Window &)>::type; + +struct SubKernel +{ + const char *name; + const SubSelectorPtr is_selected; + SubKernelPtr ukernel; +}; + +static const SubKernel available_kernels[] = +{ + { + "neon_fp32_sub", + [](const SubSelectorData & data) { return (data.dt == DataType::F32); }, + REGISTER_FP32_NEON(arm_compute::cpu::sub_same_neon<float>) + }, +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + { + "neon_fp16_sub", + [](const SubSelectorData & data) { return (data.dt == DataType::F16); }, + REGISTER_FP16_NEON(arm_compute::cpu::sub_same_neon<float16_t>) + }, +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ + { + "neon_u8_sub", + [](const SubSelectorData & data) { return (data.dt == DataType::U8); }, + REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon<uint8_t>) + }, + { + "neon_s16_sub", + [](const SubSelectorData & data) { return (data.dt == DataType::S16); }, + REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon<int16_t>) + }, + { + "neon_s32_sub", + [](const SubSelectorData & data) { return (data.dt == DataType::S32); }, + REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon<int32_t>) + }, + { + "neon_qu8_sub", + [](const SubSelectorData & data) { return (data.dt == DataType::QASYMM8); }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::sub_qasymm8_neon) + }, + { + "neon_qs8_sub", + [](const SubSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::sub_qasymm8_signed_neon) + }, + { + "neon_qs16_sub", + [](const SubSelectorData & data) { return (data.dt == DataType::QSYMM16); }, + REGISTER_QSYMM16_NEON(arm_compute::cpu::sub_qsymm16_neon) + }, +}; + +/** Micro-kernel selector + * + * @param[in] data Selection data passed to help pick the appropriate micro-kernel + * + * @return A matching micro-kernel else nullptr + */ +const SubKernel *get_implementation(DataType dt) +{ + for(const auto &uk : available_kernels) + { + if(uk.is_selected({ dt })) + { + return &uk; + } + } + return nullptr; +} + +inline Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst, ConvertPolicy policy) +{ + ARM_COMPUTE_UNUSED(policy); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src0); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16, DataType::S32, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &src1); + + const auto *uk = get_implementation(src0.data_type()); + ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); + + const TensorShape out_shape = TensorShape::broadcast_shape(src0.tensor_shape(), src1.tensor_shape()); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(src0.data_type()) && (policy == ConvertPolicy::WRAP), + "Convert policy cannot be WRAP if datatype is quantized"); + + // Validate in case of configured dst + if(dst.total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &dst); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0), + "Wrong shape for dst"); + } + return Status{}; +} +} // namespace + +void CpuSubKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst, policy)); + + const TensorShape &out_shape = TensorShape::broadcast_shape(src0->tensor_shape(), src1->tensor_shape()); + + // Auto initialize dst if not initialized + set_shape_if_empty(*dst, out_shape); + set_data_type_if_unknown(*dst, src0->data_type()); + + const auto *uk = get_implementation(src0->data_type()); + ARM_COMPUTE_ERROR_ON_NULLPTR(uk); + + _policy = policy; + _run_method = uk->ukernel; + _name = std::string("CpuSubKernel").append("/").append(uk->name); + + // CpuSubKernel doesn't need padding so update_window_and_padding() can be skipped + Window win = calculate_max_window(out_shape, Steps()); + + ICpuKernel::configure(win); +} + +Status CpuSubKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst, policy)); + + return Status{}; +} + +void CpuSubKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_run_method == nullptr); + + const ITensor *src0 = tensors.get_const_tensor(TensorType::ACL_SRC_0); + const ITensor *src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1); + ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); + + _run_method(src0, src1, dst, _policy, window); +} + +const char *CpuSubKernel::name() const +{ + return _name.c_str(); +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuSubKernel.h b/src/cpu/kernels/CpuSubKernel.h new file mode 100644 index 0000000000..80d6be68b5 --- /dev/null +++ b/src/cpu/kernels/CpuSubKernel.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2016-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_SUB_KERNEL_H +#define ARM_COMPUTE_CPU_SUB_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Interface for the kernel to perform subtraction between two tensors */ +class CpuSubKernel : public ICpuKernel +{ +public: + CpuSubKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuSubKernel); + + /** Initialise the kernel's src and dst. + * + * Valid configurations (src0,src1) -> dst : + * + * - (U8,U8) -> U8 + * - (QASYMM8, QASYMM8) -> QASYMM8 + * - (QASYMM8_SIGNED, QASYMM8_SIGNED) -> QASYMM8_SIGNED + * - (S16,S16) -> S16 + * - (S32,S32) -> S32 + * - (F16,F16) -> F16 + * - (F32,F32) -> F32 + * + * @param[in] src0 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 + * @param[in] src1 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 + * @param[out] dst The dst tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32. + * @param[in] policy Overflow policy. Convert policy cannot be WRAP if datatype is quantized. + */ + void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuSubKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + +private: + using SubKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, ITensor *, const ConvertPolicy &, const Window &)>::type; + +private: + ConvertPolicy _policy{}; + SubKernelPtr _run_method{ nullptr }; + std::string _name{}; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_SUB_KERNEL_H */ diff --git a/src/cpu/kernels/CpuTransposeKernel.cpp b/src/cpu/kernels/CpuTransposeKernel.cpp new file mode 100644 index 0000000000..2f981c15e4 --- /dev/null +++ b/src/cpu/kernels/CpuTransposeKernel.cpp @@ -0,0 +1,510 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuTransposeKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +unsigned int num_elems_processed(size_t element_size) +{ + switch(element_size) + { + case 1: + return 8; + case 2: + case 4: + return 4; + default: + break; + } + + ARM_COMPUTE_ERROR("Element size not supported"); +} + +void transpose_8bit_elements(const ITensor *in, ITensor *out, const Window &window) +{ + const int window_step_x = 8; + const int window_step_y = 8; + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_start_y = window.y().start(); + const int window_end_y = std::min(window.y().end(), static_cast<int>(in->info()->dimension(1))); + const int window_end_y_multiple_of = ((window_end_y - window_start_y) / window_step_y) * window_step_y; + const size_t input_stride_in_bytes = in->info()->strides_in_bytes()[1]; + const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1]; + + // Check if we need a left-over loop for the y dimension + bool left_over_loop_y = (((window_end_y - window_start_y) % window_step_y) != 0); + + Window window_in(window); + window_in.set(Window::DimX, Window::Dimension(0, 1, 1)); + if(left_over_loop_y) + { + // Check if window_end_y_multiple_of is greater than window_start_y + if(window_end_y_multiple_of > window_start_y) + { + window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y)); + } + else + { + window_in.set(Window::DimY, Window::Dimension(0, 0, 1)); + } + } + + Window window_out(window); + window_out.set(Window::DimX, Window::Dimension(0, 0, 0)); + window_out.set(Window::DimY, Window::Dimension(0, 0, 0)); + + Iterator output(out, window_out); + + // Run the SIMD path if and only if the input is not a row-vector + if(in->info()->dimension(1) != 1) + { + Iterator input(in, window_in); + execute_window_loop(window_in, [&](const Coordinates & id) + { + // Compute 8x8 elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x8_t row0 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 0 * input_stride_in_bytes)); + const uint8x8_t row1 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 1 * input_stride_in_bytes)); + const uint8x8_t row2 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 2 * input_stride_in_bytes)); + const uint8x8_t row3 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 3 * input_stride_in_bytes)); + const uint8x8_t row4 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 4 * input_stride_in_bytes)); + const uint8x8_t row5 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 5 * input_stride_in_bytes)); + const uint8x8_t row6 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 6 * input_stride_in_bytes)); + const uint8x8_t row7 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 7 * input_stride_in_bytes)); + + // Transpose 2x2 + const uint8x8x2_t k0_u8 = vtrn_u8(row0, row1); + const uint8x8x2_t k1_u8 = vtrn_u8(row2, row3); + const uint8x8x2_t k2_u8 = vtrn_u8(row4, row5); + const uint8x8x2_t k3_u8 = vtrn_u8(row6, row7); + + // Transpose 4x4 + const uint16x4x2_t k0_u16 = vtrn_u16(vreinterpret_u16_u8(k0_u8.val[0]), vreinterpret_u16_u8(k1_u8.val[0])); + const uint16x4x2_t k1_u16 = vtrn_u16(vreinterpret_u16_u8(k0_u8.val[1]), vreinterpret_u16_u8(k1_u8.val[1])); + const uint16x4x2_t k2_u16 = vtrn_u16(vreinterpret_u16_u8(k2_u8.val[0]), vreinterpret_u16_u8(k3_u8.val[0])); + const uint16x4x2_t k3_u16 = vtrn_u16(vreinterpret_u16_u8(k2_u8.val[1]), vreinterpret_u16_u8(k3_u8.val[1])); + + // Transpose 8x8 + const uint32x2x2_t k0_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k2_u16.val[0])); + const uint32x2x2_t k1_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k2_u16.val[1])); + const uint32x2x2_t k2_u32 = vtrn_u32(vreinterpret_u32_u16(k1_u16.val[0]), vreinterpret_u32_u16(k3_u16.val[0])); + const uint32x2x2_t k3_u32 = vtrn_u32(vreinterpret_u32_u16(k1_u16.val[1]), vreinterpret_u32_u16(k3_u16.val[1])); + + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + x * output_stride_in_bytes; + + vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[0]))); + vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[0]))); + vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[0]))); + vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[0]))); + vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 4 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[1]))); + vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 5 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[1]))); + vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 6 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[1]))); + vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 7 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[1]))); + } + + // Compute left-over elements along the x dimension (1x8) + for(; x < window_end_x; ++x) + { + const uint8_t val0 = *(input.ptr() + x + 0 * input_stride_in_bytes); + const uint8_t val1 = *(input.ptr() + x + 1 * input_stride_in_bytes); + const uint8_t val2 = *(input.ptr() + x + 2 * input_stride_in_bytes); + const uint8_t val3 = *(input.ptr() + x + 3 * input_stride_in_bytes); + const uint8_t val4 = *(input.ptr() + x + 4 * input_stride_in_bytes); + const uint8_t val5 = *(input.ptr() + x + 5 * input_stride_in_bytes); + const uint8_t val6 = *(input.ptr() + x + 6 * input_stride_in_bytes); + const uint8_t val7 = *(input.ptr() + x + 7 * input_stride_in_bytes); + + uint8x8_t result = vdup_n_u8(0); + result = vset_lane_u8(val0, result, 0); + result = vset_lane_u8(val1, result, 1); + result = vset_lane_u8(val2, result, 2); + result = vset_lane_u8(val3, result, 3); + result = vset_lane_u8(val4, result, 4); + result = vset_lane_u8(val5, result, 5); + result = vset_lane_u8(val6, result, 6); + result = vset_lane_u8(val7, result, 7); + + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + x * output_stride_in_bytes; + + vst1_u8(output.ptr() + dst_offset_in_bytes, result); + } + }, + input, output); + } + + if(left_over_loop_y) + { + window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1)); + window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1)); + + Iterator input(in, window_in); + Iterator output(out, window_out); + + // Compute left-over elements along the y dimension (1x1) + execute_window_loop(window_in, [&](const Coordinates & id) + { + const uint8_t val0 = *input.ptr(); + + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + id.x() * output_stride_in_bytes; + + *(output.ptr() + dst_offset_in_bytes) = val0; + }, + input, output); + } +} + +void transpose_16bit_elements(const ITensor *in, ITensor *out, const Window &window) +{ + const int window_step_x = 4; + const int window_step_y = 4; + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_start_y = window.y().start(); + const int window_end_y = std::min(window.y().end(), static_cast<int>(in->info()->dimension(1))); + const int window_end_y_multiple_of = ((window_end_y - window_start_y) / window_step_y) * window_step_y; + const size_t input_stride_in_bytes = in->info()->strides_in_bytes()[1]; + const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1]; + + // Check if we need a left-over loop for the y dimension + bool left_over_loop_y = (((window_end_y - window_start_y) % window_step_y) != 0); + + Window window_in(window); + window_in.set(Window::DimX, Window::Dimension(0, 1, 1)); + if(left_over_loop_y) + { + // Check if window_end_y_multiple_of is greater than window_start_y + if(window_end_y_multiple_of > window_start_y) + { + window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y)); + } + else + { + window_in.set(Window::DimY, Window::Dimension(0, 0, 1)); + } + } + + Window window_out(window); + window_out.set(Window::DimX, Window::Dimension(0, 0, 0)); + window_out.set(Window::DimY, Window::Dimension(0, 0, 0)); + + Iterator output(out, window_out); + + // Run the SIMD path if and only if the input is not a row-vector + if(in->info()->dimension(1) != 1) + { + Iterator input(in, window_in); + execute_window_loop(window_in, [&](const Coordinates & id) + { + // Compute 4x4 elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint16x4_t row0 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 0 * input_stride_in_bytes) + x); + const uint16x4_t row1 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 1 * input_stride_in_bytes) + x); + const uint16x4_t row2 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 2 * input_stride_in_bytes) + x); + const uint16x4_t row3 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 3 * input_stride_in_bytes) + x); + + // Transpose 2x2 + const uint16x4x2_t k0_u16 = vtrn_u16(row0, row1); + const uint16x4x2_t k1_u16 = vtrn_u16(row2, row3); + + // Transpose 4x4 + const uint32x2x2_t k0_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k1_u16.val[0])); + const uint32x2x2_t k1_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k1_u16.val[1])); + + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + x * output_stride_in_bytes; + + vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vreinterpret_u16_u32(k0_u32.val[0])); + vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vreinterpret_u16_u32(k1_u32.val[0])); + vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vreinterpret_u16_u32(k0_u32.val[1])); + vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vreinterpret_u16_u32(k1_u32.val[1])); + } + + // Compute left-over elements (1x4) + for(; x < window_end_x; ++x) + { + const uint16_t val0 = *(reinterpret_cast<uint16_t *>(input.ptr() + 0 * input_stride_in_bytes) + x); + const uint16_t val1 = *(reinterpret_cast<uint16_t *>(input.ptr() + 1 * input_stride_in_bytes) + x); + const uint16_t val2 = *(reinterpret_cast<uint16_t *>(input.ptr() + 2 * input_stride_in_bytes) + x); + const uint16_t val3 = *(reinterpret_cast<uint16_t *>(input.ptr() + 3 * input_stride_in_bytes) + x); + + uint16x4_t result = vdup_n_u16(0); + result = vset_lane_u16(val0, result, 0); + result = vset_lane_u16(val1, result, 1); + result = vset_lane_u16(val2, result, 2); + result = vset_lane_u16(val3, result, 3); + + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + x * output_stride_in_bytes; + + vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes), result); + } + }, + input, output); + } + + if(left_over_loop_y) + { + window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1)); + window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1)); + + Iterator input(in, window_in); + Iterator output(out, window_out); + + // Compute left-over elements along the y dimension (1x1) + execute_window_loop(window_in, [&](const Coordinates & id) + { + const uint16_t val0 = *(reinterpret_cast<uint16_t *>(input.ptr())); + + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + id.x() * output_stride_in_bytes; + + *(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes)) = val0; + }, + input, output); + } +} + +void transpose_32bit_elements(const ITensor *in, ITensor *out, const Window &window) +{ + const int window_step_x = 4; + const int window_step_y = 4; + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_start_y = window.y().start(); + const int window_end_y = std::min(window.y().end(), static_cast<int>(in->info()->dimension(1))); + const int window_end_y_multiple_of = ((window_end_y - window_start_y) / window_step_y) * window_step_y; + const size_t input_stride_in_bytes = in->info()->strides_in_bytes()[1]; + const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1]; + + // Check if we need a left-over loop for the y dimension + bool left_over_loop_y = (((window_end_y - window_start_y) % window_step_y) != 0); + + Window window_in(window); + window_in.set(Window::DimX, Window::Dimension(0, 1, 1)); + if(left_over_loop_y) + { + // Check if window_end_y_multiple_of is greater than window_start_y + if(window_end_y_multiple_of > window_start_y) + { + window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y)); + } + else + { + window_in.set(Window::DimY, Window::Dimension(0, 0, 1)); + } + } + + Window window_out(window); + window_out.set(Window::DimX, Window::Dimension(0, 0, 0)); + window_out.set(Window::DimY, Window::Dimension(0, 0, 0)); + + Iterator output(out, window_out); + + // Run the SIMD path if and only if the input is not a row-vector + if(in->info()->dimension(1) != 1) + { + Iterator input(in, window_in); + execute_window_loop(window_in, [&](const Coordinates & id) + { + // Compute 4x4 elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint32x4_t row0 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x); + const uint32x4_t row1 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x); + const uint32x4_t row2 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x); + const uint32x4_t row3 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x); + + // Transpose 2x2 + const uint32x2x2_t k0_u32 = vtrn_u32(vget_low_u32(row0), vget_low_u32(row1)); + const uint32x2x2_t k1_u32 = vtrn_u32(vget_high_u32(row2), vget_high_u32(row3)); + const uint32x2x2_t k2_u32 = vtrn_u32(vget_high_u32(row0), vget_high_u32(row1)); + const uint32x2x2_t k3_u32 = vtrn_u32(vget_low_u32(row2), vget_low_u32(row3)); + + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes; + + // Swap block 01 with block 10 and store + vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vcombine_u32(k0_u32.val[0], k3_u32.val[0])); + vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vcombine_u32(k0_u32.val[1], k3_u32.val[1])); + vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vcombine_u32(k2_u32.val[0], k1_u32.val[0])); + vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vcombine_u32(k2_u32.val[1], k1_u32.val[1])); + } + + // Compute left-over elements (1x4) + for(; x < window_end_x; ++x) + { + const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x); + const uint32_t val1 = *(reinterpret_cast<uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x); + const uint32_t val2 = *(reinterpret_cast<uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x); + const uint32_t val3 = *(reinterpret_cast<uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x); + + uint32x4_t result = vdupq_n_u32(0); + result = vsetq_lane_u32(val0, result, 0); + result = vsetq_lane_u32(val1, result, 1); + result = vsetq_lane_u32(val2, result, 2); + result = vsetq_lane_u32(val3, result, 3); + + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes; + + vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes), result); + } + }, + input, output); + } + + if(left_over_loop_y) + { + window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1)); + window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1)); + + Iterator input(in, window_in); + Iterator output(out, window_out); + + // Compute left-over elements along the y dimension (1x1) + execute_window_loop(window_in, [&](const Coordinates & id) + { + const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr())); + + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + id.x() * output_stride_in_bytes; + + *(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes)) = val0; + }, + input, output); + } +} +} // namespace + +void CpuTransposeKernel::configure(const ITensorInfo *src, ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + + // Destination auto inizialitation if not yet initialized + const TensorShape dst_shape = misc::shape_calculator::compute_transposed_shape(*src); + auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape)); + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(validate(src, dst)); + + // Note: This kernel performs 16 elements per iteration. + // However, since we use a left-over for loop on both dimensions (X and Y), we cannot have any read or write out of memory + // For this reason num_elems_processed_per_iteration_x is set to 1 + const unsigned int num_elems_processed_per_iteration_x = 1; + const unsigned int num_elems_processed_per_iteration_y = num_elems_processed(src->element_size()); + + // Configure kernel window + Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + + // The CpuTranspose doesn't need padding so update_window_and_padding() can be skipped + Coordinates coord; + coord.set_num_dimensions(dst->num_dimensions()); + dst->set_valid_region(ValidRegion(coord, dst->tensor_shape())); + + ICpuKernel::configure(win); +} + +Status CpuTransposeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); + //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions. + ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); + + // Error if input is not 8 bit, 16bit or 32bit + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->element_size() != 1 && src->element_size() != 2 && src->element_size() != 4, + "Element size not supported"); + + // Validate configured destination + if(dst->total_size() != 0) + { + const TensorShape dst_shape = misc::shape_calculator::compute_transposed_shape(*src); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + } + + return Status{}; +} + +void CpuTransposeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + switch(src->info()->element_size()) + { + case 1: + transpose_8bit_elements(src, dst, window); + break; + case 2: + transpose_16bit_elements(src, dst, window); + break; + case 4: + transpose_32bit_elements(src, dst, window); + break; + default: + ARM_COMPUTE_ERROR("Element size not supported"); + break; + } +} + +const char *CpuTransposeKernel::name() const +{ + return "CpuTransposeKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuTransposeKernel.h b/src/cpu/kernels/CpuTransposeKernel.h new file mode 100644 index 0000000000..6805eac642 --- /dev/null +++ b/src/cpu/kernels/CpuTransposeKernel.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_TRANSPOSE_KERNEL_H +#define ARM_COMPUTE_CPU_TRANSPOSE_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Kernel which transposes the elements of a matrix */ +class CpuTransposeKernel : public ICpuKernel +{ +public: + CpuTransposeKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuTransposeKernel); + /** Configure kernel for a given list of arguments + * + * @param[in] src Srouce tensor to permute. Data types supported: All + * @param[out] dst Destination tensor. Data types supported: Same as @p src + */ + void configure(const ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuTransposeKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_TRANSPOSE_KERNEL_H */ diff --git a/src/cpu/kernels/CpuWeightsReshapeKernel.cpp b/src/cpu/kernels/CpuWeightsReshapeKernel.cpp new file mode 100644 index 0000000000..2ccc977995 --- /dev/null +++ b/src/cpu/kernels/CpuWeightsReshapeKernel.cpp @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuWeightsReshapeKernel.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +TensorShape get_output_shape(const ITensorInfo *src, bool has_bias) +{ + TensorShape output_shape{ src->tensor_shape() }; + + output_shape.collapse(3); + const size_t tmp_dim = output_shape[0]; + output_shape.set(0, output_shape[1]); + output_shape.set(1, tmp_dim + (has_bias ? 1 : 0)); + + return output_shape; +} + +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *biases, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions. + ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); + + if(biases != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(src->data_type())); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases); + ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 4) && (biases->num_dimensions() != 1)); + ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 5) && (biases->num_dimensions() != 2)); + ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 4) && (biases->dimension(0) != src->tensor_shape()[3])); + ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 5) && (biases->dimension(0) != src->tensor_shape()[3] || biases->dimension(1) != src->tensor_shape()[4])); + } + + // Checks performed when output is configured + if(dst->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), get_output_shape(src, biases != nullptr)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); + } + + return Status{}; +} +} // namespace + +void CpuWeightsReshapeKernel::configure(const ITensorInfo *src, const ITensorInfo *biases, ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + + // Output tensor auto inizialitation if not yet initialized + auto_init_if_empty(*dst, src->clone()->set_tensor_shape(get_output_shape(src, (biases != nullptr)))); + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, + biases, + dst)); + + // Configure kernel + Window window = calculate_max_window(*src, Steps()); + window.set(Window::DimX, Window::Dimension(0, src->dimension(0), src->dimension(0))); + window.set(Window::DimY, Window::Dimension(0, src->dimension(1), src->dimension(1))); + window.set(Window::DimZ, Window::Dimension(0, src->dimension(2), src->dimension(2))); + ICpuKernel::configure(window); +} + +Status CpuWeightsReshapeKernel::validate(const ITensorInfo *src, const ITensorInfo *biases, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, biases, dst)); + return Status{}; +} + +void CpuWeightsReshapeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + auto src = tensors.get_const_tensor(TensorType::ACL_SRC); + auto biases = tensors.get_const_tensor(TensorType::ACL_BIAS); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + const unsigned int kernel_size_x = src->info()->dimension(0); + const unsigned int kernel_size_y = src->info()->dimension(1); + const unsigned int kernel_depth = src->info()->dimension(2); + const unsigned int input_stride_x = src->info()->strides_in_bytes().x(); + const unsigned int input_stride_y = src->info()->strides_in_bytes().y(); + const unsigned int input_stride_z = src->info()->strides_in_bytes().z(); + const unsigned int output_stride_y = dst->info()->strides_in_bytes().y(); + + // Create iterators + Iterator in(src, window); + execute_window_loop(window, [&](const Coordinates & id) + { + // Get column index + const int kernel_idx = id[3]; + const int kernel_idz = id[4]; + + // Setup pointers + const uint8_t *tmp_input_ptr = in.ptr(); + uint8_t *tmp_output_ptr = dst->ptr_to_element(Coordinates(kernel_idx, 0, kernel_idz)); + const uint8_t *curr_input_row_ptr = tmp_input_ptr; + const uint8_t *curr_input_depth_ptr = tmp_input_ptr; + + // Linearize volume + for(unsigned int d = 0; d < kernel_depth; ++d) + { + for(unsigned int j = 0; j < kernel_size_y; ++j) + { + for(unsigned int i = 0; i < kernel_size_x; ++i) + { + std::memcpy(tmp_output_ptr, tmp_input_ptr, src->info()->element_size()); + tmp_input_ptr += input_stride_x; + tmp_output_ptr += output_stride_y; + } + curr_input_row_ptr += input_stride_y; + tmp_input_ptr = curr_input_row_ptr; + } + curr_input_depth_ptr += input_stride_z; + curr_input_row_ptr = curr_input_depth_ptr; + tmp_input_ptr = curr_input_depth_ptr; + } + + // Add bias + if(biases != nullptr) + { + std::memcpy(tmp_output_ptr, biases->ptr_to_element(Coordinates(kernel_idx, kernel_idz)), src->info()->element_size()); + } + }, + in); +} +const char *CpuWeightsReshapeKernel::name() const +{ + return "CpuWeightsReshapeKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute
\ No newline at end of file diff --git a/src/cpu/kernels/CpuWeightsReshapeKernel.h b/src/cpu/kernels/CpuWeightsReshapeKernel.h new file mode 100644 index 0000000000..c80bf3b25e --- /dev/null +++ b/src/cpu/kernels/CpuWeightsReshapeKernel.h @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_WEIGHTSRESHAPE_KERNEL_H +#define ARM_COMPUTE_CPU_WEIGHTSRESHAPE_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** Kernel to perform reshaping on the weights used by convolution and locally connected layer + * + * Rearranges each 3-dimensional kernel to a single row leading to a matrix with linearized kernels. + * In combination with the @ref cpu::kernels::CpuIm2ColKernel can transform a convolution to a matrix multiplication. + * + * For example assuming a 3D weight kernel of 3x3 dimensions and depth of 2 we have: + * @f[ + * \left( \begin{array}{ccc} + * a000 & a001 & a002 \\ + * a010 & a011 & a012 \\ + * a020 & a021 & a022 \\ + * \end{array} \right) + * \left( \begin{array}{ccc} + * a100 & a101 & a102 \\ + * a110 & a111 & a112 \\ + * a120 & a121 & a122 \\ + * \end{array} \right) + * \rightarrow + * \left( \begin{array}{ccccccccc} + * a000 & a001 & a002 & a010 & a011 & a012 & a020 & a021 & a022 & a100 & a101 & a102 & a110 & a111 & a112 & a120 & a121 & a122 \\ + * \end{array} \right) + * @f] + */ +class CpuWeightsReshapeKernel : public ICpuKernel +{ +public: + /** Default constructor */ + CpuWeightsReshapeKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuWeightsReshapeKernel); + /** Set the input and output of the kernel. + * + * @param[in] src The input tensor info to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared, + * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. + * Data types supported: All + * @param[in] biases The shared biases tensor info to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with + * dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input + * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types. + * @param[out] dst The output tensor info. Data types supported: Same as @p src + */ + void configure(const ITensorInfo *src, const ITensorInfo *biases, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuWeightsReshapeKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *biases, const ITensorInfo *dst); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_WEIGHTSRESHAPE_KERNEL_H */ diff --git a/src/cpu/kernels/CpuWinogradConv2dKernel.cpp b/src/cpu/kernels/CpuWinogradConv2dKernel.cpp new file mode 100644 index 0000000000..803af09a67 --- /dev/null +++ b/src/cpu/kernels/CpuWinogradConv2dKernel.cpp @@ -0,0 +1,551 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/CpuWinogradConv2dKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/NEON/kernels/convolution/common/utils.hpp" +#include "src/core/NEON/kernels/convolution/winograd/winograd_layer.hpp" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include <memory> + +namespace arm_compute +{ +namespace cpu +{ +//Batched Gemms + +namespace +{ +inline bool is_kernel_size_supported(DataType data_type, Size2D size) +{ + const std::array<Size2D, 8> f32_support = { { Size2D(1, 3), Size2D(3, 1), Size2D(5, 5), Size2D(3, 3), Size2D(1, 5), Size2D(5, 1), Size2D(7, 1), Size2D(1, 7) } }; + const std::array<Size2D, 8> f16_support = { { Size2D(3, 3) } }; + + switch(data_type) + { + case DataType::F16: + return std::end(f16_support) != std::find(std::begin(f16_support), std::end(f16_support), size); + case DataType::F32: + return std::end(f32_support) != std::find(std::begin(f32_support), std::end(f32_support), size); + default: + return false; + } +} + +Status validate_arguments_winograd_weight_trans(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + + const size_t idx_width = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); + const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); + const auto input_width = input->dimension(idx_width); + const auto input_height = input->dimension(idx_height); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!is_kernel_size_supported(input->data_type(), Size2D(input_width, input_height)), + "Only 1x3, 3x1, 1x5, 5x1, 7x1, 1x7, 3x3 and 5x5 kernels are supported"); + ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); + const Size2D &output_tile = winograd_info.output_tile_size; + const std::array<Size2D, 8> supported_tile_sizes = { { Size2D(2U, 2U), Size2D(4U, 4U), Size2D(1U, 6U), Size2D(6U, 1U), Size2D(4, 1), Size2D(1, 4), Size2D(2, 1), Size2D(1, 2) } }; + ARM_COMPUTE_RETURN_ERROR_ON(std::end(supported_tile_sizes) == std::find(std::begin(supported_tile_sizes), std::end(supported_tile_sizes), output_tile)); + + // Checks performed when output is configured + if(output->total_size() != 0) + { + const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_winograd_filter_transform_shape(*input, winograd_info)); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + } + + return Status{}; +} + +std::pair<Status, Window> validate_and_configure_window_winograd_weight_trans(ITensorInfo *input, ITensorInfo *output, const WinogradInfo &winograd_info) +{ + // Output tensor auto inizialitation if not yet initialized + auto_init_if_empty(*output, input->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_winograd_filter_transform_shape(*input, winograd_info))); + const Window win = calculate_max_window(*input, Steps(), true /* skip border*/); + return std::make_pair(Status{}, win); +} + +Status validate_arguments_winograd_input_trans(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info) +{ + const Size2D &kernel_dims = winograd_info.kernel_size; + const PadStrideInfo &conv_info = winograd_info.convolution_info; + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd input transform only supports unit strides"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!is_kernel_size_supported(input->data_type(), Size2D(kernel_dims.width, kernel_dims.height)), + "Only 1x3, 3x1, 3x3 and 5x5 kernels are supported"); + + // Validate configured output + if(output->total_size() != 0) + { + const TensorShape output_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + } + + return Status{}; +} + +std::pair<Status, Window> validate_and_configure_window_winograd_input_trans(ITensorInfo *input, ITensorInfo *output, const WinogradInfo &winograd_info) +{ + const TensorShape output_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info); + // Output auto inizialitation if not yet initialized + auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape)); + return std::make_pair(Status{}, calculate_max_window(*input, Steps(), true)); +} + +Status validate_arguments_winograd_output_trans(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const WinogradInfo &winograd_info) +{ + const PadStrideInfo &conv_info = winograd_info.convolution_info; + const Size2D kernel_dims = winograd_info.kernel_size; + + // Number of tiles along the X and Y direction + const unsigned int num_tiles_x = std::ceil((winograd_info.input_dimensions.x() - (kernel_dims.width - 1) + conv_info.pad_left() + conv_info.pad_right()) / static_cast<float> + (winograd_info.output_tile_size.width)); + const unsigned int num_tiles_y = std::ceil((winograd_info.input_dimensions.y() - (kernel_dims.height - 1) + conv_info.pad_top() + conv_info.pad_bottom()) / static_cast<float> + (winograd_info.output_tile_size.height)); + const Size2D num_tiles = Size2D(num_tiles_x, num_tiles_y); + + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != num_tiles.area()); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!is_kernel_size_supported(input->data_type(), Size2D(kernel_dims.width, kernel_dims.height)), + "Only 1x3, 3x1, 3x3 and 5x5 kernels are supported"); + + const std::array<unsigned int, 3> supported_gemm_sizes = { { 8U, 16U, 36U } }; + ARM_COMPUTE_RETURN_ERROR_ON(std::end(supported_gemm_sizes) == std::find(std::begin(supported_gemm_sizes), std::end(supported_gemm_sizes), input->dimension(2))); + ARM_COMPUTE_UNUSED(kernel_dims); + if(bias != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() != size_t(1)); + } + + // Checks performed when output is configured + if(output->total_size() != 0) + { + const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_winograd_output_transform_shape(*input, winograd_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + } + return Status{}; +} + +std::pair<Status, Window> validate_and_configure_window_winograd_output_trans(ITensorInfo *input, ITensorInfo *output, const WinogradInfo &winograd_info) +{ + // Output tensor auto initialization if not yet initialized + auto_init_if_empty(*output, input->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_winograd_output_transform_shape(*input, winograd_info))); + + return std::make_pair(Status{}, calculate_max_window(*input, Steps(), true)); +} +} // namespace + +Status ICpuWinogradConv2dTransformWeightsKernel::validate(const ITensorInfo *input, const ITensorInfo *weights) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); + const DataLayout data_layout = input->data_layout(); + const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!is_kernel_size_supported(input->data_type(), Size2D(weights->dimension(width_idx), weights->dimension(height_idx))), + "Only 1x3, 3x1, 3x3 and 5x5 kernels are supported"); + ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); + return Status{}; +} + +template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols> +unsigned int CpuWinogradConv2dTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_weight_storage_size(int num_output_channels, int num_input_channels) const +{ + const KernelShape shape(num_output_channels, KernelRows, KernelCols, num_input_channels); + // WinogradConv returns the size in bytes, we divide by `sizeof(T)` to express that in units of T + return static_cast<unsigned int>(WinogradConv::get_kernel_storage_size(num_input_channels, num_output_channels) / sizeof(T)); +} + +template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols> +CpuWinogradConv2dTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::CpuWinogradConv2dTransformWeightsKernel() + : _transform(nullptr), _num_output_channels(0), _matrix_stride(0) +{ +} + +template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols> +int CpuWinogradConv2dTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_matrix_stride(int num_output_channels, int num_input_channels) const +{ + return WinogradConv::get_kernel_matrix_stride(num_input_channels, num_output_channels); +} + +#ifndef DOXYGEN_SKIP_THIS +template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols> +void CpuWinogradConv2dTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::configure( + const ITensorInfo *weights_hwio, + ITensorInfo *output, + const int matrix_stride, /** Stride across matrices in the output. */ + const int num_output_channels, /** Number of filters. */ + const int num_input_channels) /** Number of channels in each filter. */ +{ + ARM_COMPUTE_UNUSED(weights_hwio, output); + + _transform = std::make_unique<WeightsTransform>(num_output_channels, num_input_channels); + _num_output_channels = num_output_channels; + _matrix_stride = matrix_stride; + + Window win; + auto win_last = _transform->get_window(); + win.set(Window::DimX, Window::Dimension(0, win_last, 1)); + ICpuKernel::configure(win); +} +#endif /* DOXYGEN_SKIP_THIS */ + +template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols> +void CpuWinogradConv2dTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON(tensors.empty()); + + const size_t fst = window.x().start(); + const size_t lst = window.x().end(); + + const ITensor *weights_hwio = tensors.get_const_tensor(TensorType::ACL_SRC); + ITensor *output = tensors.get_tensor(TensorType::ACL_DST); + + _transform->set_weight_tensor(weights_hwio->buffer()); + const int matrix_row_stride = roundup(_num_output_channels, WinogradConv::N_BLOCK); + _transform->set_output_matrices(output->buffer(), _matrix_stride, matrix_row_stride); + _transform->set_working_space(output->buffer()); + + _transform->run(fst, lst); +} + +template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols> +bool CpuWinogradConv2dTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::is_parallelisable() const +{ + return false; +} + +template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols> +Status CpuWinogradConv2dTransformWeightsKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::validate(const ITensorInfo *input, const ITensorInfo *output, + const WinogradInfo &winograd_info) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_winograd_weight_trans(input, output, winograd_info)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_winograd_weight_trans(input->clone().get(), output->clone().get(), winograd_info).first); + return Status{}; +} + +template class CpuWinogradConv2dTransformWeightsKernel<float, 2, 2, 3, 3>; +template class CpuWinogradConv2dTransformWeightsKernel<float, 4, 4, 3, 3>; +template class CpuWinogradConv2dTransformWeightsKernel<float, 2, 2, 5, 5>; +template class CpuWinogradConv2dTransformWeightsKernel<float, 1, 6, 1, 3>; +template class CpuWinogradConv2dTransformWeightsKernel<float, 6, 1, 3, 1>; + +template class CpuWinogradConv2dTransformWeightsKernel<float, 1, 4, 1, 5>; +template class CpuWinogradConv2dTransformWeightsKernel<float, 4, 1, 5, 1>; +template class CpuWinogradConv2dTransformWeightsKernel<float, 1, 2, 1, 7>; +template class CpuWinogradConv2dTransformWeightsKernel<float, 2, 1, 7, 1>; + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +template class CpuWinogradConv2dTransformWeightsKernel<__fp16, 4, 4, 3, 3>; +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +// Input transform + +template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols> +unsigned int CpuWinogradConv2dTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_input_storage_size( + int num_batches, /* Number of batches in the input tensor. */ + int num_channels, /* Number of feature maps in the input tensor. */ + int num_rows, /* Number of rows in each feature map. */ + int num_cols, /* Number of columns in each feature map. */ + bool same_padding /* Use "SAME" padding, otherwise use "VALID". */ +) const +{ + // Construct shapes for the input and kernel tensors. + const Tensor4DShape input_shape(num_batches, num_rows, num_cols, num_channels); + const KernelShape kern_shape(1, KernelRows, KernelCols, num_channels); + // Return the size, converted into units of TIn + return static_cast<unsigned int>(WinogradConv::get_input_storage_size(num_batches, num_rows, num_cols, num_channels, same_padding) / sizeof(T)); +} + +template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols> +unsigned int CpuWinogradConv2dTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_working_space_size(unsigned int num_threads) const +{ + return _transform->get_working_space_size(num_threads); +} + +template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols> +int CpuWinogradConv2dTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_matrix_stride( + int num_batches, /* Number of batches in the input tensor. */ + int num_channels, /* Number of feature maps in the input tensor. */ + int num_rows, /* Number of rows in each feature map. */ + int num_cols, /* Number of columns in each feature map. */ + bool same_padding /* Use "SAME" padding, otherwise use "VALID". */) const +{ + return WinogradConv::get_input_matrix_stride(num_batches, num_rows, num_cols, num_channels, same_padding); +} + +template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols> +CpuWinogradConv2dTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::CpuWinogradConv2dTransformInputKernel() + : _transform(nullptr), _num_channels(0), _matrix_stride(0) +{ +} + +template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols> +void CpuWinogradConv2dTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::configure( + const ITensorInfo *input_nhwc, + const int num_batches, /* Number of batches in input tensor. */ + const int num_rows, /* Number of rows in input tensor. */ + const int num_cols, /* Number of columns in input tensor. */ + const int num_channels, /* Number of channels in input tensor. */ + const PaddingType padding, /* Padding type. */ + ITensorInfo *output, /* Base of output matrices. */ + const int matrix_stride, /* Stride between output matrices. */ + ITensorInfo *workspace) +{ + ARM_COMPUTE_UNUSED(input_nhwc, output, matrix_stride, workspace); + + _num_channels = num_channels; + _matrix_stride = matrix_stride; + + const int padding_top = (padding == PADDING_SAME) ? (KernelRows - 1) / 2 : 0; + const int padding_left = (padding == PADDING_SAME) ? (KernelCols - 1) / 2 : 0; + const int padding_bottom = (padding == PADDING_SAME) ? iceildiv(KernelRows - 1, 2) : 0; + const int padding_right = (padding == PADDING_SAME) ? iceildiv(KernelCols - 1, 2) : 0; + + _transform = std::make_unique<InputTransform>( + KernelRows, + KernelCols, + num_batches, + num_rows, + num_cols, + num_channels, + padding_top, /**< Padding to apply to the top of the image. */ + padding_left, /**< Padding to apply to the left of the image. */ + padding_bottom, /**< Padding to apply to the bottom of the image. */ + padding_right /**< Padding to apply to the right of the image. */ + ); + + Window win; + auto win_last = _transform->get_window(); + win.set(Window::DimX, Window::Dimension(0, win_last, 1)); + ICpuKernel::configure(win); +} + +template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols> +void CpuWinogradConv2dTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON(tensors.empty()); + + const ITensor *input_nhwc = tensors.get_const_tensor(TensorType::ACL_SRC); + const ITensor *workspace = tensors.get_const_tensor(TensorType::ACL_INT); + ITensor *output = tensors.get_tensor(TensorType::ACL_DST); + + const int element_size_in_bytes = input_nhwc->info()->element_size(); + const int input_col_stride = input_nhwc->info()->strides_in_bytes().y() / element_size_in_bytes; + const int input_row_stride = input_nhwc->info()->strides_in_bytes().z() / element_size_in_bytes; + const int input_batch_stride = input_nhwc->info()->strides_in_bytes()[3] / element_size_in_bytes; + const auto input_nhwc_ptr = reinterpret_cast<const T *>(input_nhwc->buffer() + input_nhwc->info()->offset_first_element_in_bytes()); + auto output_ptr = reinterpret_cast<T *>(output->buffer() + output->info()->offset_first_element_in_bytes()); + ARM_COMPUTE_ERROR_ON_NULLPTR(output_ptr); + + _transform->set_input_tensor(input_nhwc_ptr, input_batch_stride, input_row_stride, input_col_stride); + _transform->set_output_matrices(output_ptr, _matrix_stride, _num_channels); + + _transform->set_working_space(workspace->buffer()); + + // The code below cannot be moved to configure because biases hasn't been allocated at that point + const size_t fst = window.x().start(); + const size_t lst = window.x().end(); + _transform->run(fst, lst, info.thread_id); +} + +template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols> +Status CpuWinogradConv2dTransformInputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::validate(const ITensorInfo *input, const ITensorInfo *output, + const WinogradInfo &winograd_info) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_winograd_input_trans(input, output, winograd_info)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_winograd_input_trans(input->clone().get(), output->clone().get(), winograd_info).first); + + return Status{}; +} + +template class CpuWinogradConv2dTransformInputKernel<float, 2, 2, 3, 3>; +template class CpuWinogradConv2dTransformInputKernel<float, 4, 4, 3, 3>; +template class CpuWinogradConv2dTransformInputKernel<float, 2, 2, 5, 5>; +template class CpuWinogradConv2dTransformInputKernel<float, 1, 6, 1, 3>; +template class CpuWinogradConv2dTransformInputKernel<float, 6, 1, 3, 1>; + +template class CpuWinogradConv2dTransformInputKernel<float, 1, 4, 1, 5>; +template class CpuWinogradConv2dTransformInputKernel<float, 4, 1, 5, 1>; +template class CpuWinogradConv2dTransformInputKernel<float, 1, 2, 1, 7>; +template class CpuWinogradConv2dTransformInputKernel<float, 2, 1, 7, 1>; + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +template class CpuWinogradConv2dTransformInputKernel<__fp16, 4, 4, 3, 3>; +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +// Output transform + +template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols> +unsigned int CpuWinogradConv2dTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_output_storage_size( + int num_batches, /* Number of batches in the output tensor. */ + int num_rows, /* Number of rows in each feature map of the input tensor. */ + int num_cols, /* Number of columns in each feature map of the input tensor. */ + int num_output_channels /* Number of feature maps in the output tensor. */ +) const +{ + // Construct shapes for the input and kernel tensors. + const Tensor4DShape input_shape(num_batches, num_rows, num_cols, 1); + const KernelShape kern_shape(num_output_channels, KernelRows, KernelCols, 1); + // Return the size, converted into units of TOut + return static_cast<unsigned int>( + WinogradConv::get_output_storage_size(num_batches, num_rows, num_cols, num_output_channels) / sizeof(T)); +} + +template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols> +CpuWinogradConv2dTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::CpuWinogradConv2dTransformOutputKernel() + : _transform(nullptr), _matrix_stride(0), _matrix_row_stride(0) +{ +} + +template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols> +unsigned int CpuWinogradConv2dTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_working_space_size(unsigned int num_threads) const +{ + return _transform->get_working_space_size(num_threads); +} + +template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols> +int CpuWinogradConv2dTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_matrix_stride( + int num_batches, /* Number of batches in the output tensor. */ + int num_rows, /* Number of rows in each feature map of the input tensor. */ + int num_cols, /* Number of columns in each feature map of the input tensor. */ + int num_output_channels /* Number of feature maps in the output tensor. */ +) const +{ + return WinogradConv::get_output_matrix_stride(num_batches, num_rows, num_cols, num_output_channels); +} + +template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols> +std::pair<unsigned int, unsigned int> CpuWinogradConv2dTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::get_output_shape( + int num_rows, /* Number of rows in each feature map of the input tensor. */ + int num_cols, /* Number of columns in each feature map of the input tensor. */ + bool padding_same) const +{ + return WinogradConv::get_output_shape(std::make_pair<unsigned int, unsigned int>(num_rows, num_cols), padding_same); +} + +template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols> +void CpuWinogradConv2dTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::configure( + const ITensorInfo *biases, + const ITensorInfo *transformed_output, + const int matrix_stride, + ITensorInfo *output_nhwc, + const int num_batches, + const int num_rows, + const int num_cols, + const int num_channels, + ITensorInfo *workspace, + const arm_gemm::Activation &activation) +{ + ARM_COMPUTE_UNUSED(biases, transformed_output, output_nhwc, num_batches, num_rows, num_cols, workspace, activation); + + _matrix_stride = matrix_stride; + _matrix_row_stride = roundup(num_channels, WinogradConv::N_BLOCK); + + // We don't have the biases buffer at this stage as it hasn't been allocated, we pass in nullptr OutputTransform is only used here to compute the window + _transform = std::make_unique<OutputTransform>(num_batches, num_rows, num_cols, num_channels, activation); + Window win; + auto win_last = _transform->get_window(); + win.set(Window::DimX, Window::Dimension(0, win_last, 1)); + + ICpuKernel::configure(win); +} + +template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols> +void CpuWinogradConv2dTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON(tensors.empty()); + + const ITensor *biases = tensors.get_const_tensor(TensorType::ACL_SRC_0); + const ITensor *transformed_output = tensors.get_const_tensor(TensorType::ACL_SRC_1); + ITensor *workspace = tensors.get_tensor(TensorType::ACL_INT); + ITensor *dst_nhwc = tensors.get_tensor(TensorType::ACL_DST); + + const int out_batch_stride = dst_nhwc->info()->strides_in_bytes()[3] / sizeof(T); + const int out_row_stride = dst_nhwc->info()->strides_in_bytes()[2] / sizeof(T); + const int out_col_stride = dst_nhwc->info()->strides_in_bytes()[1] / sizeof(T); + + _transform->set_input_matrices(transformed_output->buffer(), _matrix_stride, _matrix_row_stride); + _transform->set_bias((biases ? reinterpret_cast<T *>(biases->buffer() + biases->info()->offset_first_element_in_bytes()) : nullptr)); + _transform->set_output_tensor(dst_nhwc->buffer() + dst_nhwc->info()->offset_first_element_in_bytes(), out_batch_stride, out_row_stride, out_col_stride); + _transform->set_working_space(workspace->buffer()); + + // The code below cannot be moved to configure because biases hasn't been allocated at that point + const size_t fst = window.x().start(); + const size_t lst = window.x().end(); + _transform->run(fst, lst, info.thread_id); +} + +template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols> +Status CpuWinogradConv2dTransformOutputKernel<T, OutputTileRows, OutputTileCols, KernelRows, KernelCols>::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, + const WinogradInfo &winograd_info) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_winograd_output_trans(input, (bias != nullptr ? bias->clone().get() : nullptr), output, winograd_info)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_winograd_output_trans(input->clone().get(), output->clone().get(), winograd_info).first); + + return Status{}; +} + +template class CpuWinogradConv2dTransformOutputKernel<float, 2, 2, 3, 3>; +template class CpuWinogradConv2dTransformOutputKernel<float, 4, 4, 3, 3>; +template class CpuWinogradConv2dTransformOutputKernel<float, 2, 2, 5, 5>; +template class CpuWinogradConv2dTransformOutputKernel<float, 1, 6, 1, 3>; +template class CpuWinogradConv2dTransformOutputKernel<float, 6, 1, 3, 1>; + +template class CpuWinogradConv2dTransformOutputKernel<float, 1, 4, 1, 5>; +template class CpuWinogradConv2dTransformOutputKernel<float, 4, 1, 5, 1>; +template class CpuWinogradConv2dTransformOutputKernel<float, 1, 2, 1, 7>; +template class CpuWinogradConv2dTransformOutputKernel<float, 2, 1, 7, 1>; + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +template class CpuWinogradConv2dTransformOutputKernel<__fp16, 4, 4, 3, 3>; +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/CpuWinogradConv2dKernel.h b/src/cpu/kernels/CpuWinogradConv2dKernel.h new file mode 100644 index 0000000000..db2d8acfdb --- /dev/null +++ b/src/cpu/kernels/CpuWinogradConv2dKernel.h @@ -0,0 +1,575 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPUWINOGRADCONV2DKERNEL_H +#define ARM_COMPUTE_CPUWINOGRADCONV2DKERNEL_H + +#include "src/core/NEON/kernels/convolution/common/convolution.hpp" +#include "src/core/NEON/kernels/convolution/common/tensor.hpp" +#include "src/cpu/ICpuKernel.h" + +#include "src/core/NEON/kernels/convolution/winograd/winograd_layer.hpp" + +namespace arm_compute +{ +namespace cpu +{ +/** Interface for the kernel to perform Winograd input transform. */ +class ICpuWinogradConv2dTransformInputKernel : public ICpuKernel +{ +public: + /** Get the working space required to perform the transformation. + * + * Note, the working space is only required when performing the + * transformation - hence it can be reused whenever the transformation is + * not running. + * + * @param num_threads The greatest number of threads that will be used to execute the transform. + * @return Size of working space required in bytes. + */ + virtual unsigned int get_working_space_size(unsigned int num_threads) const = 0; + + /** Determine how much memory (in units of TIn) to allocate for the + * transformed input. + * + * @param[in] num_batches Number of batches in the input tensor. + * @param[in] num_channels Number of feature maps in the input tensor. + * @param[in] num_rows Number of rows in each feature map. + * @param[in] num_cols Number of columns in each feature map. + * @param[in] same_padding Use "SAME" padding, otherwise use "VALID". + * + * @return Storage size (in units of TIn) required. + */ + virtual unsigned int get_input_storage_size(int num_batches, int num_channels, int num_rows, int num_cols, bool same_padding) const = 0; + + /** Gets the stride between matrices in the input worspace + * + * @param[in] num_batches Number of batches in the input tensor. + * @param[in] num_channels Number of feature maps in the input tensor. + * @param[in] num_rows Number of rows in each feature map. + * @param[in] num_cols Number of columns in each feature map. + * @param[in] same_padding Use "SAME" padding, otherwise use "VALID". + * + * @return Stride expressed in bytes. + */ + virtual int get_matrix_stride(int num_batches, int num_channels, int num_rows, int num_cols, bool same_padding) const = 0; + + /** Configure the output transform kernel. + * + * @param[in] input_nhwc Input tensor in NHWC data layout format. + * @param[in] num_batches Number of batches in input tensor. + * @param[in] num_rows Number of rows in input tensor. + * @param[in] num_cols Number of columns in input tensor. + * @param[in] num_channels Number of channels in input tensor. + * @param[in] padding Padding type. + * @param[out] output Base of output matrices. + * @param[in] matrix_stride Stride between output matrices. + * @param[in] workspace Tensor to be used as the working space during the computation. + */ + virtual void configure(const ITensorInfo *input_nhwc, const int num_batches, const int num_rows, const int num_cols, const int num_channels, + const PaddingType padding, ITensorInfo *output, const int matrix_stride, ITensorInfo *workspace) = 0; + + /** Destructor */ + virtual ~ICpuWinogradConv2dTransformInputKernel() + { + } +}; + +/** Kernel to perform Winograd input transform. */ +template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols> +class CpuWinogradConv2dTransformInputKernel : public ICpuWinogradConv2dTransformInputKernel +{ +public: + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CpuWinogradConv2dTransformInputKernel(const CpuWinogradConv2dTransformInputKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CpuWinogradConv2dTransformInputKernel &operator=(const CpuWinogradConv2dTransformInputKernel &) = delete; + /** Allow instances of this class to be moved */ + CpuWinogradConv2dTransformInputKernel(CpuWinogradConv2dTransformInputKernel &&) = default; + /** Allow instances of this class to be moved */ + CpuWinogradConv2dTransformInputKernel &operator=(CpuWinogradConv2dTransformInputKernel &&) = default; + /** Default destructor */ + ~CpuWinogradConv2dTransformInputKernel() = default; + + /** Determine how much memory (in units of TIn) to allocate for the + * transformed input. + * + * @param[in] num_batches Number of batches in the input tensor. + * @param[in] num_channels Number of feature maps in the input tensor. + * @param[in] num_rows Number of rows in each feature map. + * @param[in] num_cols Number of columns in each feature map. + * @param[in] same_padding Use "SAME" padding, otherwise use "VALID". + * + * @return Storage size (in units of TIn) required. + */ + unsigned int get_input_storage_size( + int num_batches, + int num_channels, + int num_rows, + int num_cols, + bool same_padding) const override; + + /** Get the working space required to perform the transformation. + * + * Note, the working space is only required when performing the + * transformation - hence it can be reused whenever the transformation is + * not running. + * + * @param[in] num_threads The greatest number of threads that will be used to execute the transform. + * + * @return Size of working space required in bytes. + */ + unsigned int get_working_space_size(unsigned int num_threads) const override; + + /** Gets the stride between matrices in the input worspace + * + * @param[in] num_batches Number of batches in the input tensor. + * @param[in] num_channels Number of feature maps in the input tensor. + * @param[in] num_rows Number of rows in each feature map. + * @param[in] num_cols Number of columns in each feature map. + * @param[in] same_padding Use "SAME" padding, otherwise use "VALID". + * + * @return Stride expressed in bytes. + */ + int get_matrix_stride( + int num_batches, + int num_channels, + int num_rows, + int num_cols, + bool same_padding) const override; + + /** Default constructor */ + CpuWinogradConv2dTransformInputKernel(); + + const char *name() const override + { + return "CpuWinogradConv2dTransformInputKernel"; + } + + /** Configure the output transform kernel. + * + * @param[in] input_nhwc Input tensor. Data types supported: F16/F32. Layout supported NHWC. + * @param[in] num_batches Number of batches in input tensor. + * @param[in] num_rows Number of rows in input tensor. + * @param[in] num_cols Number of columns in input tensor. + * @param[in] num_channels Number of channels in input tensor. + * @param[in] padding Padding type. + * @param[out] output Base of output matrices. + * @param[in] matrix_stride Stride between output matrices. + * @param[in] workspace Tensor to be used as the working space during the computation. + */ + void configure( + const ITensorInfo *input_nhwc, + const int num_batches, + const int num_rows, + const int num_cols, + const int num_channels, + const PaddingType padding, + ITensorInfo *output, + const int matrix_stride, + ITensorInfo *workspace) override; + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + + /** Winograd base kernel */ + using WinogradBase = winograd::WinogradGEMM<OutputTileRows, OutputTileCols, KernelRows, KernelCols, winograd::WinogradRoots::Integers>; + /** Winograd convolution kernel */ + using WinogradConv = typename WinogradBase::template Convolution<T, T>; + + /** Static function to check if given info will lead to a valid configuration of @ref CpuWinogradConv2dTransformInputKernel + * + * @param[in] input First tensor input info. Data types supported: F16/F32. + * @param[in] output Output tensor info. Data types supported: same as @p input. + * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info); + +private: + using InputTransform = typename WinogradBase::template InputTransform<T, T>; + + std::unique_ptr<InputTransform> _transform{ nullptr }; + int _num_channels; /**< Number of channels in input tensor. */ + int _matrix_stride; /**< Stride between output matrices. */ +}; + +/** Interface for the kernel to perform Winograd output transform. */ +class ICpuWinogradConv2dTransformOutputKernel : public ICpuKernel +{ +public: + /** Get the working space required to perform the transformation. + * + * Note, the working space is only required when performing the + * transformation - hence it can be reused whenever the transformation is + * not running. + * + * @param[in] num_threads The greatest number of threads that will be used to execute the transform. + * + * @return Size of working space required in bytes. + */ + virtual unsigned int get_working_space_size(unsigned int num_threads) const = 0; + + /** Determine how much memory (in units of TOut) to allocate for the + * (Winograd domain) output. + * + * @param[in] num_batches Number of batches in the output tensor. + * @param[in] num_rows Number of rows in each feature map of the input tensor. + * @param[in] num_cols Number of columns in each feature map of the input tensor. + * @param[in] num_output_channels Number of feature maps in the output tensor. + * + * @return Storage size (in units of TOut) required. + */ + virtual unsigned int get_output_storage_size(int num_batches, int num_rows, int num_cols, int num_output_channels) const = 0; + + /** Gets the stride between matrices in the output worspace + * + * @param[in] num_batches Number of batches in the output tensor. + * @param[in] num_rows Number of rows in each feature map of the input tensor. + * @param[in] num_cols Number of columns in each feature map of the input tensor. + * @param[in] num_output_channels Number of feature maps in the output tensor. + * + * @return Stride expressed in bytes. + */ + virtual int get_matrix_stride(int num_batches, int num_rows, int num_cols, int num_output_channels) const = 0; + + /** Get the output shape of a convolution. + * + * @param[in] num_rows Number of rows in each feature map of the input tensor. + * @param[in] num_cols Number of columns in each feature map of the input tensor. + * @param[in] padding_same True if padding is SAME, false otherwise + * + * @return Shape of the output tensor + */ + virtual std::pair<unsigned int, unsigned int> get_output_shape( + int num_rows, /* Number of rows in each feature map of the input tensor. */ + int num_cols, /* Number of columns in each feature map of the input tensor. */ + bool padding_same /* True if padding is SAME, false otherwise */ + ) const = 0; + + /** Configure the output transform kernel. + * + * @param[in] biases Pointer to the biases tensor. + * @param[in] transformed_output Pointer to working space for the output tensor in the Winograd domain. + * @param[in] matrix_stride Output matrix stride, can be computed with winograd::WinogradGEMM<2, 2, 3, 3>::Convolution<float, float>::get_output_matrix_stride() + * @param[out] output_nhwc Pointer to a tensor in NHWC data layout ordered output tensor, in the spatial domain. + * @param[in] num_batches Number of batches in the input tensor. + * @param[in] num_rows Number of rows in output tensor. + * @param[in] num_cols Number of columns in output tensor. + * @param[in] num_channels Number of feature maps in the output tensor. + * @param[in] workspace Tensor to be used as the working space during the computation. + * @param[in] activation Activation to be used + */ + virtual void configure( + const ITensorInfo *biases, + const ITensorInfo *transformed_output, + const int matrix_stride, + ITensorInfo *output_nhwc, + const int num_batches, + const int num_rows, + const int num_cols, + const int num_channels, + ITensorInfo *workspace, + const arm_gemm::Activation &activation) = 0; + + virtual ~ICpuWinogradConv2dTransformOutputKernel() + { + } +}; + +/** Kernel to perform Winograd output transform. */ +template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols> +class CpuWinogradConv2dTransformOutputKernel : public ICpuWinogradConv2dTransformOutputKernel +{ +public: + const char *name() const override + { + return "CpuWinogradConv2dTransformOutputKernel"; + } + /** Constructor */ + CpuWinogradConv2dTransformOutputKernel(); + + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CpuWinogradConv2dTransformOutputKernel(const CpuWinogradConv2dTransformOutputKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CpuWinogradConv2dTransformOutputKernel &operator=(const CpuWinogradConv2dTransformOutputKernel &) = delete; + /** Allow instances of this class to be moved */ + CpuWinogradConv2dTransformOutputKernel(CpuWinogradConv2dTransformOutputKernel &&) = default; + /** Allow instances of this class to be moved */ + CpuWinogradConv2dTransformOutputKernel &operator=(CpuWinogradConv2dTransformOutputKernel &&) = default; + /** Default destructor */ + ~CpuWinogradConv2dTransformOutputKernel() = default; + + // Inherited methods overridden: + /** Determine how much memory (in units of TOut) to allocate for the + * (Winograd domain) output. + * + * @param[in] num_batches Number of batches in the output tensor. + * @param[in] num_rows Number of rows in each feature map of the input tensor. + * @param[in] num_cols Number of columns in each feature map of the input tensor. + * @param[in] num_output_channels Number of feature maps in the output tensor. + * + * @return Storage size (in units of TOut) required. + */ + unsigned int get_output_storage_size(int num_batches, int num_rows, int num_cols, int num_output_channels) const override; + + /** Gets the stride between matrices in the output worspace + * + * @param[in] num_batches Number of batches in the output tensor. + * @param[in] num_rows Number of rows in each feature map of the input tensor. + * @param[in] num_cols Number of columns in each feature map of the input tensor. + * @param[in] num_output_channels Number of feature maps in the output tensor. + * + * @return Stride expressed in bytes. + */ + int get_matrix_stride(int num_batches, int num_rows, int num_cols, int num_output_channels) const override; + /** Get the output shape of a convolution. + * + * @param[in] num_rows Number of rows in each feature map of the input tensor. + * @param[in] num_cols Number of columns in each feature map of the input tensor. + * @param[in] padding_same True if padding is SAME, false otherwise + * + * @return Shape of the output tensor + */ + std::pair<unsigned int, unsigned int> get_output_shape( + int num_rows, /* Number of rows in each feature map of the input tensor. */ + int num_cols, /* Number of columns in each feature map of the input tensor. */ + bool padding_same) const override; + + /** Get the working space required to perform the transformation. + * + * Note, the working space is only required when performing the + * transformation - hence it can be reused whenever the transformation is + * not running. + * + * @param[in] num_threads The greatest number of threads that will be used to execute the transform. + * + * @return Size of working space required in bytes. + */ + unsigned int get_working_space_size(unsigned int num_threads) const override; + + /** Configure the output transform kernel. + * + * @param[in] biases Pointer to the biases tensor. + * @param[in] transformed_output Pointer to working space for the output tensor in the Winograd domain. + * @param[in] matrix_stride Output matrix stride, can be computed with winograd::WinogradGEMM<2, 2, 3, 3>::Convolution<float, float>::get_output_matrix_stride() + * @param[out] output_nhwc Pointer to a tensor with NHWC data layout, in the spatial domain. + * @param[in] num_batches Number of batches in the input tensor. + * @param[in] num_rows Number of rows in output tensor. + * @param[in] num_cols Number of columns in output tensor. + * @param[in] num_channels Number of feature maps in the output tensor. + * @param[in] workspace Tensor to be used as the working space during the computation. + * @param[in] activation Activation to be used + */ + void configure( + const ITensorInfo *biases, + const ITensorInfo *transformed_output, + const int matrix_stride, + ITensorInfo *output_nhwc, + const int num_batches, + const int num_rows, + const int num_cols, + const int num_channels, + ITensorInfo *workspace, + const arm_gemm::Activation &activation) override; + + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + + /** Static function to check if given info will lead to a valid configuration of @ref CpuWinogradConv2dTransformOutputKernel + * + * @param[in] input Source tensor info with shape [C, N, 16, batches] or [C, N, 36, batches]. Data types supported: F16/F32. + * @param[in] bias Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. It can be a nullptr. Data type supported: as @p input + * @param[in] output Destination tensor info with shape [output_convolved_dims.width, output_convolved_dims.height, C, batches]. Data type supported: same as @p input + * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const WinogradInfo &winograd_info); + +private: + using WinogradBase = winograd::WinogradGEMM<OutputTileRows, OutputTileCols, KernelRows, KernelCols, winograd::WinogradRoots::Integers>; + using WinogradConv = typename WinogradBase::template Convolution<T, T>; + using OutputTransform = typename WinogradBase::template OutputTransform<T, T>; + + std::unique_ptr<OutputTransform> _transform{ nullptr }; + int _matrix_stride; + int _matrix_row_stride; +}; + +/** Interface for the kernel to perform Winograd weights transform. */ +class ICpuWinogradConv2dTransformWeightsKernel : public ICpuKernel +{ +public: + /** Prevent instances of this class from being copied (As this class contains pointers) */ + ICpuWinogradConv2dTransformWeightsKernel(const ICpuWinogradConv2dTransformWeightsKernel &) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + ICpuWinogradConv2dTransformWeightsKernel &operator=(const ICpuWinogradConv2dTransformWeightsKernel &) = default; + /** Allow instances of this class to be moved */ + ICpuWinogradConv2dTransformWeightsKernel(ICpuWinogradConv2dTransformWeightsKernel &&) = default; + /** Allow instances of this class to be moved */ + ICpuWinogradConv2dTransformWeightsKernel &operator=(ICpuWinogradConv2dTransformWeightsKernel &&) = default; + + ICpuWinogradConv2dTransformWeightsKernel() + { + } + virtual ~ICpuWinogradConv2dTransformWeightsKernel() + { + } + /** Determine how much memory (in units of T) to allocate for the + * transformed weights. + * + * @param[in] num_output_channels Number of output feature maps. + * @param[in] num_input_channels Number of input feature maps. + * + * @return Storage size (in units of T) required. + */ + virtual unsigned int get_weight_storage_size(int num_output_channels, int num_input_channels) const = 0; + /** Gets the stride between matrices in the kernel worspace + * + * @param[in] num_output_channels Number of output feature maps. + * @param[in] num_input_channels Number of input feature maps. + * + * @return Stride expressed in bytes. + */ + virtual int get_matrix_stride(int num_output_channels, int num_input_channels) const = 0; + + /** Configure the weights transform kernel. + * + * @param[in] weights_hwio Pointer to the weights tensor info + * @param[out] output Pointer to working space for the output tensor in the Winograd domain. + * @param[in] matrix_stride Stride across matrices in the output workspace. + * @param[in] num_output_channels Number of filters. + * @param[in] num_input_channels Number of channels in each filter. + */ + + virtual void configure(const ITensorInfo *weights_hwio, ITensorInfo *output, const int matrix_stride, const int num_output_channels, const int num_input_channels) = 0; + + /** Static function to check if given info will lead to a valid configuration of @ref CpuWinogradConv2dTransformWeightsKernel + * + * @param[in] input First tensor input info. Data types supported: F16/F32. + * @param[in] weights Weights tensor info. Data types supported: same as @p input. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *weights); +}; + +/** Kernel to perform Winograd weights transform. */ +template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols> +class CpuWinogradConv2dTransformWeightsKernel final : public ICpuWinogradConv2dTransformWeightsKernel +{ +public: + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CpuWinogradConv2dTransformWeightsKernel(const CpuWinogradConv2dTransformWeightsKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CpuWinogradConv2dTransformWeightsKernel &operator=(const CpuWinogradConv2dTransformWeightsKernel &) = delete; + /** Allow instances of this class to be moved */ + CpuWinogradConv2dTransformWeightsKernel(CpuWinogradConv2dTransformWeightsKernel &&) = default; + /** Allow instances of this class to be moved */ + CpuWinogradConv2dTransformWeightsKernel &operator=(CpuWinogradConv2dTransformWeightsKernel &&) = default; + /** Default destructor */ + ~CpuWinogradConv2dTransformWeightsKernel() = default; + + /** Default constructor. */ + CpuWinogradConv2dTransformWeightsKernel(); + const char *name() const override + { + return "CpuWinogradConv2dTransformWeightsKernel"; + } + + /** Static function to check if given info will lead to a valid configuration of @ref CpuWinogradConv2dTransformWeightsKernel + * + * @param[in] input Source tensor info. The input is a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] (NCHW data layout). + * kernel_x must be 3 and equal to kernel_y. Data types supported: F16/F32. + * @param[in] output Destination tensor info. The output is a 3D tensor with dimensions [OFM, IFM, 16] or [OFM, IFM, 36]. Data type supported: same as @p input + * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info); + + // Inherited methods overridden: + +#ifndef DOXYGEN_SKIP_THIS + /** Configure the weights transform kernel. + * + * @param[in] weights_hwio Pointer to the weights tensor info + * @param[out] output Pointer to working space for the output tensor in the Winograd domain. + * @param[in] matrix_stride Stride across matrices in the output workspace. + * @param[in] num_output_channels Number of filters. + * @param[in] num_input_channels Number of channels in each filter. + */ + void configure(const ITensorInfo *weights_hwio, ITensorInfo *output, const int matrix_stride, const int num_output_channels, const int num_input_channels) override; +#endif /* DOXYGEN_SKIP_THIS */ + + /** Determine how much memory (in units of T) to allocate for the + * transformed weights. + * + * @param[in] num_output_channels Number of output feature maps. + * @param[in] num_input_channels Number of input feature maps. + * + * @return Storage size (in units of T) required. + */ + unsigned int get_weight_storage_size(int num_output_channels, int num_input_channels) const override; + + /** Gets the stride between matrices in the input worspace + * + * @param[in] num_output_channels Number of output feature maps. + * @param[in] num_input_channels Number of input feature maps. + * + * @return Stride expressed in bytes. + */ + int get_matrix_stride(int num_output_channels, int num_input_channels) const override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + bool is_parallelisable() const override; + +private: + using WinogradBase = winograd::WinogradGEMM<OutputTileRows, OutputTileCols, KernelRows, KernelCols, winograd::WinogradRoots::Integers>; + using WinogradConv = typename WinogradBase::template Convolution<T, T>; + using WeightsTransform = typename WinogradBase::template WeightsTransform<T, T>; + + std::unique_ptr<WeightsTransform> _transform{ nullptr }; + int _num_output_channels; + int _matrix_stride; +}; + +/** Kernel to perform Winograd. */ +template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols> +class CpuWinogradConv2dConfiguration +{ +public: + /** Winograd base kernel */ + using WinogradBase = winograd::WinogradGEMM<OutputTileRows, OutputTileCols, KernelRows, KernelCols, winograd::WinogradRoots::Integers>; + /** Winograd convolution kernel */ + + using WinogradConv = typename WinogradBase::template Convolution<TIn, TOut>; + + using TransformInputKernel = CpuWinogradConv2dTransformInputKernel<TIn, OutputTileRows, OutputTileCols, KernelRows, KernelCols>; + using TransformWeightsKernel = CpuWinogradConv2dTransformWeightsKernel<TIn, OutputTileRows, OutputTileCols, KernelRows, KernelCols>; + using TransformOutputKernel = CpuWinogradConv2dTransformOutputKernel<TOut, OutputTileRows, OutputTileCols, KernelRows, KernelCols>; +}; + +} // namespace cpu +} // namespace arm_compute +#endif /*ARM_COMPUTE_CPUWINOGRADCONV2DKERNEL_H*/ diff --git a/src/cpu/kernels/activation/list.h b/src/cpu/kernels/activation/list.h new file mode 100644 index 0000000000..409d025db0 --- /dev/null +++ b/src/cpu/kernels/activation/list.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_NEON_KERNELS_ACTIVATION_LIST_H +#define SRC_CORE_NEON_KERNELS_ACTIVATION_LIST_H + +namespace arm_compute +{ +namespace cpu +{ +#define DECLARE_ACTIVATION_KERNEL(func_name) \ + void func_name(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) + +DECLARE_ACTIVATION_KERNEL(qasymm8_neon_activation); +DECLARE_ACTIVATION_KERNEL(qasymm8_sve_activation); +DECLARE_ACTIVATION_KERNEL(qasymm8_signed_neon_activation); +DECLARE_ACTIVATION_KERNEL(qasymm8_signed_sve_activation); +DECLARE_ACTIVATION_KERNEL(qsymm16_neon_activation); +DECLARE_ACTIVATION_KERNEL(qsymm16_sve_activation); +DECLARE_ACTIVATION_KERNEL(fp16_neon_activation); +DECLARE_ACTIVATION_KERNEL(fp16_sve_activation); +DECLARE_ACTIVATION_KERNEL(fp32_neon_activation); +DECLARE_ACTIVATION_KERNEL(fp32_sve_activation); + +#undef DECLARE_ACTIVATION_KERNEL +} // namespace cpu +} // namespace arm_compute + +#endif /* SRC_CORE_NEON_KERNELS_ACTIVATION_LIST_H */ diff --git a/src/cpu/kernels/activation/neon/fp16.cpp b/src/cpu/kernels/activation/neon/fp16.cpp new file mode 100644 index 0000000000..6f2d5d8533 --- /dev/null +++ b/src/cpu/kernels/activation/neon/fp16.cpp @@ -0,0 +1,217 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/core/NEON/NEMath.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Validate.h" +#include "src/core/NEON/wrapper/wrapper.h" + +#include <arm_neon.h> +#include <cmath> +#include <cstddef> + +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + +namespace arm_compute +{ +namespace cpu +{ +namespace +{ +#ifndef __aarch64__ +inline float16x8_t mask_float_vector(const float16x8_t &in, const uint16x8_t &mask) +{ + auto int_in = vreinterpretq_u16_f16(in); + return vreinterpretq_f16_u16(wrapper::vand(int_in, mask)); +} +#endif /* __aarch64__ */ +} // namespace + +void fp16_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) +{ + /** SIMD vector tag type. */ + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<float16_t, wrapper::traits::BitWidth::W128>; + const ActivationLayerInfo::ActivationFunction act = act_info.activation(); + + constexpr int window_step_x = 8; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + + // In case of non-aarch64, a small delta value is added to the input + // to prevent NAN values caused by zeros in inputs to SQRT. + // In case of aarh64, we call vsqrt directly, so we don't use delta. +#ifndef __aarch64__ + const auto delta = wrapper::vdup_n(static_cast<float16_t>((1e-7), ExactTagType {})); +#endif /* __aarch64__ */ + + const auto const_1 = wrapper::vdup_n(static_cast<float16_t>(1.f), ExactTagType{}); + const auto const_0 = wrapper::vdup_n(static_cast<float16_t>(0.f), ExactTagType{}); + const auto const_6 = wrapper::vdup_n(static_cast<float16_t>(6.f), ExactTagType{}); + const auto const_3 = wrapper::vdup_n(static_cast<float16_t>(3.f), ExactTagType{}); + const auto const_inv_6 = wrapper::vdup_n(static_cast<float16_t>(0.166666667f), ExactTagType{}); + + constexpr float soft_relu_thresh = 12.f; + const auto vsoft_relu_thresh = wrapper::vdup_n(static_cast<float16_t>(soft_relu_thresh), ExactTagType{}); + + const auto va = wrapper::vdup_n(static_cast<float16_t>(act_info.a()), ExactTagType{}); + const auto vb = wrapper::vdup_n(static_cast<float16_t>(act_info.b()), ExactTagType{}); + const auto a = static_cast<float16_t>(act_info.a()); + const auto b = static_cast<float16_t>(act_info.b()); + execute_window_loop(win_collapsed, [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast<const float16_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr()); + + wrapper::traits::neon_bitvector_t<float16_t, wrapper::traits::BitWidth::W128> tmp; + + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin = wrapper::vloadq(input_ptr + x); + switch(act) + { + case ActivationLayerInfo::ActivationFunction::ABS: + tmp = wrapper::vabs(vin); + break; + case ActivationLayerInfo::ActivationFunction::LINEAR: + tmp = wrapper::vmla(vb, va, vin); + break; + case ActivationLayerInfo::ActivationFunction::LOGISTIC: + tmp = wrapper::vinv(wrapper::vadd(const_1, wrapper::vexpq(wrapper::vneg(vin)))); + break; + case ActivationLayerInfo::ActivationFunction::RELU: + tmp = wrapper::vmax(const_0, vin); + break; + case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: + tmp = wrapper::vmin(va, wrapper::vmax(const_0, vin)); + break; + case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: + tmp = wrapper::vmin(va, wrapper::vmax(vb, vin)); + break; + case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: + tmp = wrapper::vbsl(wrapper::vcgt(vin, const_0), vin, wrapper::vmul(va, vin)); + break; + case ActivationLayerInfo::ActivationFunction::SOFT_RELU: + tmp = wrapper::vbsl(wrapper::vcgt(vin, vsoft_relu_thresh), vin, wrapper::vlog(wrapper::vadd(const_1, wrapper::vexpq(vin)))); + break; + case ActivationLayerInfo::ActivationFunction::ELU: + tmp = wrapper::vbsl(wrapper::vcge(vin, const_0), vin, wrapper::vmul(va, wrapper::vsub(wrapper::vexpq(vin), const_1))); + break; + case ActivationLayerInfo::ActivationFunction::SQRT: +#ifdef __aarch64__ + tmp = wrapper::vsqrt(vin); +#else /* __aarch64__ */ + { + const auto bitmask = wrapper::vceq(vin, wrapper::vdup_n(0, ExactTagType{})); + tmp = wrapper::vinv(wrapper::vinvsqrt(wrapper::vadd(vin, mask_float_vector(delta, bitmask)))); + tmp = mask_float_vector(tmp, wrapper::vnot(bitmask)); + } +#endif /* __aarch64__ */ + break; + case ActivationLayerInfo::ActivationFunction::SQUARE: + tmp = wrapper::vmul(vin, vin); + break; + case ActivationLayerInfo::ActivationFunction::TANH: + tmp = wrapper::vmul(va, wrapper::vtanh(wrapper::vmul(vb, vin))); + break; + case ActivationLayerInfo::ActivationFunction::IDENTITY: + tmp = vin; + break; + case ActivationLayerInfo::ActivationFunction::HARD_SWISH: + tmp = wrapper::vmul(vin, wrapper::vmul(const_inv_6, wrapper::vmin(const_6, wrapper::vmax(const_0, wrapper::vadd(vin, const_3))))); + break; + default: + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + wrapper::vstore(output_ptr + x, tmp); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const float16_t in = *(reinterpret_cast<const float16_t *>(input_ptr + x)); + float16_t tmp; + switch(act) + { + case ActivationLayerInfo::ActivationFunction::ABS: + tmp = std::abs(in); + break; + case ActivationLayerInfo::ActivationFunction::LINEAR: + tmp = a * in + b; + break; + case ActivationLayerInfo::ActivationFunction::LOGISTIC: + tmp = static_cast<float16_t>(1) / (static_cast<float16_t>(1) + std::exp(-in)); + break; + case ActivationLayerInfo::ActivationFunction::RELU: + tmp = std::max<float16_t>(static_cast<float16_t>(0), in); + break; + case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: + tmp = std::min<float16_t>(a, std::max(static_cast<float16_t>(0), in)); + break; + case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: + tmp = std::min<float16_t>(a, std::max<float16_t>(b, in)); + break; + case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: + tmp = (in > 0) ? in : a * in; + break; + case ActivationLayerInfo::ActivationFunction::SOFT_RELU: + tmp = (in > soft_relu_thresh) ? in : std::log(static_cast<float16_t>(1) + std::exp(in)); + break; + case ActivationLayerInfo::ActivationFunction::ELU: + tmp = (in >= 0) ? in : a * (std::exp(in) - 1); + break; + case ActivationLayerInfo::ActivationFunction::SQRT: + tmp = std::sqrt(in); + break; + case ActivationLayerInfo::ActivationFunction::SQUARE: + tmp = in * in; + break; + case ActivationLayerInfo::ActivationFunction::TANH: + tmp = a * std::tanh(b * in); + break; + case ActivationLayerInfo::ActivationFunction::IDENTITY: + tmp = in; + break; + case ActivationLayerInfo::ActivationFunction::HARD_SWISH: + tmp = in * ((std::min(std::max((in + 3), 0.0f), 6.0f)) * 0.166666667f); + break; + default: + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + *(output_ptr + x) = tmp; + } + }, + input, output); +} +} // namespace cpu +} // namespace arm_compute + +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/activation/neon/fp32.cpp b/src/cpu/kernels/activation/neon/fp32.cpp new file mode 100644 index 0000000000..54301d45ad --- /dev/null +++ b/src/cpu/kernels/activation/neon/fp32.cpp @@ -0,0 +1,212 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensorPack.h" +#include "arm_compute/core/Window.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" + +#include <arm_neon.h> +#include <cmath> +#include <cstddef> + +namespace arm_compute +{ +namespace cpu +{ +namespace +{ +#ifndef __aarch64__ +inline float32x4_t mask_float_vector(const float32x4_t &in, const uint32x4_t &mask) +{ + auto int_in = vreinterpretq_u32_f32(in); + return vreinterpretq_f32_u32(wrapper::vand(int_in, mask)); +} +#endif /* __aarch64__ */ +} // namespace + +void fp32_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) +{ + /** SIMD vector tag type. */ + using ExactTagType = typename arm_compute::wrapper::traits::neon_bitvector_tag_t<float, wrapper::traits::BitWidth::W128>; + + constexpr int window_step_x = 4; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const ActivationLayerInfo::ActivationFunction act = act_info.activation(); + + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + + // In case of non-aarch64, a small delta value is added to the input + // to prevent NAN values caused by zeros in inputs to SQRT. + // In case of aarh64, we call vsqrt directly, so we don't use delta. +#ifndef __aarch64__ + const auto delta = wrapper::vdup_n(static_cast<float>(1e-24), ExactTagType {}); +#endif /* __aarch64__ */ + const auto const_1 = wrapper::vdup_n(static_cast<float>(1.f), ExactTagType {}); + const auto const_0 = wrapper::vdup_n(static_cast<float>(0.f), ExactTagType{}); + const auto const_6 = wrapper::vdup_n(static_cast<float>(6.f), ExactTagType{}); + const auto const_3 = wrapper::vdup_n(static_cast<float>(3.f), ExactTagType{}); + const auto const_inv_6 = wrapper::vdup_n(static_cast<float>(0.166666667f), ExactTagType{}); + + constexpr float soft_relu_thresh = 12.f; + const auto vsoft_relu_thresh = wrapper::vdup_n(static_cast<float>(soft_relu_thresh), ExactTagType{}); + + const auto va = wrapper::vdup_n(static_cast<float>(act_info.a()), ExactTagType{}); + const auto vb = wrapper::vdup_n(static_cast<float>(act_info.b()), ExactTagType{}); + const auto a = static_cast<float>(act_info.a()); + const auto b = static_cast<float>(act_info.b()); + execute_window_loop(win_collapsed, [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast<const float *>(input.ptr()); + const auto output_ptr = reinterpret_cast<float *>(output.ptr()); + + wrapper::traits::neon_bitvector_t<float, wrapper::traits::BitWidth::W128> tmp; + + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin = wrapper::vloadq(input_ptr + x); + switch(act) + { + case ActivationLayerInfo::ActivationFunction::ABS: + tmp = wrapper::vabs(vin); + break; + case ActivationLayerInfo::ActivationFunction::LINEAR: + tmp = wrapper::vmla(vb, va, vin); + break; + case ActivationLayerInfo::ActivationFunction::LOGISTIC: + tmp = wrapper::vinv(wrapper::vadd(const_1, wrapper::vexpq(wrapper::vneg(vin)))); + break; + case ActivationLayerInfo::ActivationFunction::RELU: + tmp = wrapper::vmax(const_0, vin); + break; + case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: + tmp = wrapper::vmin(va, wrapper::vmax(const_0, vin)); + break; + case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: + tmp = wrapper::vmin(va, wrapper::vmax(vb, vin)); + break; + case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: + tmp = wrapper::vbsl(wrapper::vcgt(vin, const_0), vin, wrapper::vmul(va, vin)); + break; + case ActivationLayerInfo::ActivationFunction::SOFT_RELU: + tmp = wrapper::vbsl(wrapper::vcgt(vin, vsoft_relu_thresh), vin, wrapper::vlog(wrapper::vadd(const_1, wrapper::vexpq(vin)))); + break; + case ActivationLayerInfo::ActivationFunction::ELU: + tmp = wrapper::vbsl(wrapper::vcge(vin, const_0), vin, wrapper::vmul(va, wrapper::vsub(wrapper::vexpq(vin), const_1))); + break; + case ActivationLayerInfo::ActivationFunction::SQRT: +#ifdef __aarch64__ + tmp = wrapper::vsqrt(vin); +#else /* __aarch64__ */ + { + const auto bitmask = wrapper::vceq(vin, wrapper::vdup_n(0.f, ExactTagType{})); + tmp = wrapper::vinv(wrapper::vinvsqrt(wrapper::vadd(vin, mask_float_vector(delta, bitmask)))); + tmp = mask_float_vector(tmp, wrapper::vnot(bitmask)); + } +#endif /* __aarch64__ */ + break; + case ActivationLayerInfo::ActivationFunction::SQUARE: + tmp = wrapper::vmul(vin, vin); + break; + case ActivationLayerInfo::ActivationFunction::TANH: + tmp = wrapper::vmul(va, wrapper::vtanh(wrapper::vmul(vb, vin))); + break; + case ActivationLayerInfo::ActivationFunction::IDENTITY: + tmp = vin; + break; + case ActivationLayerInfo::ActivationFunction::HARD_SWISH: + tmp = wrapper::vmul(vin, wrapper::vmul(const_inv_6, wrapper::vmin(const_6, wrapper::vmax(const_0, wrapper::vadd(vin, const_3))))); + break; + default: + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + wrapper::vstore(output_ptr + x, tmp); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const float in = *(reinterpret_cast<const float *>(input_ptr + x)); + float tmp; + switch(act) + { + case ActivationLayerInfo::ActivationFunction::ABS: + tmp = std::abs(in); + break; + case ActivationLayerInfo::ActivationFunction::LINEAR: + tmp = a * in + b; + break; + case ActivationLayerInfo::ActivationFunction::LOGISTIC: + tmp = static_cast<float>(1) / (static_cast<float>(1) + std::exp(-in)); + break; + case ActivationLayerInfo::ActivationFunction::RELU: + tmp = std::max<float>(static_cast<float>(0), in); + break; + case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: + tmp = std::min<float>(a, std::max(static_cast<float>(0), in)); + break; + case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: + tmp = std::min<float>(a, std::max<float>(b, in)); + break; + case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: + tmp = (in > 0) ? in : a * in; + break; + case ActivationLayerInfo::ActivationFunction::SOFT_RELU: + tmp = (in > soft_relu_thresh) ? in : std::log(static_cast<float>(1) + std::exp(in)); + break; + case ActivationLayerInfo::ActivationFunction::ELU: + tmp = (in >= 0) ? in : a * (std::exp(in) - 1); + break; + case ActivationLayerInfo::ActivationFunction::SQRT: + tmp = std::sqrt(in); + break; + case ActivationLayerInfo::ActivationFunction::SQUARE: + tmp = in * in; + break; + case ActivationLayerInfo::ActivationFunction::TANH: + tmp = a * std::tanh(b * in); + break; + case ActivationLayerInfo::ActivationFunction::IDENTITY: + tmp = in; + break; + case ActivationLayerInfo::ActivationFunction::HARD_SWISH: + tmp = in * ((std::min(std::max((in + 3), 0.0f), 6.0f)) * 0.166666667f); + break; + default: + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + *(output_ptr + x) = tmp; + } + }, + input, output); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/activation/neon/qasymm8.cpp b/src/cpu/kernels/activation/neon/qasymm8.cpp new file mode 100644 index 0000000000..a1217435b6 --- /dev/null +++ b/src/cpu/kernels/activation/neon/qasymm8.cpp @@ -0,0 +1,262 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Window.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" + +#include <arm_neon.h> +#include <cmath> +#include <cstddef> + +namespace arm_compute +{ +namespace cpu +{ +void qasymm8_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) +{ + constexpr int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const ActivationLayerInfo::ActivationFunction act = act_info.activation(); + + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + + const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform(); + const qasymm8x16_t va = vdupq_n_u8(quantize_qasymm8(act_info.a(), qi_in)); + const qasymm8x16_t vb = vdupq_n_u8(quantize_qasymm8(act_info.b(), qi_in)); + const qasymm8_t a = quantize_qasymm8(act_info.a(), qi_in); + const qasymm8_t b = quantize_qasymm8(act_info.b(), qi_in); + const qasymm8_t const_0 = quantize_qasymm8(0.f, qi_in); + const qasymm8x16_t vconst_0 = vdupq_n_u8(const_0); + const auto vconst_1 = vdupq_n_f32(1.f); +#ifndef __aarch64__ + const auto vconst_0_f32 = vdupq_n_f32(0); +#endif // __aarch64__ + const float32x4_t va_f32 = vdupq_n_f32(act_info.a()); + const float32x4_t vb_f32 = vdupq_n_f32(act_info.b()); + const float a_f32 = act_info.a(); + const float b_f32 = act_info.b(); + const auto const_6_f32 = vdupq_n_f32(6.f); + const auto const_0_f32 = vdupq_n_f32(0.f); + const auto const_3_f32 = vdupq_n_f32(3.f); + const auto const_inv_6_f32 = vdupq_n_f32(0.166666667f); + + // Initialise scale/offset for re-quantization + float s = qi_in.scale / qi_out.scale; + float o = -qi_in.offset * s + qi_out.offset; + float32x4_t vs = vdupq_n_f32(s); + float32x4_t vo = vdupq_n_f32(o); + + execute_window_loop(win_collapsed, [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast<const qasymm8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<qasymm8_t *>(output.ptr()); + + wrapper::traits::neon_bitvector_t<qasymm8_t, wrapper::traits::BitWidth::W128> tmp; + + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin = wrapper::vloadq(input_ptr + x); + if(act == ActivationLayerInfo::ActivationFunction::RELU) + { + // Perform activation + tmp = vmaxq_u8(vconst_0, vin); + // Re-quantize to new output space + tmp = vmlaq_qasymm8(tmp, vs, vo); + } + else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + { + // Perform activation + tmp = vminq_u8(va, vmaxq_u8(vconst_0, vin)); + // Re-quantize to new output space + tmp = vmlaq_qasymm8(tmp, vs, vo); + } + else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + // Perform activation + tmp = vminq_u8(va, vmaxq_u8(vb, vin)); + // Re-quantize to new output space + tmp = vmlaq_qasymm8(tmp, vs, vo); + } + else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + // De-quantize + const auto vin_deq = vdequantize(vin, qi_in); + // Perform activation + const float32x4x4_t tmp_dep = + { + { + wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))), + wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))), + wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[2])))), + wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[3])))), + } + }; + // Re-quantize to new output space + tmp = vquantize(tmp_dep, qi_out); + } + else if(act == ActivationLayerInfo::ActivationFunction::TANH) + { + // De-quantize + const auto vin_deq = vdequantize(vin, qi_in); + // Perform activation + const float32x4x4_t tmp_dep = + { + { + wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))), + wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))), + wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[2], vb_f32))), + wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[3], vb_f32))), + } + }; + // Re-quantize to new output space + tmp = vquantize(tmp_dep, qi_out); + } + else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) + { + // De-quantize + const auto vin_deq = vdequantize(vin, qi_in); + // Perform activation + const float32x4x4_t tmp_dep = + { + { + wrapper::vmul(vin_deq.val[0], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))), + wrapper::vmul(vin_deq.val[1], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))), + wrapper::vmul(vin_deq.val[2], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))), + wrapper::vmul(vin_deq.val[3], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))), + } + }; + // Re-quantize to new output space + tmp = vquantize(tmp_dep, qi_out); + } + else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) + { + const auto vin_deq = vdequantize(vin, qi_in); + +#ifdef __aarch64__ + const uint32x4x4_t pos_mask = + { + { + wrapper::vcgtz(vin_deq.val[0]), + wrapper::vcgtz(vin_deq.val[1]), + wrapper::vcgtz(vin_deq.val[2]), + wrapper::vcgtz(vin_deq.val[3]), + } + }; +#else // __aarch64__ + const uint32x4x4_t pos_mask = + { + { + wrapper::vcgt(vin_deq.val[0], vconst_0_f32), + wrapper::vcgt(vin_deq.val[1], vconst_0_f32), + wrapper::vcgt(vin_deq.val[2], vconst_0_f32), + wrapper::vcgt(vin_deq.val[3], vconst_0_f32), + } + }; +#endif // __aarch64__ + + const float32x4x4_t tmp_dep = + { + { + wrapper::vbsl(pos_mask.val[0], vin_deq.val[0], wrapper::vmul(va_f32, vin_deq.val[0])), + wrapper::vbsl(pos_mask.val[1], vin_deq.val[1], wrapper::vmul(va_f32, vin_deq.val[1])), + wrapper::vbsl(pos_mask.val[2], vin_deq.val[2], wrapper::vmul(va_f32, vin_deq.val[2])), + wrapper::vbsl(pos_mask.val[3], vin_deq.val[3], wrapper::vmul(va_f32, vin_deq.val[3])), + } + }; + + tmp = vquantize(tmp_dep, qi_out); + } + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + wrapper::vstore(output_ptr + x, tmp); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + qasymm8_t in = *(reinterpret_cast<const qasymm8_t *>(input_ptr + x)); + qasymm8_t tmp = 0; + if(act == ActivationLayerInfo::ActivationFunction::RELU) + { + tmp = std::max(const_0, in); + tmp = utility::clamp<int32_t, qasymm8_t>(tmp * s + o); + } + else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + { + tmp = std::min(a, std::max(const_0, in)); + tmp = utility::clamp<int32_t, qasymm8_t>(tmp * s + o); + } + else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + tmp = std::min(a, std::max(b, in)); + tmp = utility::clamp<int32_t, qasymm8_t>(tmp * s + o); + } + else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + float tmp_f = dequantize_qasymm8(in, qi_in); + tmp_f = 1.f / (1.f + std::exp(-tmp_f)); + tmp = quantize_qasymm8(tmp_f, qi_out); + } + else if(act == ActivationLayerInfo::ActivationFunction::TANH) + { + float tmp_f = dequantize_qasymm8(in, qi_in); + tmp_f = a_f32 * std::tanh(b_f32 * tmp_f); + tmp = quantize_qasymm8(tmp_f, qi_out); + } + else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) + { + float tmp_f = dequantize_qasymm8(in, qi_in); + tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f); + tmp = quantize_qasymm8(tmp_f, qi_out); + } + else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) + { + float tmp_f = dequantize_qasymm8(in, qi_in); + tmp_f = tmp_f > 0 ? tmp_f : tmp_f * a_f32; + tmp = quantize_qasymm8(tmp_f, qi_out); + } + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + *(output_ptr + x) = tmp; + } + }, + input, output); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/activation/neon/qasymm8_signed.cpp b/src/cpu/kernels/activation/neon/qasymm8_signed.cpp new file mode 100644 index 0000000000..8b40bf8e72 --- /dev/null +++ b/src/cpu/kernels/activation/neon/qasymm8_signed.cpp @@ -0,0 +1,261 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Window.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" + +#include <arm_neon.h> +#include <cmath> +#include <cstddef> + +namespace arm_compute +{ +namespace cpu +{ +void qasymm8_signed_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) +{ + constexpr int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const ActivationLayerInfo::ActivationFunction act = act_info.activation(); + + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + + const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform(); + const qasymm8x16_signed_t va = vdupq_n_s8(quantize_qasymm8_signed(act_info.a(), qi_in)); + const qasymm8x16_signed_t vb = vdupq_n_s8(quantize_qasymm8_signed(act_info.b(), qi_in)); + const qasymm8_signed_t a = quantize_qasymm8_signed(act_info.a(), qi_in); + const qasymm8_signed_t b = quantize_qasymm8_signed(act_info.b(), qi_in); + const qasymm8_signed_t const_0 = quantize_qasymm8_signed(0.f, qi_in); + const qasymm8x16_signed_t vconst_0 = vdupq_n_s8(const_0); + const auto vconst_1 = vdupq_n_f32(1.f); +#ifndef __aarch64__ + const auto vconst_0_f32 = vdupq_n_f32(1.f); +#endif // __aarch64__ + const float32x4_t va_f32 = vdupq_n_f32(act_info.a()); + const float32x4_t vb_f32 = vdupq_n_f32(act_info.b()); + const float a_f32 = act_info.a(); + const float b_f32 = act_info.b(); + const auto const_6_f32 = vdupq_n_f32(6.f); + const auto const_0_f32 = vdupq_n_f32(0.f); + const auto const_3_f32 = vdupq_n_f32(3.f); + const auto const_inv_6_f32 = vdupq_n_f32(0.166666667f); + + // Initialise scale/offset for re-quantization + float s = qi_in.scale / qi_out.scale; + float o = -qi_in.offset * s + qi_out.offset; + float32x4_t vs = vdupq_n_f32(s); + float32x4_t vo = vdupq_n_f32(o); + + execute_window_loop(win_collapsed, [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast<const qasymm8_signed_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<qasymm8_signed_t *>(output.ptr()); + + wrapper::traits::neon_bitvector_t<qasymm8_signed_t, wrapper::traits::BitWidth::W128> tmp; + + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin = wrapper::vloadq(input_ptr + x); + if(act == ActivationLayerInfo::ActivationFunction::RELU) + { + // Perform activation + tmp = vmaxq_s8(vconst_0, vin); + // Re-quantize to new output space + tmp = vmlaq_qasymm8_signed(tmp, vs, vo); + } + else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + { + // Perform activation + tmp = vminq_s8(va, vmaxq_s8(vconst_0, vin)); + // Re-quantize to new output space + tmp = vmlaq_qasymm8_signed(tmp, vs, vo); + } + else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + // Perform activation + tmp = vminq_s8(va, vmaxq_s8(vb, vin)); + // Re-quantize to new output space + tmp = vmlaq_qasymm8_signed(tmp, vs, vo); + } + else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + // De-quantize + const auto vin_deq = vdequantize(vin, qi_in); + // Perform activation + const float32x4x4_t tmp_dep = + { + { + wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))), + wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))), + wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[2])))), + wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[3])))), + } + }; + // Re-quantize to new output space + tmp = vquantize_signed(tmp_dep, qi_out); + } + else if(act == ActivationLayerInfo::ActivationFunction::TANH) + { + // De-quantize + const auto vin_deq = vdequantize(vin, qi_in); + // Perform activation + const float32x4x4_t tmp_dep = + { + { + wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))), + wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))), + wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[2], vb_f32))), + wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[3], vb_f32))), + } + }; + // Re-quantize to new output space + tmp = vquantize_signed(tmp_dep, qi_out); + } + else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) + { + // De-quantize + const auto vin_deq = vdequantize(vin, qi_in); + // Perform activation + const float32x4x4_t tmp_dep = + { + { + wrapper::vmul(vin_deq.val[0], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))), + wrapper::vmul(vin_deq.val[1], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))), + wrapper::vmul(vin_deq.val[2], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))), + wrapper::vmul(vin_deq.val[3], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))), + } + }; + // Re-quantize to new output space + tmp = vquantize_signed(tmp_dep, qi_out); + } + else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) + { + const auto vin_deq = vdequantize(vin, qi_in); + +#ifdef __aarch64__ + const uint32x4x4_t pos_mask = + { + { + wrapper::vcgtz(vin_deq.val[0]), + wrapper::vcgtz(vin_deq.val[1]), + wrapper::vcgtz(vin_deq.val[2]), + wrapper::vcgtz(vin_deq.val[3]), + } + }; +#else // __aarch64__ + const uint32x4x4_t pos_mask = + { + { + wrapper::vcgt(vin_deq.val[0], vconst_0_f32), + wrapper::vcgt(vin_deq.val[1], vconst_0_f32), + wrapper::vcgt(vin_deq.val[2], vconst_0_f32), + wrapper::vcgt(vin_deq.val[3], vconst_0_f32), + } + }; +#endif // __aarch64__ + + const float32x4x4_t tmp_dep = + { + { + wrapper::vbsl(pos_mask.val[0], vin_deq.val[0], wrapper::vmul(va_f32, vin_deq.val[0])), + wrapper::vbsl(pos_mask.val[1], vin_deq.val[1], wrapper::vmul(va_f32, vin_deq.val[1])), + wrapper::vbsl(pos_mask.val[2], vin_deq.val[2], wrapper::vmul(va_f32, vin_deq.val[2])), + wrapper::vbsl(pos_mask.val[3], vin_deq.val[3], wrapper::vmul(va_f32, vin_deq.val[3])), + } + }; + + tmp = vquantize_signed(tmp_dep, qi_out); + } + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + wrapper::vstore(output_ptr + x, tmp); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + qasymm8_signed_t in = *(reinterpret_cast<const qasymm8_signed_t *>(input_ptr + x)); + qasymm8_signed_t tmp = 0; + if(act == ActivationLayerInfo::ActivationFunction::RELU) + { + tmp = std::max(const_0, in); + tmp = utility::clamp<int32_t, qasymm8_signed_t>(tmp * s + o); + } + else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + { + tmp = std::min(a, std::max(const_0, in)); + tmp = utility::clamp<int32_t, qasymm8_signed_t>(tmp * s + o); + } + else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + tmp = std::min(a, std::max(b, in)); + tmp = utility::clamp<int32_t, qasymm8_signed_t>(tmp * s + o); + } + else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + float tmp_f = dequantize_qasymm8_signed(in, qi_in); + tmp_f = 1.f / (1.f + std::exp(-tmp_f)); + tmp = quantize_qasymm8_signed(tmp_f, qi_out); + } + else if(act == ActivationLayerInfo::ActivationFunction::TANH) + { + float tmp_f = dequantize_qasymm8_signed(in, qi_in); + tmp_f = a_f32 * std::tanh(b_f32 * tmp_f); + tmp = quantize_qasymm8_signed(tmp_f, qi_out); + } + else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) + { + float tmp_f = dequantize_qasymm8_signed(in, qi_in); + tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f); + tmp = quantize_qasymm8_signed(tmp_f, qi_out); + } + else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) + { + float tmp_f = dequantize_qasymm8_signed(in, qi_in); + tmp_f = tmp_f > 0 ? tmp_f : tmp_f * a_f32; + tmp = quantize_qasymm8_signed(tmp_f, qi_out); + } + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + *(output_ptr + x) = tmp; + } + }, + input, output); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/activation/neon/qsymm16.cpp b/src/cpu/kernels/activation/neon/qsymm16.cpp new file mode 100644 index 0000000000..54b41820f2 --- /dev/null +++ b/src/cpu/kernels/activation/neon/qsymm16.cpp @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensorPack.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/core/experimental/Types.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/NESymm.h" +#include "src/core/NEON/wrapper/wrapper.h" + +#include <arm_neon.h> +#include <cmath> +#include <cstddef> + +namespace arm_compute +{ +namespace cpu +{ +void qsymm16_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) +{ + constexpr int window_step_x = 8; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const ActivationLayerInfo::ActivationFunction act = act_info.activation(); + + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + + const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform(); + const auto vconst_1 = vdupq_n_f32(1.f); + const float32x4_t va_f32 = vdupq_n_f32(act_info.a()); + const float32x4_t vb_f32 = vdupq_n_f32(act_info.b()); + const float a_f32 = act_info.a(); + const float b_f32 = act_info.b(); + + execute_window_loop(win_collapsed, [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast<const qsymm16_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<qsymm16_t *>(output.ptr()); + + wrapper::traits::neon_bitvector_t<qsymm16_t, wrapper::traits::BitWidth::W128> tmp; + ARM_COMPUTE_UNUSED(tmp); + + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin = wrapper::vloadq(input_ptr + x); + if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + // De-quantize + const auto vin_deq = vdequantize_int16(vin, qi_in.scale); + // Perform activation + const float32x4x2_t tmp_dep = + { + { + wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))), + wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))), + } + }; + // Re-quantize to new output space + tmp = vquantize_int16(tmp_dep, qi_out.scale); + } + else if(act == ActivationLayerInfo::ActivationFunction::TANH) + { + // De-quantize + const auto vin_deq = vdequantize_int16(vin, qi_in.scale); + // Perform activation + const float32x4x2_t tmp_dep = + { + { + wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))), + wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))), + } + }; + // Re-quantize to new output space + tmp = vquantize_int16(tmp_dep, qi_out.scale); + } + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + wrapper::vstore(output_ptr + x, tmp); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + qsymm16_t in = *(reinterpret_cast<const qsymm16_t *>(input_ptr + x)); + qsymm16_t tmp = 0; + if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + float tmp_f = dequantize_qsymm16(in, qi_in.scale); + tmp_f = 1.f / (1.f + std::exp(-tmp_f)); + tmp = quantize_qsymm16(tmp_f, qi_out); + } + else if(act == ActivationLayerInfo::ActivationFunction::TANH) + { + float tmp_f = dequantize_qsymm16(in, qi_in.scale); + tmp_f = a_f32 * std::tanh(b_f32 * tmp_f); + tmp = quantize_qsymm16(tmp_f, qi_out); + } + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + *(output_ptr + x) = tmp; + } + }, + input, output); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/activation/sve/fp16.cpp b/src/cpu/kernels/activation/sve/fp16.cpp new file mode 100644 index 0000000000..5e76e82c52 --- /dev/null +++ b/src/cpu/kernels/activation/sve/fp16.cpp @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(__ARM_FEATURE_SVE) +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensorPack.h" +#include "arm_compute/core/Window.h" + +#include <cmath> +#include <cstddef> + +#include "src/core/NEON/SVEMath.h" +#include <arm_sve.h> + +namespace arm_compute +{ +namespace cpu +{ +void fp16_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) +{ + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const ActivationLayerInfo::ActivationFunction act = act_info.activation(); + + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + + const auto const_1 = svdup_n_f16(1.f); + const auto const_0 = svdup_n_f16(0.f); + const auto const_6 = svdup_n_f16(6.f); + const auto const_3 = svdup_n_f16(3.f); + const auto const_inv_6 = svdup_n_f16(0.166666667f); + + const auto va = svdup_n_f16(act_info.a()); + const auto vb = svdup_n_f16(act_info.b()); + execute_window_loop(win_collapsed, [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast<const float16_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr()); + + svfloat16_t tmp; + + int x = window_start_x; + svbool_t pg = svwhilelt_b16(x, window_end_x); + do + { + const auto vin = svld1_f16(pg, input_ptr + x); + switch(act) + { + case ActivationLayerInfo::ActivationFunction::ABS: + tmp = svabs_f16_z(pg, vin); + break; + case ActivationLayerInfo::ActivationFunction::LINEAR: + tmp = svmla_f16_z(pg, vb, va, vin); + break; + case ActivationLayerInfo::ActivationFunction::LOGISTIC: + tmp = svinv_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, svneg_f16_z(pg, vin)))); + break; + case ActivationLayerInfo::ActivationFunction::RELU: + tmp = svmax_f16_z(pg, const_0, vin); + break; + case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: + tmp = svmin_f16_z(pg, va, svmax_f16_z(pg, const_0, vin)); + break; + case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: + tmp = svmin_f16_z(pg, va, svmax_f16_z(pg, vb, vin)); + break; + case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: + tmp = svadd_f16_z(pg, svmul_f16_z(pg, svmin_f16_z(pg, vin, const_0), va), svmax_f16_z(pg, vin, const_0)); + break; + case ActivationLayerInfo::ActivationFunction::SOFT_RELU: + tmp = svlog_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, vin))); + break; + case ActivationLayerInfo::ActivationFunction::ELU: + tmp = svsel_f16(svcmpgt_f16(pg, vin, const_0), vin, svmul_f16_z(pg, va, svsub_f16_z(pg, svexp_f16_z(pg, vin), const_1))); + break; + case ActivationLayerInfo::ActivationFunction::SQRT: + tmp = svsqrt_f16_z(pg, vin); + break; + case ActivationLayerInfo::ActivationFunction::SQUARE: + tmp = svmul_f16_z(pg, vin, vin); + break; + case ActivationLayerInfo::ActivationFunction::TANH: + tmp = svmul_f16_z(pg, va, svtanh_f16_z(pg, svmul_f16_z(pg, vb, vin))); + break; + case ActivationLayerInfo::ActivationFunction::IDENTITY: + tmp = vin; + break; + case ActivationLayerInfo::ActivationFunction::HARD_SWISH: + tmp = svmul_f16_z(pg, vin, svmul_f16_z(pg, const_inv_6, svmin_f16_z(pg, const_6, svmax_f16_z(pg, const_0, svadd_f16_z(pg, vin, const_3))))); + break; + default: + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + svst1_f16(pg, output_ptr + x, tmp); + + x += svcnth(); + pg = svwhilelt_b16(x, window_end_x); + + } + while(svptest_any(svptrue_b16(), pg)); + }, + input, output); +} +} // namespace cpu +} // namespace arm_compute +#endif /* defined(__ARM_FEATURE_SVE) */
\ No newline at end of file diff --git a/src/cpu/kernels/activation/sve/fp32.cpp b/src/cpu/kernels/activation/sve/fp32.cpp new file mode 100644 index 0000000000..cb9f82eb39 --- /dev/null +++ b/src/cpu/kernels/activation/sve/fp32.cpp @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(__ARM_FEATURE_SVE) +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensorPack.h" +#include "arm_compute/core/Window.h" +#include "src/core/NEON/SVEMath.h" + +#include <cmath> +#include <cstddef> + +#include <arm_sve.h> + +namespace arm_compute +{ +namespace cpu +{ +void fp32_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) +{ + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const ActivationLayerInfo::ActivationFunction act = act_info.activation(); + + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + + const auto const_1 = svdup_n_f32(1.f); + const auto const_0 = svdup_n_f32(0.f); + const auto const_6 = svdup_n_f32(6.f); + const auto const_3 = svdup_n_f32(3.f); + const auto const_inv_6 = svdup_n_f32(0.166666667f); + + const auto va = svdup_n_f32(act_info.a()); + const auto vb = svdup_n_f32(act_info.b()); + execute_window_loop(win_collapsed, [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast<const float *>(input.ptr()); + const auto output_ptr = reinterpret_cast<float *>(output.ptr()); + + svfloat32_t tmp; + + // Compute S elements per iteration + int x = window_start_x; + svbool_t pg = svwhilelt_b32(x, window_end_x); + do + { + const auto vin = svld1_f32(pg, input_ptr + x); + switch(act) + { + case ActivationLayerInfo::ActivationFunction::ABS: + tmp = svabs_f32_z(pg, vin); + break; + case ActivationLayerInfo::ActivationFunction::LINEAR: + tmp = svmla_f32_z(pg, vb, va, vin); + break; + case ActivationLayerInfo::ActivationFunction::LOGISTIC: + tmp = svinv_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, svneg_f32_z(pg, vin)))); + break; + case ActivationLayerInfo::ActivationFunction::RELU: + tmp = svmax_f32_z(pg, const_0, vin); + break; + case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: + tmp = svmin_f32_z(pg, va, svmax_f32_z(pg, const_0, vin)); + break; + case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: + tmp = svmin_f32_z(pg, va, svmax_f32_z(pg, vb, vin)); + break; + case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: + tmp = svadd_f32_z(pg, svmul_f32_z(pg, svmin_f32_z(pg, vin, const_0), va), svmax_f32_z(pg, vin, const_0)); + break; + case ActivationLayerInfo::ActivationFunction::SOFT_RELU: + tmp = svlog_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, vin))); + break; + case ActivationLayerInfo::ActivationFunction::ELU: + tmp = svsel_f32(svcmpgt_f32(pg, vin, const_0), vin, svmul_f32_z(pg, va, svsub_f32_z(pg, svexp_f32_z(pg, vin), const_1))); + break; + case ActivationLayerInfo::ActivationFunction::SQRT: + tmp = svsqrt_f32_z(pg, vin); + break; + case ActivationLayerInfo::ActivationFunction::SQUARE: + tmp = svmul_f32_z(pg, vin, vin); + break; + case ActivationLayerInfo::ActivationFunction::TANH: + tmp = svmul_f32_z(pg, va, svtanh_f32_z(pg, svmul_f32_z(pg, vb, vin))); + break; + case ActivationLayerInfo::ActivationFunction::IDENTITY: + tmp = vin; + break; + case ActivationLayerInfo::ActivationFunction::HARD_SWISH: + tmp = svmul_f32_z(pg, vin, svmul_f32_z(pg, const_inv_6, svmin_f32_z(pg, const_6, svmax_f32_z(pg, const_0, svadd_f32_z(pg, vin, const_3))))); + break; + default: + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + svst1_f32(pg, output_ptr + x, tmp); + + x += svcntw(); + pg = svwhilelt_b32(x, window_end_x); + + } + while(svptest_any(svptrue_b32(), pg)); + }, + input, output); +} +} // namespace cpu +} // namespace arm_compute +#endif /* defined(__ARM_FEATURE_SVE) */
\ No newline at end of file diff --git a/src/cpu/kernels/activation/sve/qasymm8.cpp b/src/cpu/kernels/activation/sve/qasymm8.cpp new file mode 100644 index 0000000000..69fffd96c5 --- /dev/null +++ b/src/cpu/kernels/activation/sve/qasymm8.cpp @@ -0,0 +1,253 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(ARM_COMPUTE_ENABLE_SVE2) +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Window.h" + +#include <cmath> +#include <cstddef> + +#include "src/core/NEON/SVEAsymm.h" +#include "src/core/NEON/SVEMath.h" +#include <arm_sve.h> + +namespace arm_compute +{ +namespace cpu +{ +void qasymm8_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) +{ + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const ActivationLayerInfo::ActivationFunction act = act_info.activation(); + + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + + const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform(); + const auto va = svdup_n_u8(quantize_qasymm8(act_info.a(), qi_in)); + const auto vb = svdup_n_u8(quantize_qasymm8(act_info.b(), qi_in)); + const auto const_0 = quantize_qasymm8(0.f, qi_in); + const auto vconst_0 = svdup_n_u8(const_0); + const auto vconst_1 = svdup_n_f32(1.f); + const auto va_f32 = svdup_n_f32(act_info.a()); + const auto vb_f32 = svdup_n_f32(act_info.b()); + const auto const_6_f32 = svdup_n_f32(6.f); + const auto const_0_f32 = svdup_n_f32(0.f); + const auto const_3_f32 = svdup_n_f32(3.f); + const auto const_inv_6_f32 = svdup_n_f32(0.166666667f); + + // Initialise scale/offset for re-quantization + bool requant = true; + if(qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset) + { + requant = false; + } + float s = qi_in.scale / qi_out.scale; + float o = -qi_in.offset * s + qi_out.offset; + auto vs = svdup_n_f32(s); + auto vo = svdup_n_f32(o); + + // Initialise scale/offset for re-quantization with int32_t + const auto voffset_in = svdup_n_s32(qi_in.offset); + int32_t s_s32 = round(s * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN); + int32_t o_s32 = round(o * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN); + const auto vs_s32 = svdup_n_s32(s_s32); + const auto vo_s32 = svdup_n_s32(o_s32); + + // Initialise scale/offset for re-quantization for leaky relu + int32_t s_leaky_s32 = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN); + int32_t o_leaky_s32 = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8), + arm_compute::RoundingPolicy::TO_NEAREST_EVEN); + const auto vs_leaky_s32 = svdup_n_s32(s_leaky_s32); + const auto vo_leaky_s32 = svdup_n_s32(o_leaky_s32); + + execute_window_loop(win_collapsed, [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); + + svuint8_t tmp; + + int x = window_start_x; + svbool_t pg = svwhilelt_b8(x, window_end_x); + do + { + const auto vin = svld1_u8(pg, input_ptr + x); + if(act == ActivationLayerInfo::ActivationFunction::RELU) + { + // Perform activation + tmp = svmax_u8_z(pg, vconst_0, vin); + // Re-quantize to new output space + tmp = requant ? svmla_qasymm8_z(pg, tmp, vs, vo) : tmp; + } + else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + { + // Perform activation + tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vconst_0, vin)); + // Re-quantize to new output space + tmp = requant ? svmla_qasymm8_z(pg, tmp, vs, vo) : tmp; + } + else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + // Perform activation + tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vb, vin)); + // Re-quantize to new output space + tmp = svmla_qasymm8_z(pg, tmp, vs, vo); + } + else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + // De-quantize + const auto vin_deq = svdequantize_z(pg, vin, qi_in); + // Perform activation + const svfloat32x4_t tmp_dep = + { + { { + svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))), + svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))), + svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))), + svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3))))), + } + } + }; + // Re-quantize to new output space + tmp = svquantize_z(pg, tmp_dep, qi_out); + } + else if(act == ActivationLayerInfo::ActivationFunction::TANH) + { + // De-quantize + const auto vin_deq = svdequantize_z(pg, vin, qi_in); + // Perform activation + const svfloat32x4_t tmp_dep = + { + { { + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))), + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))), + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))), + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32))), + } + } + }; + // Re-quantize to new output space + tmp = svquantize_z(pg, tmp_dep, qi_out); + } + else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) + { + // De-quantize + const auto vin_deq = svdequantize_z(pg, vin, qi_in); + // Perform activation + const svfloat32x4_t tmp_dep = + { + { { + svmul_f32_z(pg, svget4_f32(vin_deq, 0), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 0), const_3_f32))))), + svmul_f32_z(pg, svget4_f32(vin_deq, 1), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 1), const_3_f32))))), + svmul_f32_z(pg, svget4_f32(vin_deq, 2), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 2), const_3_f32))))), + svmul_f32_z(pg, svget4_f32(vin_deq, 3), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 3), const_3_f32))))), + } + } + }; + // Re-quantize to new output space + tmp = svquantize_z(pg, tmp_dep, qi_out); + } + else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) + { + svbool_t p0, p1, p2, p3; + svint32x4_t tmp_dep; + + // Expand to int32 + const svint32x4_t vin_s32 = + { + { { + svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(vin))), + svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(vin))), + svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(vin))), + svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(vin))), + } + } + }; + + // Compare elements to input offset + if(qi_in.scale >= 0) + { + p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in); + p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in); + p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in); + p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in); + } + else + { + p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in); + p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in); + p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in); + p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in); + } + + // Multiply negative elements and requantize if necessary + if(requant) + { + tmp_dep = svcreate4_s32( + svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0), svsel(p0, vs_leaky_s32, vs_s32)), 8), + svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1), svsel(p1, vs_leaky_s32, vs_s32)), 8), + svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2), svsel(p2, vs_leaky_s32, vs_s32)), 8), + svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3), svsel(p3, vs_leaky_s32, vs_s32)), 8)); + } + else + { + tmp_dep = svcreate4_s32( + svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8), + svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8), + svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8), + svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8)); + } + + // Convert uint32 vectors to uint16 vectors (with saturation) + const auto v_low_u16 = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1)); + const auto v_high_u16 = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 3)); + + // convert uint16 vectors to uint8 vectors (with saturation) + tmp = svqxtnt_u16(svqxtnb_u16(v_low_u16), v_high_u16); + } + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + + svst1_u8(pg, output_ptr + x, tmp); + + x += svcntb(); + pg = svwhilelt_b8(x, window_end_x); + + } + while(svptest_any(svptrue_b8(), pg)); + + }, + input, output); +} +} // namespace cpu +} // namespace arm_compute +#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */
\ No newline at end of file diff --git a/src/cpu/kernels/activation/sve/qasymm8_signed.cpp b/src/cpu/kernels/activation/sve/qasymm8_signed.cpp new file mode 100644 index 0000000000..53ee515ff9 --- /dev/null +++ b/src/cpu/kernels/activation/sve/qasymm8_signed.cpp @@ -0,0 +1,253 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Window.h" +#include "src/core/NEON/wrapper/wrapper.h" + +#include <cmath> +#include <cstddef> + +#if defined(ARM_COMPUTE_ENABLE_SVE2) +#include "src/core/NEON/SVEAsymm.h" +#include "src/core/NEON/SVEMath.h" +#include <arm_sve.h> + +namespace arm_compute +{ +namespace cpu +{ +void qasymm8_signed_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) +{ + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const ActivationLayerInfo::ActivationFunction act = act_info.activation(); + + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + + const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform(); + const auto va = svdup_n_s8(quantize_qasymm8_signed(act_info.a(), qi_in)); + const auto vb = svdup_n_s8(quantize_qasymm8_signed(act_info.b(), qi_in)); + const auto const_0 = quantize_qasymm8_signed(0.f, qi_in); + const auto vconst_0 = svdup_n_s8(const_0); + const auto vconst_1 = svdup_n_f32(1.f); + const auto va_f32 = svdup_n_f32(act_info.a()); + const auto vb_f32 = svdup_n_f32(act_info.b()); + const auto const_6_f32 = svdup_n_f32(6.f); + const auto const_0_f32 = svdup_n_f32(0.f); + const auto const_3_f32 = svdup_n_f32(3.f); + const auto const_inv_6_f32 = svdup_n_f32(0.166666667f); + + // Initialise scale/offset for re-quantization + bool requant = true; + if(qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset) + { + requant = false; + } + float s = qi_in.scale / qi_out.scale; + float o = -qi_in.offset * s + qi_out.offset; + auto vs = svdup_n_f32(s); + auto vo = svdup_n_f32(o); + + // Initialise scale/offset for re-quantization with int32_t + const auto voffset_in = svdup_n_s32(qi_in.offset); + int32_t s_s32 = round(s * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN); + int32_t o_s32 = round(o * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN); + const auto vs_s32 = svdup_n_s32(s_s32); + const auto vo_s32 = svdup_n_s32(o_s32); + + // Initialise scale/offset for re-quantization for leaky relu + int32_t s_leaky_s32 = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN); + int32_t o_leaky_s32 = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8), + arm_compute::RoundingPolicy::TO_NEAREST_EVEN); + const auto vs_leaky_s32 = svdup_n_s32(s_leaky_s32); + const auto vo_leaky_s32 = svdup_n_s32(o_leaky_s32); + + execute_window_loop(win_collapsed, [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast<const int8_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); + + svint8_t tmp; + + int x = window_start_x; + svbool_t pg = svwhilelt_b8(x, window_end_x); + do + { + const auto vin = svld1_s8(pg, input_ptr + x); + if(act == ActivationLayerInfo::ActivationFunction::RELU) + { + // Perform activation + tmp = svmax_s8_z(pg, vconst_0, vin); + // Re-quantize to new output space + tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp; + } + else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + { + // Perform activation + tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vconst_0, vin)); + // Re-quantize to new output space + tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp; + } + else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + // Perform activation + tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vb, vin)); + // Re-quantize to new output space + tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp; + } + else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + // De-quantize + const auto vin_deq = svdequantize_z(pg, vin, qi_in); + // Perform activation + const svfloat32x4_t tmp_dep = + { + { { + svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))), + svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))), + svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))), + svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3))))), + } + } + }; + // Re-quantize to new output space + tmp = svquantize_signed_z(pg, tmp_dep, qi_out); + } + else if(act == ActivationLayerInfo::ActivationFunction::TANH) + { + // De-quantize + const auto vin_deq = svdequantize_z(pg, vin, qi_in); + // Perform activation + const svfloat32x4_t tmp_dep = + { + { { + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))), + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))), + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))), + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32))), + } + } + }; + // Re-quantize to new output space + tmp = svquantize_signed_z(pg, tmp_dep, qi_out); + } + else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) + { + // De-quantize + const auto vin_deq = svdequantize_z(pg, vin, qi_in); + // Perform activation + const svfloat32x4_t tmp_dep = + { + { { + svmul_f32_z(pg, svget4_f32(vin_deq, 0), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 0), const_3_f32))))), + svmul_f32_z(pg, svget4_f32(vin_deq, 1), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 1), const_3_f32))))), + svmul_f32_z(pg, svget4_f32(vin_deq, 2), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 2), const_3_f32))))), + svmul_f32_z(pg, svget4_f32(vin_deq, 3), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 3), const_3_f32))))), + } + } + }; + // Re-quantize to new output space + tmp = svquantize_signed_z(pg, tmp_dep, qi_out); + } + else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) + { + svbool_t p0, p1, p2, p3; + svint32x4_t tmp_dep; + + // Expand to int32 + const svint32x4_t vin_s32 = + { + { { + svmovlb_s32(svmovlb_s16(vin)), + svmovlt_s32(svmovlb_s16(vin)), + svmovlb_s32(svmovlt_s16(vin)), + svmovlt_s32(svmovlt_s16(vin)), + } + } + }; + + // Compare elements to input offset + if(qi_in.scale >= 0) + { + p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in); + p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in); + p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in); + p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in); + } + else + { + p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in); + p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in); + p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in); + p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in); + } + + // Multiply negative elements and requantize if necessary + if(requant) + { + tmp_dep = svcreate4_s32( + svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0), svsel(p0, vs_leaky_s32, vs_s32)), 8), + svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1), svsel(p1, vs_leaky_s32, vs_s32)), 8), + svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2), svsel(p2, vs_leaky_s32, vs_s32)), 8), + svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3), svsel(p3, vs_leaky_s32, vs_s32)), 8)); + } + else + { + tmp_dep = svcreate4_s32( + svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8), + svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8), + svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8), + svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8)); + } + + // Convert uint32 vectors to uint16 vectors (with saturation) + const auto v_low_s16 = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1)); + const auto v_high_s16 = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 3)); + + // convert uint16 vectors to uint8 vectors (with saturation) + tmp = svqxtnt_s16(svqxtnb_s16(v_low_s16), v_high_s16); + } + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + + svst1_s8(pg, output_ptr + x, tmp); + + x += svcntb(); + pg = svwhilelt_b8(x, window_end_x); + + } + while(svptest_any(svptrue_b8(), pg)); + }, + input, output); +} +} // namespace cpu +} // namespace arm_compute +#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ diff --git a/src/cpu/kernels/activation/sve/qsymm16.cpp b/src/cpu/kernels/activation/sve/qsymm16.cpp new file mode 100644 index 0000000000..ac549770a2 --- /dev/null +++ b/src/cpu/kernels/activation/sve/qsymm16.cpp @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensorPack.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/core/experimental/Types.h" + +#include <cmath> +#include <cstddef> + +#if defined(ARM_COMPUTE_ENABLE_SVE2) +#include "src/core/NEON/SVEMath.h" +#include "src/core/NEON/SVESymm.h" +#include <arm_sve.h> + +namespace arm_compute +{ +namespace cpu +{ +void qsymm16_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) +{ + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const ActivationLayerInfo::ActivationFunction act = act_info.activation(); + + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + + const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform(); + const auto vconst_1 = svdup_n_f32(1.f); + const auto va_f32 = svdup_n_f32(act_info.a()); + const auto vb_f32 = svdup_n_f32(act_info.b()); + + execute_window_loop(win_collapsed, [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast<const int16_t *>(input.ptr()); + const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr()); + + svint16_t tmp; + + int x = window_start_x; + svbool_t pg = svwhilelt_b16(x, window_end_x); + do + { + const auto vin = svld1_s16(pg, input_ptr + x); + if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + // De-quantize + auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale); + // Perform activation + const svfloat32x2_t tmp_dep = + { + { { + svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget2_f32(vin_deq, 0))))), + svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget2_f32(vin_deq, 1))))), + } + } + }; + // Re-quantize to new output space + tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale); + } + else if(act == ActivationLayerInfo::ActivationFunction::TANH) + { + // De-quantize + auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale); + // Perform activation + const svfloat32x2_t tmp_dep = + { + { { + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget2_f32(vin_deq, 0), vb_f32))), + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget2_f32(vin_deq, 1), vb_f32))), + } + } + }; + // Re-quantize to new output space + tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale); + } + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + + svst1_s16(pg, output_ptr + x, tmp); + + x += svcnth(); + pg = svwhilelt_b16(x, window_end_x); + + } + while(svptest_any(svptrue_b16(), pg)); + }, + input, output); +} +} // namespace cpu +} // namespace arm_compute +#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ diff --git a/src/cpu/kernels/add/neon/list.h b/src/cpu/kernels/add/neon/list.h new file mode 100644 index 0000000000..379bd32fb1 --- /dev/null +++ b/src/cpu/kernels/add/neon/list.h @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_NEON_KERNELS_ADD_LIST_H +#define SRC_CORE_NEON_KERNELS_ADD_LIST_H + +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" +#include "src/core/NEON/wrapper/wrapper.h" + +namespace arm_compute +{ +namespace cpu +{ +#define DECLARE_ADD_KERNEL(func_name) \ + void func_name(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) + +DECLARE_ADD_KERNEL(add_qasymm8_neon); +DECLARE_ADD_KERNEL(add_qasymm8_signed_neon); +DECLARE_ADD_KERNEL(add_qsymm16_neon); + +#undef DECLARE_ADD_KERNEL + +template <typename ScalarType> +void add_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +{ + /** SIMD vector tag type. */ + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<ScalarType, wrapper::traits::BitWidth::W128>; + + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + constexpr int window_step_x = 16 / sizeof(ScalarType); + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); + + if(is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0; + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(dst, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr()); + + const ScalarType broadcast_value = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr()); + const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); + + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); + const auto res = (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(broadcast_value_vec, non_broadcast_v) : wrapper::vadd(broadcast_value_vec, non_broadcast_v); + wrapper::vstore(output_ptr + x, res); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const auto non_broadcast_v = *(non_broadcast_input_ptr + x); + *(output_ptr + x) = (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(broadcast_value, non_broadcast_v) : broadcast_value + non_broadcast_v; + } + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src0, input1_win); + Iterator input2(src1, input2_win); + Iterator output(dst, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto val1 = wrapper::vloadq(input1_ptr + x); + const auto val2 = wrapper::vloadq(input2_ptr + x); + const auto res = (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(val1, val2) : wrapper::vadd(val1, val2); + wrapper::vstore(output_ptr + x, res); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const auto val1 = *(input1_ptr + x); + const auto val2 = *(input2_ptr + x); + *(output_ptr + x) = (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(val1, val2) : val1 + val2; + } + }, + input1, input2, output); + } +} +} // namespace cpu +} // namespace arm_compute +#endif // SRC_CORE_NEON_KERNELS_ADD_LIST_H diff --git a/src/cpu/kernels/add/neon/qasymm8.cpp b/src/cpu/kernels/add/neon/qasymm8.cpp new file mode 100644 index 0000000000..e357a7ef7f --- /dev/null +++ b/src/cpu/kernels/add/neon/qasymm8.cpp @@ -0,0 +1,209 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" +#include "src/core/helpers/WindowHelpers.h" + +namespace arm_compute +{ +namespace cpu +{ +void add_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +{ + ARM_COMPUTE_UNUSED(policy); + + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); + + const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform(); + const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); + + const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale); + const float32x4_t voffseto = vdupq_n_f32(oq_info.offset); + + if(is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0; + const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); + const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); + + const float32x4_t vscale1 = is_broadcast_input_2 ? vdupq_n_f32(iq1_info.scale) : vdupq_n_f32(iq2_info.scale); + const float32x4_t vscale2 = is_broadcast_input_2 ? vdupq_n_f32(iq2_info.scale) : vdupq_n_f32(iq1_info.scale); + const int32x4_t voffset1 = is_broadcast_input_2 ? vdupq_n_s32(iq1_info.offset) : vdupq_n_s32(iq2_info.offset); + const int32x4_t voffset2 = is_broadcast_input_2 ? vdupq_n_s32(iq2_info.offset) : vdupq_n_s32(iq1_info.offset); + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(dst, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); + + const uint8_t broadcast_value = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr()); + const uint8x16_t broadcast_value_vec = vdupq_n_u8(broadcast_value); + + const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(broadcast_value_vec))))), voffset2)), vscale2); + const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(broadcast_value_vec))))), voffset2)), vscale2); + const auto bf_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(broadcast_value_vec))))), voffset2)), vscale2); + const auto bf_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(broadcast_value_vec))))), voffset2)), vscale2); + + const float bfs = static_cast<int32_t>(broadcast_value - broadcast_qinfo.offset) * broadcast_qinfo.scale; + + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t a = vld1q_u8(non_broadcast_input_ptr + x); + const auto af_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1); + const auto af_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1); + const auto af_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1); + const auto af_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1); + + int32x4_t rf_0{}; + int32x4_t rf_1{}; + int32x4_t rf_2{}; + int32x4_t rf_3{}; + +#ifdef __aarch64__ + rf_0 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo)); + rf_1 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo)); + rf_2 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo)); + rf_3 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo)); +#else //__aarch64__ + rf_0 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo)); + rf_1 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo)); + rf_2 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo)); + rf_3 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo)); +#endif //__aarch64__ + + const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1))); + const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3))); + vst1q_u8(output_ptr + x, vcombine_u8(pa, pb)); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x) - non_broadcast_qinfo.offset) * non_broadcast_qinfo.scale; + *(output_ptr + x) = quantize_qasymm8((afs + bfs), oq_info); + } + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src0, input1_win); + Iterator input2(src1, input2_win); + Iterator output(dst, win); + + const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale); + const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale); + const int32x4_t voffset1 = vdupq_n_s32(iq1_info.offset); + const int32x4_t voffset2 = vdupq_n_s32(iq2_info.offset); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const uint8x16_t a = vld1q_u8(input1_ptr + x); + const uint8x16_t b = vld1q_u8(input2_ptr + x); + + const auto af_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1); + const auto af_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1); + const auto af_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1); + const auto af_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1); + + const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(b))))), voffset2)), vscale2); + const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(b))))), voffset2)), vscale2); + const auto bf_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(b))))), voffset2)), vscale2); + const auto bf_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(b))))), voffset2)), vscale2); + + int32x4_t rf_0{}; + int32x4_t rf_1{}; + int32x4_t rf_2{}; + int32x4_t rf_3{}; + +#ifdef __aarch64__ + rf_0 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo)); + rf_1 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo)); + rf_2 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo)); + rf_3 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo)); +#else //__aarch64__ + rf_0 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo)); + rf_1 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo)); + rf_2 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo)); + rf_3 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo)); +#endif //__aarch64__ + + const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1))); + const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3))); + vst1q_u8(output_ptr + x, vcombine_u8(pa, pb)); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const float afs = static_cast<int32_t>((*(input1_ptr + x)) - iq1_info.offset) * iq1_info.scale; + const float bfs = static_cast<int32_t>((*(input2_ptr + x)) - iq2_info.offset) * iq2_info.scale; + *(output_ptr + x) = quantize_qasymm8((afs + bfs), oq_info); + } + }, + input1, input2, output); + } +} +} // namespace cpu +} // namespace arm_compute
\ No newline at end of file diff --git a/src/cpu/kernels/add/neon/qasymm8_signed.cpp b/src/cpu/kernels/add/neon/qasymm8_signed.cpp new file mode 100644 index 0000000000..d62d0739f5 --- /dev/null +++ b/src/cpu/kernels/add/neon/qasymm8_signed.cpp @@ -0,0 +1,208 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" +#include "src/core/helpers/WindowHelpers.h" + +namespace arm_compute +{ +namespace cpu +{ +void add_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +{ + ARM_COMPUTE_UNUSED(policy); + + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); + + const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform(); + const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); + + const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale); + const float32x4_t voffseto = vdupq_n_f32(oq_info.offset); + + if(is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0; + const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); + const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); + + const float32x4_t vscale1 = is_broadcast_input_2 ? vdupq_n_f32(iq1_info.scale) : vdupq_n_f32(iq2_info.scale); + const float32x4_t vscale2 = is_broadcast_input_2 ? vdupq_n_f32(iq2_info.scale) : vdupq_n_f32(iq1_info.scale); + const int32x4_t voffset1 = is_broadcast_input_2 ? vdupq_n_s32(iq1_info.offset) : vdupq_n_s32(iq2_info.offset); + const int32x4_t voffset2 = is_broadcast_input_2 ? vdupq_n_s32(iq2_info.offset) : vdupq_n_s32(iq1_info.offset); + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(dst, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); + + const int8_t broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr()); + const int8x16_t broadcast_value_vec = vdupq_n_s8(broadcast_value); + + const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(broadcast_value_vec)))), voffset2)), vscale2); + const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(broadcast_value_vec)))), voffset2)), vscale2); + const auto bf_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(broadcast_value_vec)))), voffset2)), vscale2); + const auto bf_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(broadcast_value_vec)))), voffset2)), vscale2); + const float bfs = static_cast<int32_t>(broadcast_value - broadcast_qinfo.offset) * broadcast_qinfo.scale; + + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int8x16_t a = vld1q_s8(non_broadcast_input_ptr + x); + + const auto af_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(a)))), voffset1)), vscale1); + const auto af_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(a)))), voffset1)), vscale1); + const auto af_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(a)))), voffset1)), vscale1); + const auto af_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(a)))), voffset1)), vscale1); + + int32x4_t rf_0{}; + int32x4_t rf_1{}; + int32x4_t rf_2{}; + int32x4_t rf_3{}; + +#ifdef __aarch64__ + rf_0 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo)); + rf_1 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo)); + rf_2 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo)); + rf_3 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo)); +#else //__aarch64__ + rf_0 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo)); + rf_1 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo)); + rf_2 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo)); + rf_3 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo)); +#endif //__aarch64__ + + const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1))); + const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3))); + vst1q_s8(output_ptr + x, vcombine_s8(pa, pb)); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x) - non_broadcast_qinfo.offset) * non_broadcast_qinfo.scale; + *(output_ptr + x) = quantize_qasymm8_signed((afs + bfs), oq_info); + } + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src0, input1_win); + Iterator input2(src1, input2_win); + Iterator output(dst, win); + + const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale); + const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale); + const int32x4_t voffset1 = vdupq_n_s32(iq1_info.offset); + const int32x4_t voffset2 = vdupq_n_s32(iq2_info.offset); + execute_window_loop(win, [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int8x16_t a = vld1q_s8(input1_ptr + x); + const int8x16_t b = vld1q_s8(input2_ptr + x); + + const auto af_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(a)))), voffset1)), vscale1); + const auto af_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(a)))), voffset1)), vscale1); + const auto af_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(a)))), voffset1)), vscale1); + const auto af_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(a)))), voffset1)), vscale1); + + const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(b)))), voffset2)), vscale2); + const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(b)))), voffset2)), vscale2); + const auto bf_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(b)))), voffset2)), vscale2); + const auto bf_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(b)))), voffset2)), vscale2); + + int32x4_t rf_0{}; + int32x4_t rf_1{}; + int32x4_t rf_2{}; + int32x4_t rf_3{}; + +#ifdef __aarch64__ + rf_0 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo)); + rf_1 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo)); + rf_2 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo)); + rf_3 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo)); +#else //__aarch64__ + rf_0 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo)); + rf_1 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo)); + rf_2 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo)); + rf_3 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo)); +#endif //__aarch64__ + + const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1))); + const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3))); + vst1q_s8(output_ptr + x, vcombine_s8(pa, pb)); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const float afs = static_cast<int32_t>((*(input1_ptr + x)) - iq1_info.offset) * iq1_info.scale; + const float bfs = static_cast<int32_t>((*(input2_ptr + x)) - iq2_info.offset) * iq2_info.scale; + *(output_ptr + x) = quantize_qasymm8_signed((afs + bfs), dst->info()->quantization_info()); + } + }, + input1, input2, output); + } +} +} // namespace cpu +} // namespace arm_compute
\ No newline at end of file diff --git a/src/cpu/kernels/add/neon/qsymm16.cpp b/src/cpu/kernels/add/neon/qsymm16.cpp new file mode 100644 index 0000000000..e76e408d6e --- /dev/null +++ b/src/cpu/kernels/add/neon/qsymm16.cpp @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" +#include "src/core/helpers/WindowHelpers.h" + +namespace arm_compute +{ +namespace cpu +{ +void add_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +{ + ARM_COMPUTE_UNUSED(policy); + + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const int window_step_x = 8; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); + + const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform(); + const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); + + const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale); + const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale); + const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale); + + if(is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0; + const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); + const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(dst, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast<const int16_t *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr()); + + const int16_t broadcast_value = *reinterpret_cast<const int16_t *>(broadcast_input.ptr()); + const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value); + + const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale2); + const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale2); + const float bfs = static_cast<int32_t>(broadcast_value) * broadcast_qinfo.scale; + + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int16x8_t a = vld1q_s16(non_broadcast_input_ptr + x); + const auto af_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1); + const auto af_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1); + + int32x4_t rf_0{}; + int32x4_t rf_1{}; +#ifdef __aarch64__ + rf_0 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo)); + rf_1 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo)); +#else //__aarch64__ + rf_0 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo)); + rf_1 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo)); +#endif //__aarch64__ + + const int16x8_t pa = vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)); + vst1q_s16(output_ptr + x, pa); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.scale; + *(output_ptr + x) = quantize_qsymm16((afs + bfs), oq_info); + } + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src0, input1_win); + Iterator input2(src1, input2_win); + Iterator output(dst, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int16x8_t a = vld1q_s16(input1_ptr + x); + const int16x8_t b = vld1q_s16(input2_ptr + x); + + const auto af_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1); + const auto af_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1); + const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(b))), vscale2); + const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(b))), vscale2); + + int32x4_t rf_0{}; + int32x4_t rf_1{}; +#ifdef __aarch64__ + rf_0 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo)); + rf_1 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo)); +#else //__aarch64__ + rf_0 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo)); + rf_1 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo)); +#endif //__aarch64__ + + const int16x8_t pa = vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)); + vst1q_s16(output_ptr + x, pa); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const float afs = static_cast<int32_t>((*(input1_ptr + x))) * iq1_info.scale; + const float bfs = static_cast<int32_t>((*(input2_ptr + x))) * iq2_info.scale; + *(output_ptr + x) = quantize_qsymm16((afs + bfs), dst->info()->quantization_info()); + } + }, + input1, input2, output); + } +} +} // namespace cpu +} // namespace arm_compute
\ No newline at end of file diff --git a/src/cpu/kernels/add/sve/impl.cpp b/src/cpu/kernels/add/sve/impl.cpp new file mode 100644 index 0000000000..f8e16a508c --- /dev/null +++ b/src/cpu/kernels/add/sve/impl.cpp @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(__ARM_FEATURE_SVE) +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + +#include "src/core/NEON/SVEMath.h" +#include "src/cpu/kernels/add/sve/impl.h" +#include <arm_sve.h> + +namespace arm_compute +{ +namespace cpu +{ +template <typename ScalarType> +void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +{ + const auto all_true_pg = wrapper::svptrue<ScalarType>(); + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); + const bool is_sat = (policy == ConvertPolicy::SATURATE); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + + Iterator input1(src0, window.broadcast_if_dimension_le_one(src0->info()->tensor_shape())); + Iterator input2(src1, window.broadcast_if_dimension_le_one(src1->info()->tensor_shape())); + Iterator output(dst, window); + + if(is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0; + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(dst, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr()); + + const ScalarType broadcast_value = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr()); + const auto broadcast_value_vec = wrapper::svdup_n(broadcast_value); + + int x = window_start_x; + svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); + do + { + const auto non_broadcast_v = svld1(pg, non_broadcast_input_ptr + x); + auto res = is_sat ? wrapper::svqadd(broadcast_value_vec, non_broadcast_v) : svadd_z(pg, broadcast_value_vec, non_broadcast_v); + svst1(pg, output_ptr + x, res); + + x += wrapper::svcnt<ScalarType>(); + pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); + } + while(svptest_any(all_true_pg, pg)); + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src0, input1_win); + Iterator input2(src1, input2_win); + Iterator output(dst, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr()); + + int x = window_start_x; + svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); + do + { + const auto val1 = svld1(pg, input1_ptr + x); + const auto val2 = svld1(pg, input2_ptr + x); + const auto res = is_sat ? wrapper::svqadd(val1, val2) : svadd_z(pg, val1, val2); + svst1(pg, output_ptr + x, res); + + x += wrapper::svcnt<ScalarType>(); + pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); + } + while(svptest_any(all_true_pg, pg)); + }, + input1, input2, output); + } +} + +template void add_same_sve<float>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); +template void add_same_sve<float16_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); +template void add_same_sve<uint8_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); +template void add_same_sve<int16_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); +template void add_same_sve<int32_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); +} // namespace cpu +} // namespace arm_compute +#endif /* defined(__ARM_FEATURE_SVE) */
\ No newline at end of file diff --git a/src/cpu/kernels/add/sve/impl.h b/src/cpu/kernels/add/sve/impl.h new file mode 100644 index 0000000000..32ff5d0496 --- /dev/null +++ b/src/cpu/kernels/add/sve/impl.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_SVE_KERNELS_ADD_IMPL_H +#define SRC_CORE_SVE_KERNELS_ADD_IMPL_H + +#if defined(ARM_COMPUTE_ENABLE_SVE) +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" + +namespace arm_compute +{ +namespace cpu +{ +template <typename ScalarType> +void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); +} // namespace cpu +} // namespace arm_compute +#endif // defined(ARM_COMPUTE_ENABLE_SVE) +#endif // SRC_CORE_SVE_KERNELS_ADD_IMPL_H
\ No newline at end of file diff --git a/src/cpu/kernels/add/sve/list.h b/src/cpu/kernels/add/sve/list.h new file mode 100644 index 0000000000..4529a9f7c1 --- /dev/null +++ b/src/cpu/kernels/add/sve/list.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_SVE_KERNELS_ADD_LIST_H +#define SRC_CORE_SVE_KERNELS_ADD_LIST_H + +#if defined(ARM_COMPUTE_ENABLE_SVE) +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" +#include "src/core/NEON/SVEMath.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" +#include "src/cpu/kernels/add/sve/impl.h" +#include <arm_sve.h> + +namespace arm_compute +{ +namespace cpu +{ +#define DECLARE_ADD_KERNEL(func_name) \ + void func_name(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) + +DECLARE_ADD_KERNEL(add_qasymm8_sve); +DECLARE_ADD_KERNEL(add_qasymm8_signed_sve); +DECLARE_ADD_KERNEL(add_qsymm16_sve); + +#undef DECLARE_ADD_KERNEL + +} // namespace cpu +} // namespace arm_compute +#endif // defined(ARM_COMPUTE_ENABLE_SVE) +#endif // SRC_CORE_SVE_KERNELS_ADD_LIST_H
\ No newline at end of file diff --git a/src/cpu/kernels/add/sve/qasymm8.cpp b/src/cpu/kernels/add/sve/qasymm8.cpp new file mode 100644 index 0000000000..888ad878ca --- /dev/null +++ b/src/cpu/kernels/add/sve/qasymm8.cpp @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(ARM_COMPUTE_ENABLE_SVE2) +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" +#include "src/core/NEON/SVEMath.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" +#include <arm_sve.h> + +namespace arm_compute +{ +namespace cpu +{ +void add_qasymm8_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +{ + ARM_COMPUTE_UNUSED(policy); + + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); + const auto all_true_pg = svptrue_b8(); + + const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform(); + const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); + + const auto invvscaleo = svdup_n_f32(1.f / oq_info.scale); + const auto voffseto = svdup_n_f32(oq_info.offset); + + if(is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0; + + const svfloat32_t vscale1 = is_broadcast_input_2 ? svdup_n_f32(iq1_info.scale) : svdup_n_f32(iq2_info.scale); + const svfloat32_t vscale2 = is_broadcast_input_2 ? svdup_n_f32(iq2_info.scale) : svdup_n_f32(iq1_info.scale); + const svint32_t voffset1 = is_broadcast_input_2 ? svdup_n_s32(iq1_info.offset) : svdup_n_s32(iq2_info.offset); + const svint32_t voffset2 = is_broadcast_input_2 ? svdup_n_s32(iq2_info.offset) : svdup_n_s32(iq1_info.offset); + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(dst, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); + + const uint8_t broadcast_value = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr()); + const svuint8_t broadcast_value_vec = svdup_n_u8(broadcast_value); + + int x = window_start_x; + svbool_t pg = svwhilelt_b8(x, window_end_x); + + const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(broadcast_value_vec))), voffset2)), vscale2); + const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(broadcast_value_vec))), voffset2)), vscale2); + const auto bf_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(broadcast_value_vec))), voffset2)), vscale2); + const auto bf_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(broadcast_value_vec))), voffset2)), vscale2); + + do + { + const svuint8_t a = svld1_u8(pg, non_broadcast_input_ptr + x); + + const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(a))), voffset1)), vscale1); + const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(a))), voffset1)), vscale1); + const auto af_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(a))), voffset1)), vscale1); + const auto af_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(a))), voffset1)), vscale1); + + const auto rf_0 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); + const auto rf_1 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); + const auto rf_2 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo)); + const auto rf_3 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo)); + + const auto pa = svqxtnt_u32(svqxtnb_u32(rf_0), rf_1); + const auto pb = svqxtnt_u32(svqxtnb_u32(rf_2), rf_3); + + const auto res = svqxtnt_u16(svqxtnb_u16(pa), pb); + svst1_u8(pg, output_ptr + x, res); + + x += svcntb(); + pg = svwhilelt_b8(x, window_end_x); + } + while(svptest_any(all_true_pg, pg)); + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src0, input1_win); + Iterator input2(src1, input2_win); + Iterator output(dst, win); + + const auto vscale1 = svdup_n_f32(iq1_info.scale); + const auto vscale2 = svdup_n_f32(iq2_info.scale); + const auto voffset1 = svdup_n_s32(iq1_info.offset); + const auto voffset2 = svdup_n_s32(iq2_info.offset); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); + + int x = window_start_x; + svbool_t pg = svwhilelt_b8(x, window_end_x); + do + { + const auto a = svld1_u8(pg, input1_ptr + x); + const auto b = svld1_u8(pg, input2_ptr + x); + const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(a))), voffset1)), vscale1); + const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(a))), voffset1)), vscale1); + const auto af_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(a))), voffset1)), vscale1); + const auto af_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(a))), voffset1)), vscale1); + + const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(b))), voffset2)), vscale2); + const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(b))), voffset2)), vscale2); + const auto bf_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(b))), voffset2)), vscale2); + const auto bf_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(b))), voffset2)), vscale2); + + const auto rf_0 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); + const auto rf_1 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); + const auto rf_2 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo)); + const auto rf_3 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo)); + + const auto pa = svqxtnt_u32(svqxtnb_u32(rf_0), rf_1); + const auto pb = svqxtnt_u32(svqxtnb_u32(rf_2), rf_3); + const auto res = svqxtnt_u16(svqxtnb_u16(pa), pb); + + svst1_u8(pg, output_ptr + x, res); + + x += svcntb(); + pg = svwhilelt_b8(x, window_end_x); + } + while(svptest_any(all_true_pg, pg)); + }, + input1, input2, output); + } +} +} // namespace cpu +} // namespace arm_compute +#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */
\ No newline at end of file diff --git a/src/cpu/kernels/add/sve/qasymm8_signed.cpp b/src/cpu/kernels/add/sve/qasymm8_signed.cpp new file mode 100644 index 0000000000..3b922c6c21 --- /dev/null +++ b/src/cpu/kernels/add/sve/qasymm8_signed.cpp @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(ARM_COMPUTE_ENABLE_SVE2) +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" +#include "src/core/NEON/SVEMath.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" +#include <arm_sve.h> + +namespace arm_compute +{ +namespace cpu +{ +void add_qasymm8_signed_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +{ + ARM_COMPUTE_UNUSED(policy); + + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); + + const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform(); + const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); + + const auto invvscaleo = svdup_n_f32(1.f / oq_info.scale); + const auto voffseto = svdup_n_f32(oq_info.offset); + + if(is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0; + const auto all_true_pg = svptrue_b8(); + + const auto vscale1 = is_broadcast_input_2 ? svdup_n_f32(iq1_info.scale) : svdup_n_f32(iq2_info.scale); + const auto vscale2 = is_broadcast_input_2 ? svdup_n_f32(iq2_info.scale) : svdup_n_f32(iq1_info.scale); + const auto voffset1 = is_broadcast_input_2 ? svdup_n_s32(iq1_info.offset) : svdup_n_s32(iq2_info.offset); + const auto voffset2 = is_broadcast_input_2 ? svdup_n_s32(iq2_info.offset) : svdup_n_s32(iq1_info.offset); + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(dst, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); + + const int8_t broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr()); + const auto broadcast_value_vec = svdup_n_s8(broadcast_value); + + int x = window_start_x; + svbool_t pg = svwhilelt_b8(x, window_end_x); + const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(broadcast_value_vec)), voffset2)), vscale2); + const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(broadcast_value_vec)), voffset2)), vscale2); + const auto bf_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(broadcast_value_vec)), voffset2)), vscale2); + const auto bf_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(broadcast_value_vec)), voffset2)), vscale2); + + do + { + const auto a = svld1_s8(pg, non_broadcast_input_ptr + x); + const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(a)), voffset1)), vscale1); + const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(a)), voffset1)), vscale1); + const auto af_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(a)), voffset1)), vscale1); + const auto af_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(a)), voffset1)), vscale1); + + const auto rf_0 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); + const auto rf_1 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); + const auto rf_2 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo)); + const auto rf_3 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo)); + + const auto pa = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1); + const auto pb = svqxtnt_s32(svqxtnb_s32(rf_2), rf_3); + const auto res = svqxtnt_s16(svqxtnb_s16(pa), pb); + + svst1_s8(pg, output_ptr + x, res); + + x += svcntb(); + pg = svwhilelt_b8(x, window_end_x); + } + while(svptest_any(all_true_pg, pg)); + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src0, input1_win); + Iterator input2(src1, input2_win); + Iterator output(dst, win); + + const auto vscale1 = svdup_n_f32(iq1_info.scale); + const auto vscale2 = svdup_n_f32(iq2_info.scale); + const auto voffset1 = svdup_n_s32(iq1_info.offset); + const auto voffset2 = svdup_n_s32(iq2_info.offset); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); + + int x = window_start_x; + svbool_t pg = svwhilelt_b8(x, window_end_x); + do + { + const auto a = svld1_s8(pg, input1_ptr + x); + const auto b = svld1_s8(pg, input2_ptr + x); + + const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(a)), voffset1)), vscale1); + const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(a)), voffset1)), vscale1); + const auto af_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(a)), voffset1)), vscale1); + const auto af_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(a)), voffset1)), vscale1); + + const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(b)), voffset2)), vscale2); + const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(b)), voffset2)), vscale2); + const auto bf_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(b)), voffset2)), vscale2); + const auto bf_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(b)), voffset2)), vscale2); + + const auto rf_0 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); + const auto rf_1 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); + const auto rf_2 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo)); + const auto rf_3 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo)); + + const auto pa = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1); + const auto pb = svqxtnt_s32(svqxtnb_s32(rf_2), rf_3); + const auto res = svqxtnt_s16(svqxtnb_s16(pa), pb); + + svst1_s8(pg, output_ptr + x, res); + + x += svcntb(); + pg = svwhilelt_b8(x, window_end_x); + } + while(svptest_any(svptrue_b8(), pg)); + }, + input1, input2, output); + } +} +} // namespace cpu +} // namespace arm_compute +#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */
\ No newline at end of file diff --git a/src/cpu/kernels/add/sve/qsymm16.cpp b/src/cpu/kernels/add/sve/qsymm16.cpp new file mode 100644 index 0000000000..eef5d245d3 --- /dev/null +++ b/src/cpu/kernels/add/sve/qsymm16.cpp @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(ARM_COMPUTE_ENABLE_SVE2) +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" +#include "src/core/NEON/SVEMath.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" +#include <arm_sve.h> + +namespace arm_compute +{ +namespace cpu +{ +void add_qsymm16_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +{ + ARM_COMPUTE_UNUSED(policy); + + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); + + const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform(); + const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); + + const auto vscale1 = svdup_n_f32(iq1_info.scale); + const auto vscale2 = svdup_n_f32(iq2_info.scale); + const auto invvscaleo = svdup_n_f32(1.f / oq_info.scale); + const auto all_true_pg = svptrue_b16(); + + if(is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0; + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(dst, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast<const int16_t *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr()); + + const int16_t broadcast_value = *reinterpret_cast<const int16_t *>(broadcast_input.ptr()); + const auto broadcast_value_vec = svdup_n_s16(broadcast_value); + + int x = window_start_x; + svbool_t pg = svwhilelt_b16(x, window_end_x); + + const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(broadcast_value_vec)), vscale2); + const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(broadcast_value_vec)), vscale2); + + do + { + const auto a = svld1_s16(pg, non_broadcast_input_ptr + x); + const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(a)), vscale1); + const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(a)), vscale1); + + const auto rf_0 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); + const auto rf_1 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); + + const auto res = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1); + + svst1_s16(pg, output_ptr + x, res); + + x += svcnth(); + pg = svwhilelt_b16(x, window_end_x); + } + while(svptest_any(all_true_pg, pg)); + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src0, input1_win); + Iterator input2(src1, input2_win); + Iterator output(dst, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr()); + + int x = window_start_x; + svbool_t pg = svwhilelt_b16(x, window_end_x); + do + { + auto a = svld1_s16(pg, input1_ptr + x); + auto b = svld1_s16(pg, input2_ptr + x); + + const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(a)), vscale1); + const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(a)), vscale1); + + const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(b)), vscale2); + const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(b)), vscale2); + + const auto rf_0 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_0, bf_0), invvscaleo)); + const auto rf_1 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_1, bf_1), invvscaleo)); + + const auto res = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1); + svst1_s16(pg, output_ptr + x, res); + + x += svcnth(); + pg = svwhilelt_b16(x, window_end_x); + } + while(svptest_any(all_true_pg, pg)); + }, + input1, input2, output); + } +} +} // namespace cpu +} // namespace arm_compute +#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */
\ No newline at end of file diff --git a/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h b/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h new file mode 100644 index 0000000000..3b9a6b4760 --- /dev/null +++ b/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_ASSEMBLY_GEMM_KERNEL_WRAPPER_KERNEL_H +#define ARM_COMPUTE_ASSEMBLY_GEMM_KERNEL_WRAPPER_KERNEL_H + +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "src/core/NEON/INEKernel.h" +#include "src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp" + +#include "gemm_common.hpp" + +namespace arm_compute +{ +class ITensor; + +namespace cpu +{ +namespace kernel +{ +/** This class is a wrapper for the assembly kernels. + * + * Some kernels were written in assembly and highly optimised for specific CPUs like A53 or A55. + * This class works as a wrapper for these assembly kernels. The arm compute library creates an instance + * of CpuGemmAssemblyWrapperKernel and other auxiliary data structures to execute a single assembly kernel + * in the context of an NEFunctions. + * + * The type T is the type of the actual kernel implemented in assembly which is of type + * template<typename To, typename Tr> class GemmCommon + * + * + */ +template <typename TypeInput, typename TypeOutput> +class CpuGemmAssemblyWrapperKernel final : public INEKernel +{ +public: + /** Constructor + */ + CpuGemmAssemblyWrapperKernel() + : _kernel(nullptr), _name("CpuGemmAssemblyWrapperKernel") + { + } + + CpuGemmAssemblyWrapperKernel(CpuGemmAssemblyWrapperKernel &) = delete; + CpuGemmAssemblyWrapperKernel(CpuGemmAssemblyWrapperKernel &&) = default; + CpuGemmAssemblyWrapperKernel &operator=(CpuGemmAssemblyWrapperKernel &) = delete; + + const char *name() const override + { + return _name.c_str(); + } + + void run(const Window &window, const ThreadInfo &info) override + { + ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(_kernel))); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + + auto win = arm_gemm::to_ndcoord(window); + + arm_gemm::ndcoord_t thread_locator{}; + + _kernel->execute(win, thread_locator, info.thread_id); + } + + // Inherited methods overridden: + void run_nd(const Window &window, const ThreadInfo &info, const Window &thread_locator) override + { + ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(_kernel))); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + + //convert between arm_compute and arm_gemm types + auto ndc_win = arm_gemm::to_ndcoord(window); + auto ndc_tlc = arm_gemm::to_ndcoord(thread_locator); + + _kernel->execute(ndc_win, ndc_tlc, info.thread_id); + } + + /** Initialise the kernel's input and output. + * + * @param[in] kernel Pointer to an assembly kernel implementation. + * @param[in] kernel_name_tag Tag to be attacehd to the kernel's name. + */ + void configure(arm_gemm::GemmCommon<TypeInput, TypeOutput> *kernel, std::string kernel_name_tag) + { + ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(kernel))); + _kernel = kernel; + + Window win = to_window(kernel->get_window_size()); + + INEKernel::configure(win); + + if(!kernel_name_tag.empty()) + { + _name += "/" + kernel_name_tag; + } + } + +private: + arm_gemm::GemmCommon<TypeInput, TypeOutput> *_kernel; + std::string _name; +}; +} // namespace kernel +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_ASSEMBLY_GEMM_KERNEL_WRAPPER_KERNEL_H */ diff --git a/src/cpu/kernels/assembly/arm_gemm.hpp b/src/cpu/kernels/assembly/arm_gemm.hpp new file mode 100644 index 0000000000..e38cc09202 --- /dev/null +++ b/src/cpu/kernels/assembly/arm_gemm.hpp @@ -0,0 +1,190 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#include <cstring> +#include <memory> +#include <vector> + +#include "arm_gemm_local.hpp" +#include "gemm_common.hpp" + +namespace arm_gemm +{ +enum class GemmMethod +{ + DEFAULT, + GEMV_BATCHED, + GEMV_PRETRANSPOSED, + GEMV_NATIVE_TRANSPOSED, + GEMM_NATIVE, + GEMM_HYBRID, + GEMM_INTERLEAVED, + GEMM_INTERLEAVED_2D, + QUANTIZE_WRAPPER, + QUANTIZE_WRAPPER_2D, + GEMM_HYBRID_QUANTIZED +}; + +struct KernelDescription +{ + GemmMethod method = GemmMethod::DEFAULT; + std::string name = ""; + bool is_default = false; + uint64_t cycle_estimate = 0; + + KernelDescription(GemmMethod m, std::string n, bool d = false, uint64_t c = 0) + : method(m), name(n), is_default(d), cycle_estimate(c) + { + } + KernelDescription() noexcept + { + } +}; + +struct GemmConfig +{ + GemmMethod method = GemmMethod::DEFAULT; + std::string filter = ""; + unsigned int inner_block_size = 0; + unsigned int outer_block_size = 0; + + GemmConfig(GemmMethod method) + : method(method) + { + } + GemmConfig() + { + } +}; + +struct Activation +{ + enum class Type + { + None, + ReLU, + BoundedReLU + }; + + Type type; + float param1; + float param2; + + Activation(Type type = Type::None, float p1 = 0.0f, float p2 = 0.0f) + : type(type), param1(p1), param2(p2) + { + } +}; + +struct GemmArgs +{ +public: + const CPUInfo *_ci; + unsigned int _Msize; + unsigned int _Nsize; + unsigned int _Ksize; + unsigned int _Ksections; + unsigned int _nbatches; + unsigned int _nmulti; + bool _indirect_input; + Activation _act; + int _maxthreads; + bool _fast_mode; + const GemmConfig *_cfg; + + GemmArgs(const CPUInfo *ci, unsigned int M, unsigned int N, + unsigned int K, unsigned int Ksections, unsigned int nbatches, + unsigned int nmulti, bool indirect_input, Activation act, const int maxthreads, + bool fast_mode = false, const GemmConfig *cfg = nullptr) + : _ci(ci), _Msize(M), _Nsize(N), _Ksize(K), _Ksections(Ksections), _nbatches(nbatches), _nmulti(nmulti), _indirect_input(indirect_input), _act(act), _maxthreads(maxthreads), _fast_mode(fast_mode), + _cfg(cfg) + { + } +}; + +struct Requantize32 +{ +public: + const int32_t *bias = nullptr; + size_t bias_multi_stride = 0; + int32_t a_offset = 0; + int32_t b_offset = 0; + int32_t c_offset = 0; + bool per_channel_requant = false; + int32_t per_layer_left_shift = 0; + int32_t per_layer_right_shift = 0; + int32_t per_layer_mul = 0; + const int32_t *per_channel_left_shifts = nullptr; + const int32_t *per_channel_right_shifts = nullptr; + const int32_t *per_channel_muls = nullptr; + int32_t minval = 0; + int32_t maxval = 0; + + Requantize32() = default; + + // Constructor for per-tensor quantization + Requantize32(const int32_t *bias, size_t bias_multi_stride, + int32_t a_offset, int32_t b_offset, int32_t c_offset, + int32_t requant_shift, int32_t requant_mul, int32_t minv, int32_t maxv) + : bias(bias), bias_multi_stride(bias_multi_stride), a_offset(a_offset), b_offset(b_offset), c_offset(c_offset), per_channel_requant(false), per_layer_left_shift(std::max<int32_t>(requant_shift, 0)), + per_layer_right_shift(std::min<int32_t>(requant_shift, 0)), per_layer_mul(requant_mul), minval(minv), maxval(maxv) + { + } + + // Constructor for per-channel quantization + Requantize32(const int32_t *bias, size_t bias_multi_stride, + int32_t a_offset, int32_t b_offset, int32_t c_offset, + const int32_t *requant_left_shifts, + const int32_t *requant_right_shifts, + const int32_t *requant_muls, + int32_t minv, int32_t maxv) + : bias(bias), bias_multi_stride(bias_multi_stride), a_offset(a_offset), b_offset(b_offset), c_offset(c_offset), per_channel_requant(true), per_channel_left_shifts(requant_left_shifts), + per_channel_right_shifts(requant_right_shifts), per_channel_muls(requant_muls), minval(minv), maxval(maxv) + { + } +}; + +struct Nothing +{ +}; + +template <typename Top, typename Tret> +using UniqueGemmCommon = std::unique_ptr<GemmCommon<Top, Tret>>; + +/* Low level API calls. + * These are implemented as 'GemmArgs' versions, or with the arguments explicitly listed. */ + +/* get_gemm_method(): Given the templated types and provided parameters, + * which is the preferred method to implement this GEMM? */ +template <typename Top, typename Tret, class OutputStage = Nothing> +KernelDescription get_gemm_method(const GemmArgs &args, const OutputStage & = {}); + +template <typename Top, typename Tret, class OutputStage = Nothing> +UniqueGemmCommon<Top, Tret> gemm(const GemmArgs &args, const OutputStage & = {}); + +template <typename Top, typename Tret, class OutputStage = Nothing> +std::vector<KernelDescription> get_compatible_kernels(const GemmArgs &args, const OutputStage & = {}); + +} // namespace arm_gemm diff --git a/src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp b/src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp new file mode 100644 index 0000000000..718fcd1fb4 --- /dev/null +++ b/src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#include "arm_compute/core/Dimensions.h" +#include "arm_compute/core/Window.h" + +#include "ndrange.hpp" + +#include <cassert> + +/* This file contains mapping between integral types used in arm_compute and arm_gemm + * These two codebases both require a degree of separation for the sake of modularity + * so maintain their own types which represent similar information. + */ + +namespace arm_gemm +{ +//we want to unify the maximum number of dimensions used beween arm_gemm and arm compute library +constexpr std::size_t ndrange_max = + arm_compute::Dimensions<unsigned int>::num_max_dimensions; + +using ndrange_t = NDRange<ndrange_max>; +using ndcoord_t = NDCoordinate<ndrange_max>; + +/* Converts an `arm_gemm::ndrange_t` to a `arm_compute::Window` + * + * As `NDRange<T>` does not not encode start positions, we specify + * the start to be zero in the produced `arm_compute::Window` + * + * @param [ndr] the `arm_gemm::ndrange_t` we wish to convert into a `arm_compute::Window` + * @returns an `arm_compute::Window` representing the same dimensional ranges as `ndr` + */ +inline arm_compute::Window to_window(const ndrange_t &ndr) +{ + arm_compute::Window win; + + for(unsigned int i = 0; i != ndrange_max; ++i) + { + //populate the window with the dimensions of the NDRange + win.set(i, arm_compute::Window::Dimension(0, ndr.get_size(i))); + } + + return win; +} + +/* + * Converts an `arm_gemm::ndcoord_t` to a `arm_compute::Window` + * + * @param [ndc] the `arm_gemm::ndcoord_t` we wish to convert into a `arm_compute::Window` + * @returns an `arm_compute::Window` representing the same dimensional ranges as `ndc` + */ +inline arm_compute::Window to_window(const ndcoord_t &ndc) +{ + arm_compute::Window win; + + for(unsigned int i = 0; i != ndrange_max; ++i) + { + const auto start = ndc.get_position(i); + const auto size = ndc.get_size(i); + const auto stop = start + size; + + //populate the window with the dimensions of the NDRange + win.set(i, arm_compute::Window::Dimension(start, stop)); + } + + return win; +} + +/** Convert an `arm_compute::Window` to an `arm_gemm::NDRange` of the same max dimensions + * + * It should be noted that `arm_compute::Window` specifies a `start()` and an `end()` + * where as `arm_gemm::ndrange_t` only has a size, as a result we store the delta between the range + * + * @param [win] the `arm_compute::Window` we want to convert to `arm_gemm::ndrange_t` + * @return the resultant ndrange_t + */ +inline ndrange_t to_ndrange(const arm_compute::Window &win) +{ + return + { + static_cast<unsigned int>(win[0].end() - win[0].start()), + static_cast<unsigned int>(win[1].end() - win[1].start()), + static_cast<unsigned int>(win[2].end() - win[2].start()), + static_cast<unsigned int>(win[3].end() - win[3].start()), + static_cast<unsigned int>(win[4].end() - win[4].start()), + static_cast<unsigned int>(win[5].end() - win[5].start()) + }; +} + +/** Convert an `arm_compute::Window` to an `arm_gemm::NDCoord` of the same max dimensions + * + * @param [win] the `arm_compute::Window` we want to convert to `arm_gemm::ndcoord_t` + * @return the resultant ndcoord_t + */ +inline ndcoord_t to_ndcoord(const arm_compute::Window &win) +{ + return + { + { static_cast<unsigned int>(win[0].start()), static_cast<unsigned int>(win[0].end() - win[0].start()) }, + { static_cast<unsigned int>(win[1].start()), static_cast<unsigned int>(win[1].end() - win[1].start()) }, + { static_cast<unsigned int>(win[2].start()), static_cast<unsigned int>(win[2].end() - win[2].start()) }, + { static_cast<unsigned int>(win[3].start()), static_cast<unsigned int>(win[3].end() - win[3].start()) }, + { static_cast<unsigned int>(win[4].start()), static_cast<unsigned int>(win[4].end() - win[4].start()) }, + { static_cast<unsigned int>(win[5].start()), static_cast<unsigned int>(win[5].end() - win[5].start()) } + }; +} + +} //namespace arm_gemm diff --git a/src/cpu/kernels/assembly/arm_gemm_local.hpp b/src/cpu/kernels/assembly/arm_gemm_local.hpp new file mode 100644 index 0000000000..78e0adf31f --- /dev/null +++ b/src/cpu/kernels/assembly/arm_gemm_local.hpp @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +/* This file is used to configure integration-specific aspects of arm_gemm into ACL */ + +#include "arm_compute/core/CPP/CPPTypes.h" + +using CPUModel = arm_compute::CPUModel; +using CPUInfo = arm_compute::CPUInfo; diff --git a/src/cpu/kernels/assembly/convolution_parameters.hpp b/src/cpu/kernels/assembly/convolution_parameters.hpp new file mode 100644 index 0000000000..0c1ae58902 --- /dev/null +++ b/src/cpu/kernels/assembly/convolution_parameters.hpp @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#include <cstdint> + +namespace arm_gemm +{ +/* + * Parameter set for "convolution" type GEMM. + * + * For a "convolution" GEMM, the GEMM parameters (M, K) are specified as if + * an im2row had been performed on the input tensor to generate the operand + * matrix, but instead this structure describes the convolution parameters + * such that this can be done on the fly. + * + * The parameters describe the convolution details - the notional shape of + * the input and output tensors, whether padding is to be applied, the size + * of the kernel and a constant value to be used for padding (needed for + * quantized tensors). + * + * The second part describes the layout of the input tensor in memory, which + * is assumed to be in NHWC format. This consists of a base pointer and + * strides for columns, rows and batches. 'multis' are not supported for + * convolution type GEMMs. + */ +struct ConvolutionParameters +{ + int64_t input_width; + int64_t input_height; + int64_t input_channels; + int64_t kernel_width; + int64_t kernel_height; + int64_t output_width; + int64_t output_height; + int64_t output_stride_w; + int64_t output_stride_h; + // output_channels not included as they do not affect the input. + int64_t padding_top; + int64_t padding_left; + float padding_value; +}; + +} // namespace arm_gemm diff --git a/src/cpu/kernels/assembly/gemm_common.hpp b/src/cpu/kernels/assembly/gemm_common.hpp new file mode 100644 index 0000000000..378f1041be --- /dev/null +++ b/src/cpu/kernels/assembly/gemm_common.hpp @@ -0,0 +1,236 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#include "convolution_parameters.hpp" +#include "ndrange.hpp" + +#include <cstddef> + +namespace arm_gemm +{ +// Avoid circular dependency with arm_gemm.hpp +struct GemmConfig; + +// Abstract class for the GEMM/GEMV functions. +// +// GEMM implementations may be "native" (never require any input +// permutation), "pretransposed" (require permutation up-front) or require +// working space (permute as they go along). This interface should support +// all of them. + +// The real GemmCommon class is templated based on the operand and return +// type. This is an interface class which is independent of those types. +class IGemmCommon +{ +public: + /* Pass in the pointers to the arrays to be operated on and their + * strides. This "generic" version uses void *s, the preferred version + * is the one provided by templated GemmCommon (below) which takes + * appropriately typed pointers. If B is pretransposed (see below) then + * the settings for B here are ignored. + */ + virtual void set_arrays_generic(const void *A, const int lda, const int A_batch_stride, const int A_multi_stride, + const void *B, const int ldb, /* batches share B */ const int B_multi_stride, + void *C, const int ldc, const int C_batch_stride, const int C_multi_stride, + const void *bias, /* no row or batch stride needed */ const int bias_multi_stride) = 0; + + /** @returns an ndrange containing ranges of the compute space which can be + * broken up and parallelised over + */ + virtual ndrange_t get_window_size() const = 0; + + /* The maximum thread count is specified when the GEMM is created. Some + * implementations need to know how many threads will actually run in + * order to work properly. + * + * In some cases, after creating the GEMM the number of threads needs to + * be reduced (e.g. not enough work to split across threads). This + * method allows the number of actual threads to be run to be set (must + * be equal or lower). + * + * This has an empty default implementation, as GEMMs which don't care + * about thread count can safely ignore this. + */ + virtual void set_nthreads(int) {}; + + /* Whether this GEMM can be dynamically scheduled or not. */ + virtual bool supports_dynamic_scheduling() const + { + return false; + } + + /** Main execute member fucntion + * @param [in] work_range specifies the range of work we want to be computed, total range defined by get_window_size() + * @param [in] thread_locator where are we inside of the thread space + * @param [in] threadid a unique threadid + */ + virtual void execute(const ndcoord_t &work_range, const ndcoord_t &thread_locator, int threadid) = 0; + + /*** Working space interface (optional) ***/ + /* Total number of bytes of temporary working space needed. If zero, it's not necessary to call set_working_space(). */ + virtual size_t get_working_size() const + { + return 0; + } + /* Provide working space buffer - the void * passed in must remain allocated for the duration of any execute calls. */ + virtual void set_working_space(void *) {}; + + /*** "Pretransposed" interface (optional) ***/ + /* Is this object set up for pretranspose? If so, pretranspose_array() needs to be called before execute(); */ + virtual bool B_is_pretransposed() const + { + return false; + } + /* Does pretranspose still need to be done? */ + virtual bool B_pretranspose_required() const + { + return false; + } + /* Total number of bytes of space needed for pretransposed arrays. */ + virtual size_t get_B_pretransposed_array_size() const + { + return 0; + } + /* Perform pretranspose - arguments are output, input, input row stride and input multi stride. */ + /* The "real" version of this depends on the templated operand type (see below). */ + virtual void pretranspose_B_array_generic(void *, const void *, const int, const int) = 0; + /* Set pretransposed data - the void * passed in must previously have been passed to pretranspose_B_array() for the same or a similar GEMM. */ + virtual void set_pretransposed_B_data(void *) + { + } + + /*** "Quantized bias" interface (optional) ***/ + /* Set the bias vector for quantized GEMMs */ + virtual void set_quantized_bias(const int32_t *, size_t) + { + } + + /*** Indirect interface (optional) ***/ + /* Set the indirect table. This comprises a number of values per kernel point, and a densely packed array of pointers, + * multis * batches * kernel_points */ + virtual void set_indirect_parameters_generic(size_t, const void *const *const *) + { + } + + /*** Convolution interface (optional) ***/ + /* Set the convolution parameters. */ + virtual void set_convolution_parameters(ConvolutionParameters) + { + } + + /*** Introspection interface ***/ + /* Get the configuration of this GEMM */ + virtual GemmConfig get_config() = 0; + + // Destructor + virtual ~IGemmCommon() + { + } +}; + +/* "Real" GemmCommon class which is templated on the operand and return types. + * + * In addition to correctly typed versions of the functions that operate on + * operand and return data, this class provides a default implementation of + * 'set_arrays' to capture the provided arguments in protected class + * members, as essentially any implementation will need these. + */ +template <typename To, typename Tr> +class GemmCommon : public IGemmCommon +{ +protected: + const To *_Aptr = nullptr; + int _lda = 0; + int _A_batch_stride = 0; + int _A_multi_stride = 0; + const To *_Bptr = nullptr; + int _ldb = 0; + int _B_multi_stride = 0; + Tr *_Cptr = nullptr; + int _ldc = 0; + int _C_batch_stride = 0; + int _C_multi_stride = 0; + const Tr *_bias = nullptr; + int _bias_multi_stride = 0; + +public: + /* Pass in the pointers to the arrays to be operated on and their + * strides (templated version with appropriate types). */ + virtual void set_arrays(const To *A, const int lda, const int A_batch_stride, const int A_multi_stride, + const To *B, const int ldb, /* batches share B */ const int B_multi_stride, + Tr *C, const int ldc, const int C_batch_stride, const int C_multi_stride, + const Tr *bias, /* no row or batch stride needed */ const int bias_multi_stride) + { + _Aptr = A; + _lda = lda; + _A_batch_stride = A_batch_stride; + _A_multi_stride = A_multi_stride; + _Bptr = B; + _ldb = ldb; + _B_multi_stride = B_multi_stride; + _Cptr = C; + _ldc = ldc; + _C_batch_stride = C_batch_stride; + _C_multi_stride = C_multi_stride; + _bias = bias; + _bias_multi_stride = bias_multi_stride; + } + + /* Implementation of the void * overload which casts its arguments to the appropriate type. */ + void set_arrays_generic(const void *A, const int lda, const int A_batch_stride, const int A_multi_stride, + const void *B, const int ldb, /* batches share B */ const int B_multi_stride, + void *C, const int ldc, const int C_batch_stride, const int C_multi_stride, + const void *bias, /* no row or batch stride needed */ const int bias_multi_stride) override + { + set_arrays(static_cast<const To *>(A), lda, A_batch_stride, A_multi_stride, + static_cast<const To *>(B), ldb, B_multi_stride, + static_cast<Tr *>(C), ldc, C_batch_stride, C_multi_stride, + static_cast<const Tr *>(bias), bias_multi_stride); + } + + /*** "Pretransposed" interface ***/ + + /* Perform pretranspose - the void * passed in must remain allocated for the duration of any execute calls. */ + /* Arguments are: output buffer pointer, source pointer, source row stride, source multi stride */ + virtual void pretranspose_B_array(void *, const To *, const int, const int) {}; + + /* Implementation of the void * overload which casts its arguments to the appropriate type. */ + void pretranspose_B_array_generic(void *out, const void *in, const int row_stride, const int multi_stride) override + { + pretranspose_B_array(out, static_cast<const To *>(in), row_stride, multi_stride); + } + + /*** Indirect interface ***/ + virtual void set_indirect_parameters(size_t, const To *const *const *) + { + } + + void set_indirect_parameters_generic(size_t sz, const void *const *const *ptr) override + { + set_indirect_parameters(sz, reinterpret_cast<const To *const *const *>(ptr)); + } +}; + +} // namespace arm_gemm diff --git a/src/cpu/kernels/assembly/ndrange.hpp b/src/cpu/kernels/assembly/ndrange.hpp new file mode 100644 index 0000000000..1c8261aef7 --- /dev/null +++ b/src/cpu/kernels/assembly/ndrange.hpp @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#include <algorithm> +#include <array> +#include <cassert> +#include <initializer_list> + +namespace arm_gemm +{ +template <unsigned int D> +class NDRange +{ +private: + std::array<unsigned int, D> m_sizes{}; + std::array<unsigned int, D> m_totalsizes{}; + + class NDRangeIterator + { + private: + const NDRange &m_parent; + unsigned int m_pos = 0; + unsigned int m_end = 0; + + public: + NDRangeIterator(const NDRange &p, unsigned int s, unsigned int e) + : m_parent(p), m_pos(s), m_end(e) + { + } + + bool done() const + { + return (m_pos >= m_end); + } + + unsigned int dim(unsigned int d) const + { + unsigned int r = m_pos; + + if(d < (D - 1)) + { + r %= m_parent.m_totalsizes[d]; + } + + if(d > 0) + { + r /= m_parent.m_totalsizes[d - 1]; + } + + return r; + } + + bool next_dim0() + { + m_pos++; + + return !done(); + } + + bool next_dim1() + { + m_pos += m_parent.m_sizes[0] - dim(0); + + return !done(); + } + + unsigned int dim0_max() const + { + unsigned int offset = std::min(m_end - m_pos, m_parent.m_sizes[0] - dim(0)); + + return dim(0) + offset; + } + }; + + void set_totalsizes() + { + unsigned int t = 1; + + for(unsigned int i = 0; i < D; i++) + { + if(m_sizes[i] == 0) + { + m_sizes[i] = 1; + } + + t *= m_sizes[i]; + + m_totalsizes[i] = t; + } + } + +public: + NDRange &operator=(const NDRange &rhs) = default; + NDRange(const NDRange &rhs) = default; + + template <typename... T> + NDRange(T... ts) + : m_sizes{ ts... } + { + set_totalsizes(); + } + + NDRange(const std::array<unsigned int, D> &n) + : m_sizes(n) + { + set_totalsizes(); + } + + NDRangeIterator iterator(unsigned int start, unsigned int end) const + { + return NDRangeIterator(*this, start, end); + } + + unsigned int total_size() const + { + return m_totalsizes[D - 1]; + } + + unsigned int get_size(unsigned int v) const + { + return m_sizes[v]; + } +}; + +/** NDCoordinate builds upon a range, but specifies a starting position + * in addition to a size which it inherits from NDRange + */ +template <unsigned int N> +class NDCoordinate : public NDRange<N> +{ + using int_t = unsigned int; + using ndrange_t = NDRange<N>; + + std::array<int_t, N> m_positions{}; + +public: + NDCoordinate &operator=(const NDCoordinate &rhs) = default; + NDCoordinate(const NDCoordinate &rhs) = default; + NDCoordinate(const std::initializer_list<std::pair<int_t, int_t>> &list) + { + std::array<int_t, N> sizes{}; + + std::size_t i = 0; + for(auto &p : list) + { + m_positions[i] = p.first; + sizes[i++] = p.second; + } + + //update the parents sizes + static_cast<ndrange_t &>(*this) = ndrange_t(sizes); + } + + int_t get_position(int_t d) const + { + assert(d < N); + + return m_positions[d]; + } + + void set_position(int_t d, int_t v) + { + assert(d < N); + + m_positions[d] = v; + } + + int_t get_position_end(int_t d) const + { + return get_position(d) + ndrange_t::get_size(d); + } +}; //class NDCoordinate + +using ndrange_t = NDRange<6>; +using ndcoord_t = NDCoordinate<6>; + +} // namespace arm_gemm diff --git a/src/cpu/kernels/elementwise/neon/elementwise_list.h b/src/cpu/kernels/elementwise/neon/elementwise_list.h new file mode 100644 index 0000000000..43e44be5e2 --- /dev/null +++ b/src/cpu/kernels/elementwise/neon/elementwise_list.h @@ -0,0 +1,486 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_NEON_KERNELS_ELEMENTWISE_LIST_H +#define SRC_CORE_NEON_KERNELS_ELEMENTWISE_LIST_H + +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/WindowHelpers.h" + +namespace arm_compute +{ +namespace cpu +{ +template <typename InputScalarType, typename OutputScalarType, typename InputVectorType> +void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, + OutputScalarType (*scalar_func)(const InputScalarType &, const InputScalarType &), + int (*broadcast_func)(int, int, int, const InputScalarType *, const InputScalarType &, OutputScalarType *, const bool), + int (*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *, OutputScalarType *)) +{ + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const int window_step_x = std::min(16 / static_cast<int>(sizeof(OutputScalarType)), 8); + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); + + if(is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr()); + const auto non_broadcast_input_ptr = reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr()); + const InputScalarType broadcast_value = *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr()); + + int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_value, output_ptr, !is_broadcast_input_2); + for(; x < window_end_x; ++x) + { + const auto a = *(non_broadcast_input_ptr + x); + *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? broadcast_value : a, !is_broadcast_input_2 ? a : broadcast_value); + } + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(in1, input1_win); + Iterator input2(in2, input2_win); + Iterator output(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr()); + const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr()); + + int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr); + for(; x < window_end_x; ++x) + { + const auto a = *(input1_ptr + x); + const auto b = *(input2_ptr + x); + *(output_ptr + x) = (*scalar_func)(a, b); + } + }, + input1, input2, output); + } +} + +template <ArithmeticOperation op, typename ScalarType> +inline ScalarType elementwise_arithm_op_scalar(const ScalarType &a, const ScalarType &b) +{ + auto res = ScalarType(0); + + switch(op) + { + case ArithmeticOperation::MAX: + res = std::max(a, b); + break; + case ArithmeticOperation::MIN: + res = std::min(a, b); + break; + case ArithmeticOperation::SQUARED_DIFF: + { + res = (a - b) * (a - b); + break; + } + case ArithmeticOperation::PRELU: + { + res = (a > 0 ? a : a * b); + break; + } + case ArithmeticOperation::DIV: + { + res = a / b; + if(std::is_integral<ScalarType>::value) + { + res = (b == 0) ? 0 : res; + if(static_cast<int32_t>(a) % static_cast<int32_t>(b) != 0 && ((a < 0) != (b < 0))) + { + --res; + } + } + break; + } + case ArithmeticOperation::POWER: + { + res = std::pow(a, b); + break; + } + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } + return res; +} + +template <ArithmeticOperation op, typename VectorType> +inline typename VectorType::type elementwise_arithm_op(const typename VectorType::type &a, const typename VectorType::type &b) +{ + using vec_type = typename VectorType::type; + using scalar_type = typename VectorType::scalar_type; + using tag_type = typename VectorType::tag_type; + + vec_type res = wrapper::vdup_n(static_cast<scalar_type>(0), tag_type{}); + + switch(op) + { + case ArithmeticOperation::MAX: + res = wrapper::vmax(a, b); + break; + case ArithmeticOperation::MIN: + res = wrapper::vmin(a, b); + break; + case ArithmeticOperation::SQUARED_DIFF: + { + const vec_type tmp = wrapper::vsub(a, b); + res = wrapper::vmul(tmp, tmp); + break; + } + case ArithmeticOperation::PRELU: + { + const vec_type zero = wrapper::vdup_n(static_cast<scalar_type>(0), tag_type{}); + const vec_type tmp = wrapper::vmul(a, b); + const auto gt = wrapper::vcgt(a, zero); + + res = wrapper::vbsl(gt, a, tmp); + break; + } + + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } + + return res; +} + +template <> +inline int32x4_t elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<int32_t, 4>>(const int32x4_t &a, const int32x4_t &b) +{ + return vcvtq_s32_f32(vfloorq_f32(wrapper::vdiv(vcvtq_f32_s32(a), vcvtq_f32_s32(b)))); +} + +template <> +inline float32x4_t elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<float, 4>>(const float32x4_t &a, const float32x4_t &b) +{ + return wrapper::vdiv(a, b); +} + +template <> +inline float32x4_t elementwise_arithm_op<ArithmeticOperation::POWER, typename wrapper::traits::neon_vector<float, 4>>(const float32x4_t &a, const float32x4_t &b) +{ + return wrapper::vpow(a, b); +} + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +template <> +inline float16x8_t elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<float16_t, 8>>(const float16x8_t &a, const float16x8_t &b) +{ + return wrapper::vdiv(a, b); +} + +template <> +inline float16x8_t elementwise_arithm_op<ArithmeticOperation::POWER, typename wrapper::traits::neon_vector<float16_t, 8>>(const float16x8_t &a, const float16x8_t &b) +{ + return wrapper::vpow(a, b); +} +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +template <ArithmeticOperation op, typename ScalarType, typename VectorType> +inline typename VectorType::type elementwise_arithm_op_broadcast(const typename VectorType::type &a, const ScalarType &broadcast_value, const bool reorder) +{ + using tag_type = typename VectorType::tag_type; + using vec_type = typename VectorType::type; + + vec_type broadcast_vector = wrapper::vdup_n(broadcast_value, tag_type{}); + return elementwise_arithm_op<op, VectorType>(reorder ? broadcast_vector : a, reorder ? a : broadcast_vector); +} + +template <ArithmeticOperation op, typename ScalarType, typename VectorType> +inline int elementwise_arithm_op_loop(int window_start_x, int window_end_x, int window_step_x, + const ScalarType *input1_ptr, const ScalarType *input2_ptr, ScalarType *output_ptr) +{ + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = wrapper::vloadq(input1_ptr + x); + const auto b = wrapper::vloadq(input2_ptr + x); + wrapper::vstore(output_ptr + x, elementwise_arithm_op<op, VectorType>(a, b)); + } + return x; +} + +template <ArithmeticOperation op, typename ScalarType, typename VectorType> +inline int elementwise_arithm_op_broadcast_loop(int window_start_x, int window_end_x, int window_step_x, + const ScalarType *non_broadcast_input_ptr, const ScalarType &broadcast_value, ScalarType *output_ptr, const bool reorder) +{ + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = wrapper::vloadq((non_broadcast_input_ptr + x)); + wrapper::vstore(output_ptr + x, elementwise_arithm_op_broadcast<op, ScalarType, VectorType>(a, broadcast_value, reorder)); + } + return x; +} + +template <ArithmeticOperation op, typename VectorType> +void elementwise_arithm_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + using scalar_type = typename VectorType::scalar_type; + + elementwise_op<scalar_type, scalar_type, VectorType>(in1, in2, out, window, + &elementwise_arithm_op_scalar<op, scalar_type>, + &elementwise_arithm_op_broadcast_loop<op, scalar_type, VectorType>, + &elementwise_arithm_op_loop<op, scalar_type, VectorType>); +} + +template <ComparisonOperation op, typename InputScalarType> +inline uint8_t elementwise_comp_op_scalar(const InputScalarType &a, const InputScalarType &b) +{ + bool res = false; + + switch(op) + { + case ComparisonOperation::Equal: + res = (a == b); + break; + case ComparisonOperation::NotEqual: + res = (a != b); + break; + case ComparisonOperation::Greater: + res = (a > b); + break; + case ComparisonOperation::GreaterEqual: + res = (a >= b); + break; + case ComparisonOperation::Less: + res = (a < b); + break; + case ComparisonOperation::LessEqual: + res = (a <= b); + break; + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } + return res ? ~static_cast<uint8_t>(0) : static_cast<uint8_t>(0); +} + +template <ComparisonOperation op, typename InputVectorType, typename OutputVectorType> +inline OutputVectorType elementwise_comp_op(const InputVectorType &a, const InputVectorType &b) +{ + OutputVectorType res = { 0, 0, 0, 0 }; + + switch(op) + { + case ComparisonOperation::Equal: + res = wrapper::vceq(a, b); + break; + case ComparisonOperation::NotEqual: + res = wrapper::vnot(wrapper::vceq(a, b)); + break; + case ComparisonOperation::Greater: + res = wrapper::vcgt(a, b); + break; + case ComparisonOperation::GreaterEqual: + res = wrapper::vcge(a, b); + break; + case ComparisonOperation::Less: + res = wrapper::vcgt(b, a); + break; + case ComparisonOperation::LessEqual: + res = wrapper::vcge(b, a); + break; + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } + + return res; +} + +template <ComparisonOperation op, typename InputScalarType, typename InputVectorType, typename OutputVectorType> +inline OutputVectorType elementwise_comp_op_broadcast(const InputVectorType &a, const InputScalarType &broadcast_value, const bool reorder) +{ + InputVectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag()); + return elementwise_comp_op<op, InputVectorType, OutputVectorType>(reorder ? broadcast_vector : a, reorder ? a : broadcast_vector); +} + +template <ComparisonOperation op, typename InputScalarType, typename InputVectorType> +inline int elementwise_comp_op_broadcast_8_loop(int window_start_x, int window_end_x, int window_step_x, + const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder) +{ + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint8x16_t>(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder); + wrapper::vstore(output_ptr + x, a); + } + return x; +} + +template <ComparisonOperation op, typename InputScalarType, typename InputVectorType> +inline int elementwise_comp_op_broadcast_16_loop(int window_start_x, int window_end_x, int window_step_x, + const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder) +{ + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint16x8_t>(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder); + wrapper::vstore(output_ptr + x, wrapper::vmovn(a)); + } + return x; +} + +template <ComparisonOperation op, typename InputScalarType, typename InputVectorType> +inline int elementwise_comp_op_broadcast_32_loop(int window_start_x, int window_end_x, int window_step_x, + const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder) +{ + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(wrapper::vloadq(non_broadcast_input_ptr + x), broadcast_value, reorder); + const auto b = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(wrapper::vloadq(non_broadcast_input_ptr + x + 4), broadcast_value, reorder); + wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(a), wrapper::vmovn(b)))); + } + if(x <= window_end_x - 4) + { + const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder); + for(int i = 0; i < 4; i++) + { + *(output_ptr + x + i) = wrapper::vgetlane(a, i); + } + x = +4; + } + return x; +} + +template <ComparisonOperation op, typename InputScalarType, typename InputVectorType> +inline int elementwise_comp_op_8_loop(int window_start_x, int window_end_x, int window_step_x, + const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr) +{ + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = wrapper::vloadq(input1_ptr + x); + const auto b = wrapper::vloadq(input2_ptr + x); + const auto res = elementwise_comp_op<op, InputVectorType, uint8x16_t>(a, b); + wrapper::vstore(output_ptr + x, res); + } + return x; +} + +template <ComparisonOperation op, typename InputScalarType, typename InputVectorType> +inline int elementwise_comp_op_16_loop(int window_start_x, int window_end_x, int window_step_x, + const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr) +{ + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = wrapper::vloadq(input1_ptr + x); + const auto b = wrapper::vloadq(input2_ptr + x); + const auto res = elementwise_comp_op<op, InputVectorType, uint16x8_t>(a, b); + wrapper::vstore(output_ptr + x, wrapper::vmovn(res)); + } + return x; +} + +template <ComparisonOperation op, typename InputScalarType, typename InputVectorType> +inline int elementwise_comp_op_32_loop(int window_start_x, int window_end_x, int window_step_x, + const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr) +{ + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + auto a = wrapper::vloadq(input1_ptr + x); + auto b = wrapper::vloadq(input2_ptr + x); + const auto res = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b); + a = wrapper::vloadq(input1_ptr + x + 4); + b = wrapper::vloadq(input2_ptr + x + 4); + const auto res2 = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b); + wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(res), wrapper::vmovn(res2)))); + } + if(x <= window_end_x - 4) + { + const auto a = wrapper::vloadq(input1_ptr + x); + const auto b = wrapper::vloadq(input2_ptr + x); + const auto res = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b); + for(int i = 0; i < 4; i++) + { + *(output_ptr + x + i) = wrapper::vgetlane(res, i); + } + x = +4; + } + return x; +} + +template <ComparisonOperation op, typename InputScalarType, typename InputVectorType> +void elementwise_comp_op_8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + elementwise_op<InputScalarType, uint8_t, InputVectorType>(in1, in2, out, window, + &elementwise_comp_op_scalar<op, InputScalarType>, + &elementwise_comp_op_broadcast_8_loop<op, InputScalarType, InputVectorType>, + &elementwise_comp_op_8_loop<op, InputScalarType, InputVectorType>); +} + +template <ComparisonOperation op, typename InputScalarType, typename InputVectorType> +void elementwise_comp_op_16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + elementwise_op<InputScalarType, uint8_t, InputVectorType>(in1, in2, out, window, + &elementwise_comp_op_scalar<op, InputScalarType>, + &elementwise_comp_op_broadcast_16_loop<op, InputScalarType, InputVectorType>, + &elementwise_comp_op_16_loop<op, InputScalarType, InputVectorType>); +} + +template <ComparisonOperation op, typename InputScalarType, typename InputVectorType> +void elementwise_comp_op_32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + elementwise_op<InputScalarType, uint8_t, InputVectorType>(in1, in2, out, window, + &elementwise_comp_op_scalar<op, InputScalarType>, + &elementwise_comp_op_broadcast_32_loop<op, InputScalarType, InputVectorType>, + &elementwise_comp_op_32_loop<op, InputScalarType, InputVectorType>); +} +} // namesapce cpu +} // namespace arm_compute + +#endif /* SRC_CORE_NEON_KERNELS_ELEMENTWISE_LIST_H */
\ No newline at end of file diff --git a/src/cpu/kernels/elementwise/neon/elementwise_quantized_list.h b/src/cpu/kernels/elementwise/neon/elementwise_quantized_list.h new file mode 100644 index 0000000000..3b4c112770 --- /dev/null +++ b/src/cpu/kernels/elementwise/neon/elementwise_quantized_list.h @@ -0,0 +1,654 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_NEON_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H +#define SRC_CORE_NEON_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H + +#include "src/cpu/kernels/elementwise/neon/elementwise_list.h" + +namespace arm_compute +{ +namespace cpu +{ +float32x4x4_t load_quantized(const uint8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale) +{ + qasymm8x16_t x = vld1q_u8(input1_ptr); + const float32x4x4_t out = + { + { + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(x))))), offset)), scale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(x))))), offset)), scale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(x))))), offset)), scale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(x))))), offset)), scale), + } + }; + return out; +} + +float32x4x4_t load_quantized_signed(const int8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale) +{ + qasymm8x16_signed_t x = vld1q_s8(input1_ptr); + const float32x4x4_t out = + { + { + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(x)))), offset)), scale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(x)))), offset)), scale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(x)))), offset)), scale), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(x)))), offset)), scale), + } + }; + return out; +} + +void store_quantized(uint8_t *output_ptr, const uint32x4x4_t &out) +{ + const uint8x8_t pa = vqmovn_u16(vcombine_u16(vqmovn_u32(out.val[0]), vqmovn_u32(out.val[1]))); + const uint8x8_t pb = vqmovn_u16(vcombine_u16(vqmovn_u32(out.val[2]), vqmovn_u32(out.val[3]))); + vst1q_u8(output_ptr, vcombine_u8(pa, pb)); +} + +void store_quantized(uint8_t *output_ptr, const int32x4x4_t &out) +{ + const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(out.val[0]), vqmovn_s32(out.val[1]))); + const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(out.val[2]), vqmovn_s32(out.val[3]))); + vst1q_u8(output_ptr, vcombine_u8(pa, pb)); +} + +void store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale) +{ + int32x4x4_t out = + { + { + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)), + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)), + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)), + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)), + } + }; + store_quantized(output_ptr, out); +} + +void store_quantized_signed(int8_t *output_ptr, const int32x4x4_t &out) +{ + const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(out.val[0]), vqmovn_s32(out.val[1]))); + const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(out.val[2]), vqmovn_s32(out.val[3]))); + vst1q_s8(output_ptr, vcombine_s8(pa, pb)); +} + +void store_quantized_signed(int8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale) +{ + int32x4x4_t out = + { + { + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)), + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)), + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)), + vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)), + } + }; + store_quantized_signed(output_ptr, out); +} + +template <ArithmeticOperation op> +inline uint8_t elementwise_arithm_op_quantized_scalar(const float &a, const float &b, UniformQuantizationInfo qinfo) +{ + return quantize_qasymm8(elementwise_arithm_op_scalar<op>(a, b), qinfo); +} + +template <ArithmeticOperation op> +inline int8_t elementwise_arithm_op_quantized_signed_scalar(const float &a, const float &b, UniformQuantizationInfo qinfo) +{ + return quantize_qasymm8_signed(elementwise_arithm_op_scalar<op>(a, b), qinfo); +} + +template <ArithmeticOperation op> +inline float32x4x4_t elementwise_arithm_op(const float32x4x4_t &a, const float32x4x4_t &b) +{ + using neon_vector_float = wrapper::traits::neon_vector<float, 4>; + float32x4x4_t out = + { + { + elementwise_arithm_op<op, neon_vector_float>(a.val[0], b.val[0]), + elementwise_arithm_op<op, neon_vector_float>(a.val[1], b.val[1]), + elementwise_arithm_op<op, neon_vector_float>(a.val[2], b.val[2]), + elementwise_arithm_op<op, neon_vector_float>(a.val[3], b.val[3]), + } + }; + return out; +} + +template <ComparisonOperation op> +inline uint8_t elementwise_comp_op_quantized_scalar(const float &a, const float &b, UniformQuantizationInfo qinfo) +{ + ARM_COMPUTE_UNUSED(qinfo); + return elementwise_comp_op_scalar<op>(a, b); +} + +template <ComparisonOperation op> +inline uint32x4x4_t elementwise_comp_op(const float32x4x4_t &a, const float32x4x4_t &b) +{ + uint32x4x4_t out = + { + { + elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[0], b.val[0]), + elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[1], b.val[1]), + elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[2], b.val[2]), + elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[3], b.val[3]) + } + }; + return out; +} + +template <ArithmeticOperation op> +inline int elementwise_arithm_op_quantized_loop(int window_start_x, int window_end_x, int window_step_x, + const uint8_t *input1_ptr, const uint8_t *input2_ptr, uint8_t *output_ptr, + int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2, + float32x4_t voffseto, float32x4_t invvscaleo) +{ + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Get inputs and compute output + const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1); + const float32x4x4_t bf = load_quantized(input2_ptr + x, voffset2, vscale2); + const float32x4x4_t rf = elementwise_arithm_op<op>(af, bf); + store_quantized(output_ptr + x, rf, voffseto, invvscaleo); + } + return x; +} + +template <ArithmeticOperation op> +inline int elementwise_arithm_op_quantized_singed_loop(int window_start_x, int window_end_x, int window_step_x, + const int8_t *input1_ptr, const int8_t *input2_ptr, int8_t *output_ptr, + int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2, + float32x4_t voffseto, float32x4_t invvscaleo) +{ + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Get inputs and compute output + const float32x4x4_t af = load_quantized_signed(input1_ptr + x, voffset1, vscale1); + const float32x4x4_t bf = load_quantized_signed(input2_ptr + x, voffset2, vscale2); + const float32x4x4_t rf = elementwise_arithm_op<op>(af, bf); + store_quantized_signed(output_ptr + x, rf, voffseto, invvscaleo); + } + return x; +} + +template <ArithmeticOperation op> +inline int elementwise_arithm_op_quantized_broadcast_loop(int window_start_x, int window_end_x, int window_step_x, + const uint8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, uint8_t *output_ptr, + int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast, + float32x4_t voffseto, float32x4_t invvscaleo, bool reorder) +{ + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float32x4x4_t af = load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); + const float32x4x4_t rf = elementwise_arithm_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector); + store_quantized(output_ptr + x, rf, voffseto, invvscaleo); + } + return x; +} +template <ArithmeticOperation op> +inline int elementwise_arithm_op_quantized_signed_broadcast_loop(int window_start_x, int window_end_x, int window_step_x, + const int8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, int8_t *output_ptr, + int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast, + float32x4_t voffseto, float32x4_t invvscaleo, bool reorder) +{ + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float32x4x4_t af = load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); + const float32x4x4_t rf = elementwise_arithm_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector); + store_quantized_signed(output_ptr + x, rf, voffseto, invvscaleo); + } + return x; +} + +template <ComparisonOperation op> +inline int elementwise_comp_op_quantized_loop(int window_start_x, int window_end_x, int window_step_x, + const uint8_t *input1_ptr, const uint8_t *input2_ptr, uint8_t *output_ptr, + int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2, + float32x4_t voffseto, float32x4_t invvscaleo) +{ + ARM_COMPUTE_UNUSED(voffseto, invvscaleo); + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1); + const float32x4x4_t bf = load_quantized(input2_ptr + x, voffset2, vscale2); + const uint32x4x4_t rf = elementwise_comp_op<op>(af, bf); + store_quantized(output_ptr + x, rf); + } + return x; +} + +template <ComparisonOperation op> +inline int elementwise_comp_op_quantized_signed_loop(int window_start_x, int window_end_x, int window_step_x, + const int8_t *input1_ptr, const int8_t *input2_ptr, uint8_t *output_ptr, + int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2, + float32x4_t voffseto, float32x4_t invvscaleo) +{ + ARM_COMPUTE_UNUSED(voffseto, invvscaleo); + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float32x4x4_t af = load_quantized_signed(input1_ptr + x, voffset1, vscale1); + const float32x4x4_t bf = load_quantized_signed(input2_ptr + x, voffset2, vscale2); + const uint32x4x4_t rf = elementwise_comp_op<op>(af, bf); + store_quantized(output_ptr + x, rf); + } + return x; +} + +template <ComparisonOperation op> +inline int elementwise_comp_op_quantized_broadcast_loop(int window_start_x, int window_end_x, int window_step_x, + const uint8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, uint8_t *output_ptr, + int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast, + float32x4_t voffseto, float32x4_t invvscaleo, bool reorder) +{ + ARM_COMPUTE_UNUSED(voffseto, invvscaleo); + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float32x4x4_t af = load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); + const uint32x4x4_t rf = elementwise_comp_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector); + store_quantized(output_ptr + x, rf); + } + return x; +} + +template <ComparisonOperation op> +inline int elementwise_comp_op_quantized_signed_broadcast_loop(int window_start_x, int window_end_x, int window_step_x, + const int8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, uint8_t *output_ptr, + int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast, + float32x4_t voffseto, float32x4_t invvscaleo, bool reorder) +{ + ARM_COMPUTE_UNUSED(voffseto, invvscaleo); + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const float32x4x4_t af = load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast); + const uint32x4x4_t rf = elementwise_comp_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector); + store_quantized(output_ptr + x, rf); + } + return x; +} + +void elementwise_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, + uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo), + int (*broadcast_func)(int, int, int, const uint8_t *, float32x4x4_t, uint8_t *, int32x4_t, float32x4_t, + float32x4_t, float32x4_t, const bool), + int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *, + int32x4_t, int32x4_t, float32x4_t, float32x4_t, + float32x4_t, float32x4_t)) +{ + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); + + const UniformQuantizationInfo output_qinfo = out->info()->quantization_info().uniform(); + + // Output quantization info (add 0.5 to round toward the nearest integer - 0.5 rounds away from zero) + const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset + 0.5f); + const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale); + + if(is_broadcast_across_x) + { + // Select the broadcast input on the X axis + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; + + const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); + const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); + + const int32x4_t voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset); + const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale); + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); + + const uint8_t broadcast_value = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr()); + const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_u8(broadcast_value), broadcast_qinfo); + + int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_vector, output_ptr, + voffset_non_broadcast, vscale_non_broadcast, voffseto, invvscaleo, !is_broadcast_input_2); + for(; x < window_end_x; ++x) + { + const float afs = dequantize_qasymm8(*(non_broadcast_input_ptr + x), non_broadcast_qinfo); + const float bfs = dequantize_qasymm8(broadcast_value, broadcast_qinfo); + *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? afs : bfs, output_qinfo); + } + }, + broadcast_input, non_broadcast_input, output); + } + else + { + const UniformQuantizationInfo input1_qinfo = in1->info()->quantization_info().uniform(); + const UniformQuantizationInfo input2_qinfo = in2->info()->quantization_info().uniform(); + + // Input1 quantization info + const int32x4_t voffset1 = vdupq_n_s32(input1_qinfo.offset); + const float32x4_t vscale1 = vdupq_n_f32(input1_qinfo.scale); + + // Input2 quantization info + const int32x4_t voffset2 = vdupq_n_s32(input2_qinfo.offset); + const float32x4_t vscale2 = vdupq_n_f32(input2_qinfo.scale); + + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(in1, input1_win); + Iterator input2(in2, input2_win); + Iterator output(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); + + int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, voffset1, voffset2, + vscale1, vscale2, voffseto, invvscaleo); + for(; x < window_end_x; ++x) + { + const float afs = dequantize_qasymm8(*(input1_ptr + x), input1_qinfo); + const float bfs = dequantize_qasymm8(*(input2_ptr + x), input2_qinfo); + *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo); + } + }, + input1, input2, output); + } +} + +void elementwise_comp_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, + uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo), + int (*broadcast_func)(int, int, int, const int8_t *, float32x4x4_t, uint8_t *, int32x4_t, float32x4_t, + float32x4_t, float32x4_t, const bool), + int (*neon_func)(int, int, int, const int8_t *, const int8_t *, uint8_t *, + int32x4_t, int32x4_t, float32x4_t, float32x4_t, + float32x4_t, float32x4_t)) +{ + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); + + const UniformQuantizationInfo output_qinfo = out->info()->quantization_info().uniform(); + + const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset); + const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale); + + if(is_broadcast_across_x) + { + // Select the broadcast input on the X axis + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; + + const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); + const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); + + const int32x4_t voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset); + const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale); + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); + + const int8_t broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr()); + const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo); + + int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_vector, output_ptr, + voffset_non_broadcast, vscale_non_broadcast, voffseto, invvscaleo, !is_broadcast_input_2); + for(; x < window_end_x; ++x) + { + const float afs = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo); + const float bfs = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo); + *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? afs : bfs, output_qinfo); + } + }, + broadcast_input, non_broadcast_input, output); + } + else + { + const UniformQuantizationInfo input1_qinfo = in1->info()->quantization_info().uniform(); + const UniformQuantizationInfo input2_qinfo = in2->info()->quantization_info().uniform(); + + // Input1 quantization info + const int32x4_t voffset1 = vdupq_n_s32(input1_qinfo.offset); + const float32x4_t vscale1 = vdupq_n_f32(input1_qinfo.scale); + + // Input2 quantization info + const int32x4_t voffset2 = vdupq_n_s32(input2_qinfo.offset); + const float32x4_t vscale2 = vdupq_n_f32(input2_qinfo.scale); + + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(in1, input1_win); + Iterator input2(in2, input2_win); + Iterator output(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); + + int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, voffset1, voffset2, + vscale1, vscale2, voffseto, invvscaleo); + for(; x < window_end_x; ++x) + { + const float afs = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo); + const float bfs = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo); + *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo); + } + }, + input1, input2, output); + } +} + +void elementwise_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, + int8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo), + int (*broadcast_func)(int, int, int, const int8_t *, float32x4x4_t, int8_t *, int32x4_t, float32x4_t, + float32x4_t, float32x4_t, const bool), + int (*neon_func)(int, int, int, const int8_t *, const int8_t *, int8_t *, + int32x4_t, int32x4_t, float32x4_t, float32x4_t, + float32x4_t, float32x4_t)) +{ + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); + + const UniformQuantizationInfo output_qinfo = out->info()->quantization_info().uniform(); + + const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset); + const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale); + + if(is_broadcast_across_x) + { + // Select the broadcast input on the X axis + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; + + const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); + const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); + + const int32x4_t voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset); + const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale); + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); + + const int8_t broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr()); + const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo); + + int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_vector, output_ptr, + voffset_non_broadcast, vscale_non_broadcast, voffseto, invvscaleo, !is_broadcast_input_2); + for(; x < window_end_x; ++x) + { + const float afs = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo); + const float bfs = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo); + *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? afs : bfs, output_qinfo); + } + }, + broadcast_input, non_broadcast_input, output); + } + else + { + const UniformQuantizationInfo input1_qinfo = in1->info()->quantization_info().uniform(); + const UniformQuantizationInfo input2_qinfo = in2->info()->quantization_info().uniform(); + + // Input1 quantization info + const int32x4_t voffset1 = vdupq_n_s32(input1_qinfo.offset); + const float32x4_t vscale1 = vdupq_n_f32(input1_qinfo.scale); + + // Input2 quantization info + const int32x4_t voffset2 = vdupq_n_s32(input2_qinfo.offset); + const float32x4_t vscale2 = vdupq_n_f32(input2_qinfo.scale); + + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(in1, input1_win); + Iterator input2(in2, input2_win); + Iterator output(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); + + int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, voffset1, voffset2, + vscale1, vscale2, voffseto, invvscaleo); + for(; x < window_end_x; ++x) + { + const float afs = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo); + const float bfs = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo); + *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo); + } + }, + input1, input2, output); + } +} + +template <ArithmeticOperation op> +void elementwise_arithm_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + elementwise_op_quantized(in1, in2, out, window, &elementwise_arithm_op_quantized_scalar<op>, + &elementwise_arithm_op_quantized_broadcast_loop<op>, + &elementwise_arithm_op_quantized_loop<op>); +} +template <ArithmeticOperation op> +void elementwise_arithm_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + elementwise_op_quantized_signed(in1, in2, out, window, &elementwise_arithm_op_quantized_signed_scalar<op>, + &elementwise_arithm_op_quantized_signed_broadcast_loop<op>, + &elementwise_arithm_op_quantized_singed_loop<op>); +} + +template <ComparisonOperation op> +void elementwise_comp_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + elementwise_op_quantized(in1, in2, out, window, &elementwise_comp_op_quantized_scalar<op>, + &elementwise_comp_op_quantized_broadcast_loop<op>, + &elementwise_comp_op_quantized_loop<op>); +} + +template <ComparisonOperation op> +void elementwise_comp_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + elementwise_comp_quantized_signed(in1, in2, out, window, &elementwise_comp_op_quantized_scalar<op>, + &elementwise_comp_op_quantized_signed_broadcast_loop<op>, + &elementwise_comp_op_quantized_signed_loop<op>); +} +} // namespace cpu +} // namespace arm_compute + +#endif /* SRC_CORE_NEON_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H */ diff --git a/src/cpu/kernels/elementwise/neon/elementwise_unary_list.h b/src/cpu/kernels/elementwise/neon/elementwise_unary_list.h new file mode 100644 index 0000000000..307e95fae9 --- /dev/null +++ b/src/cpu/kernels/elementwise/neon/elementwise_unary_list.h @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_NEON_KERNELS_ELEMENTWISE_UNARY_LIST_H +#define SRC_CORE_NEON_KERNELS_ELEMENTWISE_UNARY_LIST_H + +#include "arm_compute/core/Types.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + +namespace arm_compute +{ +namespace cpu +{ +template <typename ScalarType> +inline ScalarType elementwise_op_scalar_imp(ElementWiseUnary op, const ScalarType &a) +{ + switch(op) + { + case ElementWiseUnary::RSQRT: + return 1 / sqrt(a); + case ElementWiseUnary::EXP: + return std::exp(a); + case ElementWiseUnary::NEG: + return -a; + case ElementWiseUnary::LOG: + return std::log(a); + case ElementWiseUnary::ABS: + return std::abs(a); + case ElementWiseUnary::ROUND: + return support::cpp11::nearbyint(a); + case ElementWiseUnary::SIN: + return std::sin(a); + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } +} + +template <typename ScalarType, typename VectorType> +inline VectorType elementwise_op_imp(ElementWiseUnary op, const VectorType &a) +{ + switch(op) + { + case ElementWiseUnary::RSQRT: + return wrapper::vinvsqrt(a); + case ElementWiseUnary::EXP: + return wrapper::vexpq(a); + case ElementWiseUnary::NEG: + return wrapper::vneg(a); + case ElementWiseUnary::LOG: + return wrapper::vlog(a); + case ElementWiseUnary::ABS: + return wrapper::vabs(a); + case ElementWiseUnary::ROUND: + return wrapper::vround(a); + case ElementWiseUnary::SIN: + return wrapper::vsin(a); + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } +} + +template <typename ScalarType> +void elementwise_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op) +{ + const int window_step_x = 16 / sizeof(ScalarType); + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(in, win); + Iterator output(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr()); + const auto input_ptr = reinterpret_cast<const ScalarType *>(input.ptr()); + + int x = window_start_x; + for(; x <= window_end_x - window_step_x; x += window_step_x) + { + wrapper::vstore(output_ptr + x, elementwise_op_imp<ScalarType>(op, wrapper::vloadq(input_ptr + x))); + } + for(; x < window_end_x; ++x) + { + *(output_ptr + x) = elementwise_op_scalar_imp(op, *(input_ptr + x)); + } + }, + input, output); +} + +} // namespace cpu +} // namespace arm_compute + +#endif // SRC_CORE_NEON_KERNELS_ELEMENTWISE_UNARY_LIST_H
\ No newline at end of file diff --git a/src/cpu/kernels/elementwise/sve/elementwise.cpp b/src/cpu/kernels/elementwise/sve/elementwise.cpp new file mode 100644 index 0000000000..2f9a7998df --- /dev/null +++ b/src/cpu/kernels/elementwise/sve/elementwise.cpp @@ -0,0 +1,311 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(__ARM_FEATURE_SVE) +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "src/cpu/kernels/elementwise/sve/elementwise_list.h" +#include <arm_sve.h> + +namespace arm_compute +{ +namespace cpu +{ +using namespace arm_compute::wrapper; + +template <typename InputScalarType, typename OutputScalarType, typename OperatorType> +struct LoopArguments +{ + OperatorType op; + const InputScalarType *input1_ptr; + const InputScalarType *input2_ptr; + OutputScalarType *output_ptr; +}; + +template <typename InputScalarType, typename OutputScalarType, typename OperatorType> +struct BroadcastLoopArguments +{ + OperatorType op; + const InputScalarType *input1_ptr; + InputScalarType broadcast_value; + OutputScalarType *output_ptr; + bool reorder; +}; + +template <typename InputScalarType, typename OutputScalarType> +void arithmetic_op_loop(svbool_t pg, const LoopArguments<InputScalarType, OutputScalarType, ArithmeticOperation> &args) +{ + const auto in1 = svld1(pg, args.input1_ptr); + const auto in2 = svld1(pg, args.input2_ptr); + const auto res = elementwise_arithmetic_op<typename sve_vector<InputScalarType>::type>(pg, in1, in2, args.op); + svst1(pg, args.output_ptr, res); +} + +template <typename InputScalarType, typename OutputScalarType> +void arithmetic_op_broadcast_loop(svbool_t pg, const BroadcastLoopArguments<InputScalarType, OutputScalarType, ArithmeticOperation> &args) +{ + const auto non_broadcast_vector = svld1(pg, args.input1_ptr); + const auto broadcast_vector = svdup_n(args.broadcast_value); + const auto in1 = args.reorder ? broadcast_vector : non_broadcast_vector; + const auto in2 = args.reorder ? non_broadcast_vector : broadcast_vector; + const auto res = elementwise_arithmetic_op<typename sve_vector<InputScalarType>::type>(pg, in1, in2, args.op); + svst1(pg, args.output_ptr, res); +} + +template <typename InputScalarType, typename OutputScalarType> +void comparison_op_loop(svbool_t pg, const LoopArguments<InputScalarType, OutputScalarType, ComparisonOperation> &args) +{ + const auto in1 = svld1(pg, args.input1_ptr); + const auto in2 = svld1(pg, args.input2_ptr); + const auto res = elementwise_comparison_op<typename sve_vector<InputScalarType>::type, typename sve_vector<OutputScalarType>::type>(pg, in1, in2, args.op); + const svbool_t output_pg = narrow_to_byte_predicate<sizeof(InputScalarType)>(pg); + svst1(output_pg, args.output_ptr, res); +} + +template <typename InputScalarType, typename OutputScalarType> +void comparison_op_broadcast_loop(svbool_t pg, const BroadcastLoopArguments<InputScalarType, OutputScalarType, ComparisonOperation> &args) +{ + const auto non_broadcast_vector = svld1(pg, args.input1_ptr); + const auto broadcast_vector = svdup_n(args.broadcast_value); + const auto in1 = args.reorder ? broadcast_vector : non_broadcast_vector; + const auto in2 = args.reorder ? non_broadcast_vector : broadcast_vector; + const auto res = elementwise_comparison_op<typename sve_vector<InputScalarType>::type, typename sve_vector<OutputScalarType>::type>(pg, in1, in2, args.op); + const svbool_t output_pg = narrow_to_byte_predicate<sizeof(InputScalarType)>(pg); + svst1(output_pg, args.output_ptr, res); +} + +template <typename InputScalarType, typename OutputScalarType, typename OperatorType> +using LoopFuncType = void (*)(svbool_t, const LoopArguments<InputScalarType, OutputScalarType, OperatorType> &); + +template <typename InputScalarType, typename OutputScalarType, typename OperatorType> +using BroadcastLoopFuncType = void (*)(svbool_t, const BroadcastLoopArguments<InputScalarType, OutputScalarType, OperatorType> &); + +template <typename InputVectorType, typename OutputVectorType, typename OperatorType, + typename InputScalarType = typename sve_scalar<InputVectorType>::type, + typename OutputScalarType = typename sve_scalar<OutputVectorType>::type> +void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, + OperatorType op, + LoopFuncType<InputScalarType, OutputScalarType, OperatorType> func, + BroadcastLoopFuncType<InputScalarType, OutputScalarType, OperatorType> broadcast_func) +{ + const auto all_true_pg = svptrue<InputScalarType>(); + + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); + + if(is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr()); + const auto non_broadcast_input_ptr = reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr()); + const InputScalarType broadcast_value = *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr()); + + int x = window_start_x; + + svbool_t pg = svwhilelt<InputScalarType>(x, window_end_x); + do + { + broadcast_func(pg, + { + op, + non_broadcast_input_ptr + x, + broadcast_value, + output_ptr + x, + !is_broadcast_input_2 + }); + x += svcnt<InputScalarType>(); + pg = svwhilelt<InputScalarType>(x, window_end_x); + } + while(svptest_any(all_true_pg, pg)); + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(in1, input1_win); + Iterator input2(in2, input2_win); + Iterator output(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr()); + const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr()); + + int x = window_start_x; + + svbool_t pg = svwhilelt<InputScalarType>(x, window_end_x); + do + { + func(pg, + { + op, + input1_ptr + x, + input2_ptr + x, + output_ptr + x + }); + x += svcnt<InputScalarType>(); + pg = svwhilelt<InputScalarType>(x, window_end_x); + } + while(svptest_any(all_true_pg, pg)); + }, + input1, input2, output); + } +} + +template <ArithmeticOperation op, typename ScalarType> +void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + using VectorType = typename sve_vector<ScalarType>::type; + + elementwise_op<VectorType, VectorType, ArithmeticOperation>(in1, in2, out, window, op, + &arithmetic_op_loop<ScalarType, ScalarType>, + &arithmetic_op_broadcast_loop<ScalarType, ScalarType>); +} + +template <ComparisonOperation op, typename InputScalarType, typename OutputScalarType = uint8_t> +void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType), "input data type's width should be equal to or greater than output data type's width"); + using InputVectorType = typename sve_vector<InputScalarType>::type; + using OutputVectorType = typename sve_vector<OutputScalarType>::type; + + elementwise_op<InputVectorType, OutputVectorType, ComparisonOperation>(in1, in2, out, window, op, + &comparison_op_loop<InputScalarType, OutputScalarType>, + &comparison_op_broadcast_loop<InputScalarType, OutputScalarType>); +} + +template <> +svint32_t elementwise_pow<svint32_t>(svbool_t &pg, const svint32_t &a, const svint32_t &b) +{ + return svcvt_s32_z(pg, svpow_z(pg, svcvt_f32_z(pg, a), svcvt_f32_z(pg, b))); +} + +template <> +svint32_t elementwise_div<svint32_t>(svbool_t &pg, const svint32_t &a, const svint32_t &b) +{ + return svcvt_s32_z(pg, svdiv_z(pg, svcvt_f32_z(pg, a), svcvt_f32_z(pg, b))); +} + +template <> +svint16_t elementwise_div<svint16_t>(svbool_t &pg, const svint16_t &a, const svint16_t &b) +{ + ARM_COMPUTE_UNUSED(pg, a, b); + ARM_COMPUTE_ERROR("Not supported"); +} + +template void elementwise_arithmetic_op<ArithmeticOperation::MAX, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op<ArithmeticOperation::MAX, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op<ArithmeticOperation::MAX, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op<ArithmeticOperation::MAX, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template void elementwise_arithmetic_op<ArithmeticOperation::MIN, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op<ArithmeticOperation::MIN, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op<ArithmeticOperation::MIN, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op<ArithmeticOperation::MIN, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template void elementwise_arithmetic_op<ArithmeticOperation::SQUARED_DIFF, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op<ArithmeticOperation::SQUARED_DIFF, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op<ArithmeticOperation::SQUARED_DIFF, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op<ArithmeticOperation::SQUARED_DIFF, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template void elementwise_arithmetic_op<ArithmeticOperation::PRELU, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op<ArithmeticOperation::PRELU, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op<ArithmeticOperation::PRELU, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op<ArithmeticOperation::PRELU, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template void elementwise_arithmetic_op<ArithmeticOperation::DIV, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op<ArithmeticOperation::DIV, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op<ArithmeticOperation::DIV, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op<ArithmeticOperation::DIV, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template void elementwise_arithmetic_op<ArithmeticOperation::POWER, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op<ArithmeticOperation::POWER, float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op<ArithmeticOperation::POWER, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op<ArithmeticOperation::POWER, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template void elementwise_comparison_op<ComparisonOperation::Equal, float>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op<ComparisonOperation::Equal, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op<ComparisonOperation::Equal, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op<ComparisonOperation::Equal, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op<ComparisonOperation::Equal, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template void elementwise_comparison_op<ComparisonOperation::NotEqual, float>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op<ComparisonOperation::NotEqual, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op<ComparisonOperation::NotEqual, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op<ComparisonOperation::NotEqual, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op<ComparisonOperation::NotEqual, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template void elementwise_comparison_op<ComparisonOperation::Greater, float>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op<ComparisonOperation::Greater, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op<ComparisonOperation::Greater, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op<ComparisonOperation::Greater, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op<ComparisonOperation::Greater, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, float>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op<ComparisonOperation::GreaterEqual, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template void elementwise_comparison_op<ComparisonOperation::Less, float>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op<ComparisonOperation::Less, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op<ComparisonOperation::Less, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op<ComparisonOperation::Less, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op<ComparisonOperation::Less, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template void elementwise_comparison_op<ComparisonOperation::LessEqual, float>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op<ComparisonOperation::LessEqual, int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op<ComparisonOperation::LessEqual, float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op<ComparisonOperation::LessEqual, int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op<ComparisonOperation::LessEqual, uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +} // namespace cpu +} // namespace arm_compute +#endif /* defined(__ARM_FEATURE_SVE) */
\ No newline at end of file diff --git a/src/cpu/kernels/elementwise/sve/elementwise_list.h b/src/cpu/kernels/elementwise/sve/elementwise_list.h new file mode 100644 index 0000000000..f762587ce7 --- /dev/null +++ b/src/cpu/kernels/elementwise/sve/elementwise_list.h @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H +#define SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H +#if defined(ARM_COMPUTE_ENABLE_SVE) +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/core/utils/misc/Traits.h" +#include "src/core/NEON/SVEMath.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" +#include "src/core/NEON/wrapper/svtraits.h" +#include "src/cpu/kernels/elementwise/sve/elementwise_list.h" +#include <arm_sve.h> + +namespace arm_compute +{ +namespace cpu +{ +using namespace arm_compute::wrapper; + +template <typename VectorType> +VectorType elementwise_pow(svbool_t &pg, const VectorType &a, const VectorType &b) +{ + return svpow_z(pg, a, b); +} + +template <typename VectorType> +VectorType elementwise_div(svbool_t &pg, const VectorType &a, const VectorType &b) +{ + return svdiv_z(pg, a, b); +} + +template <uint32_t bytewidth> +svbool_t narrow_to_byte_predicate(svbool_t pg) +{ + const auto all_false = svpfalse(); + + switch(bytewidth) + { + case 8: + pg = svuzp1_b32(pg, all_false); + /* fall through */ + case 4: + pg = svuzp1_b16(pg, all_false); + /* fall through */ + case 2: + pg = svuzp1_b8(pg, all_false); + /* fall through */ + default: + break; + } + return pg; +} + +template <typename VectorType> +VectorType elementwise_arithmetic_op(svbool_t &pg, const VectorType &a, const VectorType &b, ArithmeticOperation op) +{ + using ScalarType = typename wrapper::sve_scalar<VectorType>::type; + VectorType res{}; + + switch(op) + { + case ArithmeticOperation::MAX: + res = svmax_z(pg, a, b); + break; + case ArithmeticOperation::MIN: + res = svmin_z(pg, a, b); + break; + case ArithmeticOperation::SQUARED_DIFF: + { + const auto tmp = svsub_z(pg, a, b); + res = svmul_z(pg, tmp, tmp); + break; + } + case ArithmeticOperation::PRELU: + { + const auto zero = svdup_n(ScalarType(0)); + const auto tmp = svmul_z(pg, a, b); + const auto gt = svcmpgt(pg, a, zero); + res = svsel(gt, a, tmp); + break; + } + case ArithmeticOperation::DIV: + { + res = elementwise_div(pg, a, b); + break; + } + case ArithmeticOperation::POWER: + { + res = elementwise_pow(pg, a, b); + break; + } + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } + + return res; +} + +template <typename InputVectorType, typename OutputVectorType> +OutputVectorType elementwise_comparison_op(svbool_t &pg, const InputVectorType &a, const InputVectorType &b, ComparisonOperation op) +{ + svbool_t selection_vector{}; + + switch(op) + { + case ComparisonOperation::Equal: + selection_vector = svcmpeq(pg, a, b); + break; + case ComparisonOperation::NotEqual: + selection_vector = svcmpne(pg, a, b); + break; + case ComparisonOperation::Greater: + selection_vector = svcmpgt(pg, a, b); + break; + case ComparisonOperation::GreaterEqual: + selection_vector = svcmpge(pg, a, b); + break; + case ComparisonOperation::Less: + selection_vector = svcmplt(pg, a, b); + break; + case ComparisonOperation::LessEqual: + selection_vector = svcmple(pg, a, b); + break; + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } + + using InputScalarType = typename wrapper::sve_scalar<InputVectorType>::type; + selection_vector = narrow_to_byte_predicate<sizeof(InputScalarType)>(selection_vector); + + using OutputScalarType = typename wrapper::sve_scalar<OutputVectorType>::type; + const auto false_vector = svdup_n(static_cast<OutputScalarType>((uint32_t)0)); + const auto true_vector = svdup_n(static_cast<OutputScalarType>(~(uint32_t)0)); + auto ret = svsel(selection_vector, true_vector, false_vector); + + return ret; +} + +template <ArithmeticOperation op, typename ScalarType> +void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template <ComparisonOperation op, typename ScalarType, typename OutputScalarType = uint8_t> +void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +} // namespace cpu +} // namespace arm_compute +#endif // defined(ARM_COMPUTE_ENABLE_SVE) +#endif /* SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H */ diff --git a/src/cpu/kernels/elementwise/sve/elementwise_quantized_list.h b/src/cpu/kernels/elementwise/sve/elementwise_quantized_list.h new file mode 100644 index 0000000000..a5d17a86a7 --- /dev/null +++ b/src/cpu/kernels/elementwise/sve/elementwise_quantized_list.h @@ -0,0 +1,366 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_SVE_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H +#define SRC_CORE_SVE_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H + +#if defined(ARM_COMPUTE_ENABLE_SVE2) + +#include "src/core/NEON/wrapper/svtraits.h" +#include "src/cpu/kernels/elementwise/sve/elementwise_list.h" + +namespace arm_compute +{ +namespace cpu +{ +using namespace arm_compute::wrapper; + +template <typename InputScalarType, typename OutputScalarType, typename OperatorType> +struct QuantizedLoopArguments +{ + OperatorType op; + const InputScalarType *input1_ptr; + const InputScalarType *input2_ptr; + OutputScalarType *output_ptr; + + const svint32_t &in1_offset; + const svint32_t &in2_offset; + const svint32_t &out_offset; + const svfloat32_t &in1_scale; + const svfloat32_t &in2_scale; + const svfloat32_t &out_scale; +}; + +template <typename InputScalarType, typename OutputScalarType, typename OperatorType> +struct BroadcastQuantizedLoopArguments +{ + OperatorType op; + const InputScalarType *input1_ptr; + float broadcast_value; + OutputScalarType *output_ptr; + bool reorder; + + const svint32_t &in1_offset; + const svint32_t &out_offset; + const svfloat32_t &in1_scale; + const svfloat32_t &out_scale; +}; + +svfloat32x4_t load_quantized(const int8_t *ptr, svbool_t pg, const svint32_t &offset, const svfloat32_t &scale) +{ + auto x = svld1(pg, ptr); + + const auto widened = svcreate4( + svmovlb(svmovlb(x)), + svmovlt(svmovlb(x)), + svmovlb(svmovlt(x)), + svmovlt(svmovlt(x))); + + pg = svptrue_b8(); + + return svcreate4( + svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 0), offset)), scale), + svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 1), offset)), scale), + svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 2), offset)), scale), + svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 3), offset)), scale)); +} + +svfloat32x4_t load_quantized(const uint8_t *ptr, svbool_t pg, const svint32_t &offset, const svfloat32_t &scale) +{ + auto x = svld1(pg, ptr); + + //vprint(x); + + const auto widened = svcreate4( + svmovlb(svmovlb(x)), + svmovlt(svmovlb(x)), + svmovlb(svmovlt(x)), + svmovlt(svmovlt(x))); + + pg = svptrue_b8(); + + return svcreate4( + svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 0)), offset)), scale), + svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 1)), offset)), scale), + svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 2)), offset)), scale), + svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 3)), offset)), scale)); +} + +void store_quantized(uint8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale) +{ + const auto quantized = svcreate4( + svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 0), inv_scale))), offset), + svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 1), inv_scale))), offset), + svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 2), inv_scale))), offset), + svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 3), inv_scale))), offset)); + + const auto narrowed_bottom = svqxtunt(svqxtunb(svget4(quantized, 0)), svget4(quantized, 1)); + const auto narrowed_top = svqxtunt(svqxtunb(svget4(quantized, 2)), svget4(quantized, 3)); + const auto narrowed = svqxtnt(svqxtnb(narrowed_bottom), narrowed_top); + svst1(pg, ptr, narrowed); +} + +void store_quantized(int8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale) +{ + const auto quantized = svcreate4( + svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 0), inv_scale))), offset), + svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 1), inv_scale))), offset), + svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 2), inv_scale))), offset), + svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 3), inv_scale))), offset)); + + const auto narrowed_bottom = svqxtnt(svqxtnb(svget4(quantized, 0)), svget4(quantized, 1)); + const auto narrowed_top = svqxtnt(svqxtnb(svget4(quantized, 2)), svget4(quantized, 3)); + const auto narrowed = svqxtnt(svqxtnb(narrowed_bottom), narrowed_top); + + svst1(pg, ptr, narrowed); +} + +template <typename InputScalarType, typename OutputScalarType> +inline void arithmetic_op_quantized_loop(svbool_t pg, const QuantizedLoopArguments<InputScalarType, OutputScalarType, ArithmeticOperation> &args) +{ + const auto in1 = load_quantized(args.input1_ptr, pg, args.in1_offset, args.in1_scale); + const auto in2 = load_quantized(args.input2_ptr, pg, args.in2_offset, args.in2_scale); + + const auto result = svcreate4( + elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 0), svget4(in2, 0), args.op), + elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 1), svget4(in2, 1), args.op), + elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 2), svget4(in2, 2), args.op), + elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 3), svget4(in2, 3), args.op)); + + store_quantized(args.output_ptr, pg, result, args.out_offset, args.out_scale); +} + +template <typename InputScalarType, typename OutputScalarType> +inline void arithmetic_op_broadcast_quantized_loop(svbool_t pg, const BroadcastQuantizedLoopArguments<InputScalarType, OutputScalarType, ArithmeticOperation> &args) +{ + const auto in1 = load_quantized(args.input1_ptr, pg, args.in1_offset, args.in1_scale); + const auto in2 = svcreate4( + svdup_n(args.broadcast_value), svdup_n(args.broadcast_value), svdup_n(args.broadcast_value), svdup_n(args.broadcast_value)); + + const auto &af = args.reorder ? in2 : in1; + const auto &bf = args.reorder ? in1 : in2; + + const auto result = svcreate4( + elementwise_arithmetic_op<svfloat32_t>(pg, svget4(af, 0), svget4(bf, 0), args.op), + elementwise_arithmetic_op<svfloat32_t>(pg, svget4(af, 1), svget4(bf, 1), args.op), + elementwise_arithmetic_op<svfloat32_t>(pg, svget4(af, 2), svget4(bf, 2), args.op), + elementwise_arithmetic_op<svfloat32_t>(pg, svget4(af, 3), svget4(bf, 3), args.op)); + + store_quantized(args.output_ptr, pg, result, args.out_offset, args.out_scale); +} + +template <typename InputScalarType, typename OutputScalarType> +inline void comparison_op_quantized_loop(svbool_t pg, const QuantizedLoopArguments<InputScalarType, OutputScalarType, ComparisonOperation> &args) +{ + const auto in1 = load_quantized(args.input1_ptr, pg, args.in1_offset, args.in1_scale); + const auto in2 = load_quantized(args.input2_ptr, pg, args.in2_offset, args.in2_scale); + + using OutputVectorType = typename wrapper::traits::sve_vector<OutputScalarType>::type; + + const auto result = svcreate4( + elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 0), svget4(in2, 0), args.op), + elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 1), svget4(in2, 1), args.op), + elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 2), svget4(in2, 2), args.op), + elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 3), svget4(in2, 3), args.op)); + + const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1)); + const auto zipped_top = svzip1(svget4(result, 2), svget4(result, 3)); + const auto zipped = svzip1(zipped_bottom, zipped_top); + svst1(pg, args.output_ptr, zipped); +} + +template <typename InputScalarType, typename OutputScalarType> +inline void comparison_op_broadcast_quantized_loop(svbool_t pg, const BroadcastQuantizedLoopArguments<InputScalarType, OutputScalarType, ComparisonOperation> &args) +{ + const auto in1 = load_quantized(args.input1_ptr, pg, args.in1_offset, args.in1_scale); + const auto in2 = svcreate4( + svdup_n(args.broadcast_value), svdup_n(args.broadcast_value), svdup_n(args.broadcast_value), svdup_n(args.broadcast_value)); + + const auto &af = args.reorder ? in2 : in1; + const auto &bf = args.reorder ? in1 : in2; + + using OutputVectorType = typename wrapper::traits::sve_vector<OutputScalarType>::type; + + const auto result = svcreate4( + elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(af, 0), svget4(bf, 0), args.op), + elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(af, 1), svget4(bf, 1), args.op), + elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(af, 2), svget4(bf, 2), args.op), + elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(af, 3), svget4(bf, 3), args.op)); + + const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1)); + const auto zipped_top = svzip1(svget4(result, 2), svget4(result, 3)); + const auto zipped = svzip1(zipped_bottom, zipped_top); + svst1(pg, args.output_ptr, zipped); +} + +template <typename InputScalarType, typename OutputScalarType, typename OperatorType> +using LoopQuantizedFuncType = void (*)(svbool_t, const QuantizedLoopArguments<InputScalarType, OutputScalarType, OperatorType> &); + +template <typename InputScalarType, typename OutputScalarType, typename OperatorType> +using BroadcastQuantizedLoopFuncType = void (*)(svbool_t, const BroadcastQuantizedLoopArguments<InputScalarType, OutputScalarType, OperatorType> &); + +template <typename InputVectorType, typename OutputVectorType, typename OperatorType, + typename InputScalarType = typename wrapper::sve_scalar<InputVectorType>::type, + typename OutputScalarType = typename wrapper::sve_scalar<OutputVectorType>::type> +void elementwise_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, + OperatorType op, + LoopQuantizedFuncType<InputScalarType, OutputScalarType, OperatorType> func, + BroadcastQuantizedLoopFuncType<InputScalarType, OutputScalarType, OperatorType> broadcast_func) +{ + const auto all_true_pg = wrapper::svptrue<InputScalarType>(); + + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); + + const auto output_voffset = svdup_n(out->info()->quantization_info().uniform().offset); + const auto output_vscale = svdup_n(1.f / out->info()->quantization_info().uniform().scale); + + if(is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; + + const auto non_broadcast_qinfo = is_broadcast_input_2 ? in1->info()->quantization_info() : in2->info()->quantization_info(); + const auto broadcast_qinfo = is_broadcast_input_2 ? in2->info()->quantization_info() : in1->info()->quantization_info(); + + const auto non_broadcast_voffset = svdup_n(non_broadcast_qinfo.uniform().offset); + const auto non_broadcast_vscale = svdup_n(non_broadcast_qinfo.uniform().scale); + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr()); + const auto non_broadcast_input_ptr = reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr()); + const InputScalarType broadcast_value = *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr()); + + int x = window_start_x; + + svbool_t pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x); + do + { + const auto args = BroadcastQuantizedLoopArguments<InputScalarType, OutputScalarType, OperatorType> + { + op, + non_broadcast_input_ptr + x, + Qasymm8QuantizationHelper<InputScalarType>::dequantize(broadcast_value, broadcast_qinfo), + output_ptr + x, + !is_broadcast_input_2, + non_broadcast_voffset, output_voffset, + non_broadcast_vscale, output_vscale + }; + broadcast_func(pg, args); + x += wrapper::svcnt<InputScalarType>(); + pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x); + } + while(svptest_any(all_true_pg, pg)); + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(in1, input1_win); + Iterator input2(in2, input2_win); + Iterator output(out, win); + + const auto in1_voffset = svdup_n(in1->info()->quantization_info().uniform().offset); + const auto in1_vscale = svdup_n(in1->info()->quantization_info().uniform().scale); + + const auto in2_voffset = svdup_n(in2->info()->quantization_info().uniform().offset); + const auto in2_vscale = svdup_n(in2->info()->quantization_info().uniform().scale); + + execute_window_loop(win, [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr()); + const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr()); + + int x = window_start_x; + + svbool_t pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x); + do + { + const auto args = QuantizedLoopArguments<InputScalarType, OutputScalarType, OperatorType> + { + op, + input1_ptr + x, + input2_ptr + x, + output_ptr + x, + in1_voffset, in2_voffset, output_voffset, + in1_vscale, in2_vscale, output_vscale + }; + func(pg, args); + x += wrapper::svcnt<InputScalarType>(); + pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x); + } + while(svptest_any(all_true_pg, pg)); + }, + input1, input2, output); + } +} + +template <ArithmeticOperation op, typename ScalarType> +void elementwise_arithmetic_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + using VectorType = typename wrapper::traits::sve_vector<ScalarType>::type; + elementwise_quantized_op<VectorType, VectorType, ArithmeticOperation>(in1, in2, out, window, op, + &arithmetic_op_quantized_loop<ScalarType, ScalarType>, + &arithmetic_op_broadcast_quantized_loop<ScalarType, ScalarType>); +} + +template <ComparisonOperation op, typename InputScalarType, typename OutputScalarType = uint8_t> +void elementwise_comparison_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType), "input data type's width should be equal to or greater than output data type's width"); + using InputVectorType = typename wrapper::traits::sve_vector<InputScalarType>::type; + using OutputVectorType = typename wrapper::traits::sve_vector<OutputScalarType>::type; + elementwise_quantized_op<InputVectorType, OutputVectorType, ComparisonOperation>(in1, in2, out, window, op, + &comparison_op_quantized_loop<InputScalarType, OutputScalarType>, + &comparison_op_broadcast_quantized_loop<InputScalarType, OutputScalarType>); +} +} // namespace cpu +} // namespace arm_compute + +#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ +#endif /* SRC_CORE_SVE_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H */
\ No newline at end of file diff --git a/src/cpu/kernels/elementwise/sve/elementwise_unary.cpp b/src/cpu/kernels/elementwise/sve/elementwise_unary.cpp new file mode 100644 index 0000000000..ddf1febd66 --- /dev/null +++ b/src/cpu/kernels/elementwise/sve/elementwise_unary.cpp @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(__ARM_FEATURE_SVE) +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/core/utils/misc/Traits.h" +#include "src/core/NEON/SVEMath.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" +#include <arm_sve.h> + +namespace arm_compute +{ +namespace cpu +{ +template <typename ScalarType, typename VectorType> +inline typename std::enable_if<utils::traits::is_floating_point<ScalarType>::value, VectorType>::type elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a) +{ + switch(op) + { + case ElementWiseUnary::RSQRT: + return svinvsqrt(pg, a); + case ElementWiseUnary::EXP: + return wrapper::svexp_z(pg, a); + case ElementWiseUnary::NEG: + return svneg_z(pg, a); + case ElementWiseUnary::LOG: + return wrapper::svlog_z(pg, a); + case ElementWiseUnary::ABS: + return svabs_z(pg, a); + case ElementWiseUnary::ROUND: + return svrintn_z(pg, a); + case ElementWiseUnary::SIN: + return wrapper::svsin_z(pg, a); + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED"); + } +} + +template <typename ScalarType, typename VectorType> +inline typename std::enable_if<std::is_integral<ScalarType>::value, VectorType>::type elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a) +{ + switch(op) + { + case ElementWiseUnary::NEG: + return svneg_z(pg, a); + case ElementWiseUnary::ABS: + return svabs_z(pg, a); + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED"); + } +} + +template <typename ScalarType> +void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op) +{ + const auto all_true_pg = wrapper::svptrue<ScalarType>(); + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(in, win); + Iterator output(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr()); + const auto input_ptr = reinterpret_cast<const ScalarType *>(input.ptr()); + int x = window_start_x; + + svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); + do + { + const auto vin = svld1(pg, input_ptr + x); + svst1(pg, output_ptr + x, elementwise_op_sve_imp<ScalarType, decltype(vin)>(pg, op, vin)); + x += wrapper::svcnt<ScalarType>(); + pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); + } + while(svptest_any(all_true_pg, pg)); + }, + input, output); +} + +template void elementwise_sve_op<float16_t>(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op); +template void elementwise_sve_op<float32_t>(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op); +template void elementwise_sve_op<int32_t>(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op); +} // namespace cpu +} // namespace arm_compute +#endif /* defined(__ARM_FEATURE_SVE) */
\ No newline at end of file diff --git a/src/cpu/kernels/elementwise/sve/elementwise_unary_list.h b/src/cpu/kernels/elementwise/sve/elementwise_unary_list.h new file mode 100644 index 0000000000..c2b495f27c --- /dev/null +++ b/src/cpu/kernels/elementwise/sve/elementwise_unary_list.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_SVE_KERNELS_ELEMENTWISE_UNARY_LIST_H +#define SRC_CORE_SVE_KERNELS_ELEMENTWISE_UNARY_LIST_H + +#include "arm_compute/core/Types.h" +#if defined(ARM_COMPUTE_ENABLE_SVE) + +namespace arm_compute +{ +namespace cpu +{ +template <typename ScalarType> +void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op); +} // namespace cpu +} // namespace arm_compute +#endif // defined(ARM_COMPUTE_ENABLE_SVE) +#endif // SRC_CORE_NEON_KERNELS_ELEMENTWISE_UNARY_LIST_H
\ No newline at end of file diff --git a/src/cpu/kernels/floor/list.h b/src/cpu/kernels/floor/list.h new file mode 100644 index 0000000000..4367e0ffc9 --- /dev/null +++ b/src/cpu/kernels/floor/list.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_NEON_KERNELS_FLOOR_LIST_H +#define SRC_CORE_NEON_KERNELS_FLOOR_LIST_H + +namespace arm_compute +{ +namespace cpu +{ +#define DECLARE_FLOOR_KERNEL(func_name) \ + void func_name(const void *src, void *dst, int len) + +DECLARE_FLOOR_KERNEL(fp16_neon_floor); +DECLARE_FLOOR_KERNEL(fp32_neon_floor); + +#undef DECLARE_FLOOR_KERNEL +} // namespace cpu +} // namespace arm_compute + +#endif /* SRC_CORE_NEON_KERNELS_FLOOR_LIST_H */ diff --git a/src/cpu/kernels/floor/neon/fp16.cpp b/src/cpu/kernels/floor/neon/fp16.cpp new file mode 100644 index 0000000000..f362676a36 --- /dev/null +++ b/src/cpu/kernels/floor/neon/fp16.cpp @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + +#include "src/common/utils/Validate.h" +#include "src/core/NEON/NEMath.h" + +#include <arm_neon.h> +#include <cmath> +#include <cstddef> + +namespace arm_compute +{ +namespace cpu +{ +constexpr int step = 8; + +void fp16_neon_floor(const void *src, void *dst, int len) +{ + ARM_COMPUTE_ASSERT_NOT_NULLPTR(src); + ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst); + ARM_COMPUTE_ASSERT(len >= 0); + + auto psrc = static_cast<const __fp16 *>(src); + auto pdst = static_cast<__fp16 *>(dst); + + for(; len >= step; len -= step) + { + vst1q_f16(pdst, vfloorq_f16(vld1q_f16(psrc))); + psrc += step; + pdst += step; + } + + for(; len > 0; --len) + { + *pdst = std::floor(*psrc); + ++psrc; + ++pdst; + } +} +} // namespace cpu +} // namespace arm_compute +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/cpu/kernels/floor/neon/fp32.cpp b/src/cpu/kernels/floor/neon/fp32.cpp new file mode 100644 index 0000000000..f5efb2e849 --- /dev/null +++ b/src/cpu/kernels/floor/neon/fp32.cpp @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/common/utils/Validate.h" +#include "src/core/NEON/NEMath.h" + +#include <arm_neon.h> +#include <cmath> +#include <cstddef> + +namespace arm_compute +{ +namespace cpu +{ +constexpr int step = 4; + +void fp32_neon_floor(const void *src, void *dst, int len) +{ + ARM_COMPUTE_ASSERT_NOT_NULLPTR(src); + ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst); + ARM_COMPUTE_ASSERT(len >= 0); + + auto psrc = static_cast<const float *>(src); + auto pdst = static_cast<float *>(dst); + + for(; len >= step; len -= step) + { + vst1q_f32(pdst, vfloorq_f32(vld1q_f32(psrc))); + psrc += step; + pdst += step; + } + + for(; len > 0; --len) + { + *pdst = std::floor(*psrc); + ++pdst; + ++psrc; + } +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp new file mode 100644 index 0000000000..eed4bb9dd5 --- /dev/null +++ b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp @@ -0,0 +1,359 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h" + +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/utils/AssemblyUtils.h" + +#include "src/core/NEON/kernels/assembly/depthwise.hpp" + +#include "depthwise_common.hpp" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +using namespace arm_compute::misc::shape_calculator; + +namespace +{ +constexpr unsigned int idx_width = 1; +constexpr unsigned int idx_height = 2; +constexpr unsigned int idx_channels = 0; +constexpr unsigned int idx_batches = 3; + +template <typename TSrc, typename TWeights, typename TDst> +void create_arm_dwc(const ITensorInfo *src, const ITensorInfo *weights, ITensorInfo *dst, + const ConvolutionInfo &info, const CPUInfo &cpu_info, + std::unique_ptr<arm_conv::depthwise::IDepthwiseCommon> &kernel) +{ + unsigned int stride_cols{}; + unsigned int stride_rows{}; + std::tie(stride_cols, stride_rows) = info.pad_stride_info.stride(); + + const arm_conv::PaddingValues padding = assembly_utils::map_to_arm_conv_padding(info.pad_stride_info); + + const unsigned int n_batches = src->dimension(idx_batches); + const unsigned int src_rows = src->dimension(idx_height); + const unsigned int src_cols = src->dimension(idx_width); + const unsigned int n_channels = src->dimension(idx_channels); + const unsigned int dst_rows = dst->dimension(idx_height); + const unsigned int dst_cols = dst->dimension(idx_width); + + const unsigned int kernel_cols = weights->dimension(idx_width); + const unsigned int kernel_rows = weights->dimension(idx_height); + + const arm_gemm::Activation activation = assembly_utils::map_to_arm_gemm_activation(info.act_info); + + arm_conv::depthwise::DepthwiseArgs args(&cpu_info, kernel_rows, kernel_cols, stride_rows, stride_cols, + n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, info.depth_multiplier, + padding, activation, nullptr); + + // Configure assembly pooling kernel + auto dwc_kernel_asm = arm_conv::depthwise::depthwise<TSrc, TWeights, TDst>(args); + if(dwc_kernel_asm == nullptr) + { + // Configuration not supported: Leave function unconfigured: + return; + } + + kernel = std::move(dwc_kernel_asm); +} + +template <typename TSrc, typename TWeights, typename TDst> +void create_arm_dwc_quant(const ITensorInfo *src, const ITensorInfo *weights, ITensorInfo *dst, + const ConvolutionInfo &info, const CPUInfo &cpu_info, + std::unique_ptr<arm_conv::depthwise::IDepthwiseCommon> &kernel, + std::vector<int32_t> &multipliers, std::vector<int32_t> &right_shifts, std::vector<int32_t> &left_shifts) +{ + unsigned int stride_cols{}; + unsigned int stride_rows{}; + std::tie(stride_cols, stride_rows) = info.pad_stride_info.stride(); + + const arm_conv::PaddingValues padding = assembly_utils::map_to_arm_conv_padding(info.pad_stride_info); + + const unsigned int n_batches = src->dimension(idx_batches); + const unsigned int src_rows = src->dimension(idx_height); + const unsigned int src_cols = src->dimension(idx_width); + const unsigned int n_channels = src->dimension(idx_channels); + const unsigned int dst_rows = dst->dimension(idx_height); + const unsigned int dst_cols = dst->dimension(idx_width); + + const unsigned int kernel_cols = weights->dimension(idx_width); + const unsigned int kernel_rows = weights->dimension(idx_height); + + const arm_gemm::Activation activation = assembly_utils::map_to_arm_gemm_activation(info.act_info); + + arm_conv::depthwise::DepthwiseArgs args(&cpu_info, kernel_rows, kernel_cols, stride_rows, stride_cols, + n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, info.depth_multiplier, + padding, activation, nullptr); + + const auto src_qinfo = src->quantization_info().uniform(); + const auto weights_qinfo = weights->quantization_info(); + const auto dst_qinfo = dst->quantization_info().uniform(); + + const unsigned int num_filters = weights_qinfo.scale().size(); + + multipliers.resize(num_filters); + std::vector<int32_t> dst_shifts(num_filters); + quantization::compute_quantized_multipliers_and_shifts(src, + weights, + dst, + multipliers.data(), + dst_shifts.data()); + + // Quantize activation bounds + int32_t min_activation = std::numeric_limits<TSrc>::lowest(); + int32_t max_activation = std::numeric_limits<TSrc>::max(); + if(info.act_info.enabled()) + { + std::tie(min_activation, max_activation) = get_quantized_activation_min_max(info.act_info, src->data_type(), dst_qinfo); + } + + // Set quantization parameters for assembly kernels + arm_gemm::Requantize32 requant_args{}; + if(is_data_type_quantized_per_channel(weights->data_type())) + { + left_shifts.resize(num_filters); + right_shifts.resize(num_filters); + bool need_left_shift = false; // Select more optimized path if left shift is not needed + for(unsigned int i = 0; i < num_filters; ++i) + { + left_shifts[i] = std::max(-dst_shifts[i], static_cast<int32_t>(0)); + right_shifts[i] = std::min(-dst_shifts[i], static_cast<int32_t>(0)); + if(dst_shifts[i] < 0 && !need_left_shift) + { + need_left_shift = true; + } + } + + requant_args = arm_gemm::Requantize32(nullptr, + 0, + src_qinfo.offset, + weights_qinfo.uniform().offset, + dst_qinfo.offset, + (need_left_shift) ? left_shifts.data() : nullptr, + right_shifts.data(), + multipliers.data(), + static_cast<TSrc>(min_activation), + static_cast<TSrc>(max_activation)); + } + else + { + requant_args = arm_gemm::Requantize32(nullptr, + 0, + src_qinfo.offset, + weights_qinfo.uniform().offset, + dst_qinfo.offset, + -dst_shifts[0], + multipliers[0], + static_cast<TSrc>(min_activation), + static_cast<TSrc>(max_activation)); + } + + // Configure assembly pooling kernel with requantization + auto dwc_kernel_asm = arm_conv::depthwise::depthwise<TSrc, TWeights, TDst, arm_gemm::Requantize32>(args, requant_args); + if(dwc_kernel_asm == nullptr) + { + // Configuration not supported: Leave function unconfigured: + return; + } + + kernel = std::move(dwc_kernel_asm); +} +} // namespace + +CpuDepthwiseConv2dAssemblyWrapperKernel::CpuDepthwiseConv2dAssemblyWrapperKernel() + : _kernel_asm(nullptr), + _multipliers(), + _left_shifts(), + _right_shifts() +{ +} + +CpuDepthwiseConv2dAssemblyWrapperKernel::~CpuDepthwiseConv2dAssemblyWrapperKernel() = default; + +void CpuDepthwiseConv2dAssemblyWrapperKernel::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *, ITensorInfo *dst, + const ConvolutionInfo &info, const CPUInfo &cpu_info) +{ + ARM_COMPUTE_UNUSED(cpu_info); + ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); + + // Destination initialization if not yet initialized + const TensorShape dst_shape = compute_depthwise_convolution_shape(*src, *weights, info); + auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape)); + +#if defined(__aarch64__) + switch(src->data_type()) + { + case DataType::QASYMM8: + if(is_data_type_quantized_per_channel(weights->data_type())) + { + create_arm_dwc_quant<uint8_t, int8_t, uint8_t>(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts); + } + else + { + create_arm_dwc_quant<uint8_t, uint8_t, uint8_t>(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts); + } + break; + case DataType::QASYMM8_SIGNED: + create_arm_dwc_quant<int8_t, int8_t, int8_t>(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts); + break; +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + case DataType::F16: + create_arm_dwc<float16_t, float16_t, float16_t>(src, weights, dst, info, cpu_info, _kernel_asm); + break; +#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + case DataType::F32: + create_arm_dwc<float, float, float>(src, weights, dst, info, cpu_info, _kernel_asm); + break; + default: + break; + } +#endif // defined(__aarch64__) + + Window win = calculate_max_window(*dst, Steps()); + ICpuKernel::configure(win); +} + +Status CpuDepthwiseConv2dAssemblyWrapperKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + +#if !defined(__aarch64__) + ARM_COMPUTE_RETURN_ERROR_MSG("32-bit is not supported by assembly kernels"); +#endif // !defined(__aarch64__) + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NHWC, "Only NHWC is supported by assembly kernels"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.dilation != Size2D(1, 1), "Assembly kernels do not support dilation != (1, 1)"); + + if(is_data_type_quantized_per_channel(weights->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->quantization_info().scale().size()); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); + } + + if(bias != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != weights->dimension(0)); + + if(is_data_type_quantized(src->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); + } + } + + if(dst->total_size() > 0) + { + const TensorShape dst_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + } + return Status{}; +} + +void CpuDepthwiseConv2dAssemblyWrapperKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(_kernel_asm.get()); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_UNUSED(window); + ARM_COMPUTE_UNUSED(info); + + ARM_COMPUTE_ERROR_ON(tensors.empty()); + + const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC_0); + ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); + ITensor *workspace = tensors.get_tensor(TensorType::ACL_INT_0); + ITensor *storage = tensors.get_tensor(TensorType::ACL_INT_1); + + const auto src_ptr = src->buffer() + src->info()->offset_first_element_in_bytes(); + auto dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes(); + auto working_space = workspace->buffer() + workspace->info()->offset_first_element_in_bytes(); + auto parameters_ptr = storage->buffer() + storage->info()->offset_first_element_in_bytes(); + + const auto src_shape = src->info()->tensor_shape(); + const auto dst_shape = dst->info()->tensor_shape(); + const auto src_padding = src->info()->padding(); + const auto dst_padding = dst->info()->padding(); + + const size_t ld_src_col = src_shape[0] + src_padding.left + src_padding.right; + const size_t ld_src_row = ld_src_col * (src_shape[1] + src_padding.top + src_padding.bottom); + const size_t ld_src_batch = ld_src_row * src_shape[2]; + const size_t ld_dst_col = dst_shape[0] + dst_padding.left + dst_padding.right; + const size_t ld_dst_row = ld_dst_col * (dst_shape[1] + dst_padding.top + dst_padding.bottom); + const size_t ld_dst_batch = ld_dst_row * dst_shape[2]; + + _kernel_asm->execute(src_ptr, ld_src_col, ld_src_row, ld_src_batch, + parameters_ptr, + dst_ptr, ld_dst_col, ld_dst_row, ld_dst_batch, + working_space, info.thread_id, info.num_threads); +} + +void CpuDepthwiseConv2dAssemblyWrapperKernel::pack_parameters(void *parameters_ptr, void *bias_ptr, void *weights_ptr, size_t ld_weights_col, size_t ld_weight_row) +{ + _kernel_asm->pack_parameters(parameters_ptr, bias_ptr, weights_ptr, ld_weights_col, ld_weight_row); +} + +size_t CpuDepthwiseConv2dAssemblyWrapperKernel::get_storage_size() const +{ + return _kernel_asm->get_storage_size(); +} + +size_t CpuDepthwiseConv2dAssemblyWrapperKernel::get_working_size(unsigned int num_threads, unsigned int num_input_channels) const +{ + return _kernel_asm->get_working_size(num_threads, num_input_channels); +} + +bool CpuDepthwiseConv2dAssemblyWrapperKernel::is_configured() const +{ + return _kernel_asm != nullptr; +} + +const char *CpuDepthwiseConv2dAssemblyWrapperKernel::name() const +{ + return "CpuDepthwiseConv2dAssemblyWrapperKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h new file mode 100644 index 0000000000..8ee24a6613 --- /dev/null +++ b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_WRAPPER_KERNEL_H +#define ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_WRAPPER_KERNEL_H + +#include "arm_compute/core/Types.h" +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +namespace arm_conv +{ +namespace depthwise +{ +// Forward declarations +class IDepthwiseCommon; +} // depthwise +} // arm_conv + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** This class is a wrapper for the depthwise convolution assembly kernels. */ +class CpuDepthwiseConv2dAssemblyWrapperKernel final : public ICpuKernel +{ +public: + /** Default constructor */ + CpuDepthwiseConv2dAssemblyWrapperKernel(); + ~CpuDepthwiseConv2dAssemblyWrapperKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDepthwiseConv2dAssemblyWrapperKernel); + + /** Initialise the kernel's src and dst. + * + * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. + * Data type supported: same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED. + * @param[in] bias Bias tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. + * Data type supported: same as @p src, S32 when @p src is QASYMM8/QASYMM8_SIGNED. + * @param[out] dst Destination tensor info. Data type supported: same as @p input. + * @param[in] info Depthwise convolution layer meta-data. + * @param[in] cpu_info CPU information needed to select the most appropriate kernel. + */ + void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const ConvolutionInfo &info, const CPUInfo &cpu_info); + + /** Indicates whether or not this function can be used to process the given parameters. + * + * Similar to @ref CpuDepthwiseConv2dAssemblyWrapperKernel::configure() + * + * @return a status. + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + const char *name() const override; + + /** Pack bias and weights in a storage space for the assembly kernel + * + * @param[in] parameters_ptr Pointer to storage space. + * @param[in] bias_ptr Pointer to bias buffer. + * @param[in] weights_ptr Pointer to weights buffer. + * @param[in] ld_weights_col Columns displacement for the weights tensor. + * @param[in] ld_weights_row Rows displacement for the weights tensor. + */ + void pack_parameters(void *parameters_ptr, void *bias_ptr, void *weights_ptr, size_t ld_weights_col, size_t ld_weights_row); + + /** Get the amount of storage space required for the rearranged weights and bias. + * + * @return size of workspace + */ + size_t get_storage_size() const; + + /** Get size of the workspace needed by the assembly kernel. + * + * @param[in] num_threads Maximum number of threads that are going to be spawned. + * @param[in] num_input_channels Number of channels of the input tensor. + * + * @return size of workspace + */ + size_t get_working_size(unsigned int num_threads, unsigned int num_input_channels) const; + + /** Was the asm kernel successfully configured? + * + * @return True if the asm kernel is configured and ready to run + */ + bool is_configured() const; + +private: + std::unique_ptr<arm_conv::depthwise::IDepthwiseCommon> _kernel_asm; + std::vector<int32_t> _multipliers{}; + std::vector<int32_t> _left_shifts{}; + std::vector<int32_t> _right_shifts{}; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_WRAPPER_KERNEL_H */ diff --git a/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp b/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp new file mode 100644 index 0000000000..958c04b677 --- /dev/null +++ b/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp @@ -0,0 +1,279 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/INEKernel.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +using namespace arm_compute::misc::shape_calculator; + +void CpuPool2dAssemblyWrapperKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info) +{ + ARM_COMPUTE_UNUSED(cpu_info); + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + + // dst initialization if not yet initialized + auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_pool_shape(*src, info))); + +#if defined(__aarch64__) + const bool requantize = src->quantization_info() != dst->quantization_info(); + + switch(src->data_type()) + { + case DataType::QASYMM8: + if(requantize) + { + create_arm_pooling_requant<uint8_t, uint8_t>(src, dst, info, cpu_info); + } + else + { + create_arm_pooling<uint8_t, uint8_t>(src, dst, info, cpu_info); + } + break; + case DataType::QASYMM8_SIGNED: + if(requantize) + { + create_arm_pooling_requant<int8_t, int8_t>(src, dst, info, cpu_info); + } + else + { + create_arm_pooling<int8_t, int8_t>(src, dst, info, cpu_info); + } + break; +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + create_arm_pooling<float16_t, float16_t>(src, dst, info, cpu_info); + break; +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + case DataType::F32: + create_arm_pooling<float, float>(src, dst, info, cpu_info); + break; + default: + break; + } +#endif // defined(__aarch64__) + + Window win = calculate_max_window(*dst, Steps()); + INEKernel::configure(win); +} + +Status CpuPool2dAssemblyWrapperKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + +#ifndef __aarch64__ + ARM_COMPUTE_RETURN_ERROR_MSG("32-bit is not supported by assembly kernels"); +#endif /* __aarch64__ */ + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((src->data_layout() != DataLayout::NHWC) || (info.data_layout != DataLayout::NHWC), "Only NHWC is supported by assembly kernels"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((info.pool_type != PoolingType::AVG) && (info.pool_type != PoolingType::MAX), + "Only AVG and MAX pooling are supported by assembly kernels"); + + if(dst->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + + const auto src_qinfo = src->quantization_info().uniform(); + const auto dst_qinfo = dst->quantization_info().uniform(); + + if(src_qinfo != dst_qinfo) + { + const float multiplier = src_qinfo.scale / dst_qinfo.scale; + int32_t dst_multiplier{}; + int32_t dst_shift{}; + ARM_COMPUTE_RETURN_ERROR_ON(quantization::calculate_quantized_multiplier(multiplier, &dst_multiplier, &dst_shift)); + } + else + { + if(src->data_type() == DataType::QASYMM8) + { + const bool has_padding = info.pad_stride_info.has_padding(); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!info.exclude_padding && has_padding, "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info"); + } + } + } + else + { + if(src->data_type() == DataType::QASYMM8) + { + // If dst is not configured, the quantization info are the same + const bool has_padding = info.pad_stride_info.has_padding(); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!info.exclude_padding && has_padding, "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info"); + } + } + return Status{}; +} + +void CpuPool2dAssemblyWrapperKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(_kernel_asm.get()); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_UNUSED(window); + ARM_COMPUTE_UNUSED(info); + + ARM_COMPUTE_ERROR_ON(tensors.empty()); + + const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC); + ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); + ITensor *workspace = tensors.get_tensor(TensorType::ACL_INT_0); + + const auto in_ptr = src->buffer() + src->info()->offset_first_element_in_bytes(); + auto out_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes(); + auto working_space = workspace->buffer() + workspace->info()->offset_first_element_in_bytes(); + + const auto src_shape = src->info()->tensor_shape(); + const auto dst_shape = dst->info()->tensor_shape(); + const auto src_padding = src->info()->padding(); + const auto dst_padding = dst->info()->padding(); + + const size_t ld_src_col = src_shape[0] + src_padding.left + src_padding.right; + const size_t ld_src_row = ld_src_col * (src_shape[1] + src_padding.top + src_padding.bottom); + const size_t ld_src_batch = ld_src_row * src_shape[2]; + const size_t ld_dst_col = dst_shape[0] + dst_padding.left + dst_padding.right; + const size_t ld_dst_row = ld_dst_col * (dst_shape[1] + dst_padding.top + dst_padding.bottom); + const size_t ld_dst_batch = ld_dst_row * dst_shape[2]; + + _kernel_asm->execute(in_ptr, ld_src_col, ld_src_row, ld_src_batch, + out_ptr, ld_dst_col, ld_dst_row, ld_dst_batch, + working_space, info.thread_id, info.num_threads); +} + +size_t CpuPool2dAssemblyWrapperKernel::get_working_size(unsigned int num_threads) const +{ + return _kernel_asm->get_working_size(num_threads); +} + +bool CpuPool2dAssemblyWrapperKernel::is_configured() const +{ + return _kernel_asm != nullptr; +} + +template <typename Typesrc, typename Typedst> +void CpuPool2dAssemblyWrapperKernel::create_arm_pooling(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info) +{ + const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) ? arm_conv::pooling::PoolingType::AVERAGE : arm_conv::pooling::PoolingType::MAX; + + arm_conv::pooling::PoolingWindow window{}; + window.cols = static_cast<unsigned int>(info.pool_size.x()); + window.rows = static_cast<unsigned int>(info.pool_size.y()); + + arm_conv::pooling::PoolingStride stride{}; + std::tie(stride.cols, stride.rows) = info.pad_stride_info.stride(); + + const arm_conv::PaddingValues padding{ info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom() }; + + constexpr unsigned int idx_width = 1; + constexpr unsigned int idx_height = 2; + constexpr unsigned int idx_channels = 0; + constexpr unsigned int idx_batches = 3; + + const unsigned int n_batches = src->dimension(idx_batches); + const unsigned int src_rows = src->dimension(idx_height); + const unsigned int src_cols = src->dimension(idx_width); + const unsigned int n_channels = src->dimension(idx_channels); + const unsigned int dst_rows = dst->dimension(idx_height); + const unsigned int dst_cols = dst->dimension(idx_width); + + arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, padding, nullptr); + + // Configure assembly pooling kernel + auto pooling_kernel_asm = arm_conv::pooling::pooling<Typesrc, Typedst>(args); + if(pooling_kernel_asm == nullptr) + { + // Configuration not supported: Leave function unconfigured: + return; + } + + _kernel_asm = std::move(pooling_kernel_asm); +} + +template <typename Typesrc, typename Typedst> +void CpuPool2dAssemblyWrapperKernel::create_arm_pooling_requant(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info) +{ + const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) ? arm_conv::pooling::PoolingType::AVERAGE : arm_conv::pooling::PoolingType::MAX; + + arm_conv::pooling::PoolingWindow window{}; + window.cols = static_cast<unsigned int>(info.pool_size.x()); + window.rows = static_cast<unsigned int>(info.pool_size.y()); + + arm_conv::pooling::PoolingStride stride{}; + std::tie(stride.cols, stride.rows) = info.pad_stride_info.stride(); + + const arm_conv::PaddingValues padding{ info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom() }; + + constexpr unsigned int idx_width = 1; + constexpr unsigned int idx_height = 2; + constexpr unsigned int idx_channels = 0; + constexpr unsigned int idx_batches = 3; + + const unsigned int n_batches = src->dimension(idx_batches); + const unsigned int src_rows = src->dimension(idx_height); + const unsigned int src_cols = src->dimension(idx_width); + const unsigned int n_channels = src->dimension(idx_channels); + const unsigned int dst_rows = dst->dimension(idx_height); + const unsigned int dst_cols = dst->dimension(idx_width); + + arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, padding, nullptr); + + const auto src_qinfo = src->quantization_info().uniform(); + const auto dst_qinfo = dst->quantization_info().uniform(); + + const float multiplier = src_qinfo.scale / dst_qinfo.scale; + int32_t dst_multiplier{}; + int32_t dst_shift{}; + quantization::calculate_quantized_multiplier(multiplier, &dst_multiplier, &dst_shift); + + const arm_conv::pooling::Requantize32 requant_args(src_qinfo.offset, + dst_qinfo.offset, + dst_shift, // left shift + 0, // right shift + dst_multiplier); + + // Configure assembly pooling kernel with requantization + auto pooling_kernel_asm = arm_conv::pooling::pooling<Typesrc, Typedst, arm_conv::pooling::Requantize32>(args, requant_args); + if(pooling_kernel_asm == nullptr) + { + // Configuration not supported: Leave function unconfigured: + return; + } + + _kernel_asm = std::move(pooling_kernel_asm); +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h b/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h new file mode 100644 index 0000000000..ab3ed25b1e --- /dev/null +++ b/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_POOL2D_ASSEMBLY_WRAPPER_KERNEL_H +#define ARM_COMPUTE_CPU_POOL2D_ASSEMBLY_WRAPPER_KERNEL_H + +#include "arm_compute/core/Types.h" +#include "src/core/NEON/kernels/assembly/pooling.hpp" +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuKernel.h" + +#include "pool_common.hpp" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +/** This class is a wrapper for the assembly kernels. + * + * Some kernels were written in assembly and highly optimised for specific + * CPUs like A53 or A55. The arm compute library creates an instance of + * CpuPool2dAssemblyWrapperKernel and other auxiliary data structures to + * execute a single assembly kernel in the context of an NEFunction. + * + */ +class CpuPool2dAssemblyWrapperKernel final : public ICpuKernel +{ +public: + /** Constructor + */ + CpuPool2dAssemblyWrapperKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPool2dAssemblyWrapperKernel); + + const char *name() const override + { + return "CpuPool2dAssemblyWrapperKernel"; + } + + /** Initialise the kernel's src and dst. + * + * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[out] dst Destination tensor info to store the result of pooling. Data types supported: same as @p src. + * @param[in] info Pooling meta-data. + * @param[in] cpu_info CPU information needed to select the most appropriate kernel. + */ + void configure(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info); + + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuPool2dAssemblyWrapperKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + + /** Get size of the workspace needed by the assembly kernel. + * + * @param[in] num_threads Maximum number of threads that are going to be spawned. + * + * @return size of workspace + */ + size_t get_working_size(unsigned int num_threads) const; + + /** Was the asm kernel successfully configured? + * + * @return True if the asm kernel is configured and ready to run + */ + bool is_configured() const; + +private: + /** Helper function to create the assembly kernel. + * + * @param[in] src Source tensor info. + * @param[in] dst Destination tensor info. + * @param[in] info Pooling layer meta-data. + */ + template <typename Typesrc, typename Typedst> + void create_arm_pooling(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info); + + /** Helper function to create the assembly kernel with requantization support + * + * @param[in] src Source tensor info. + * @param[in] dst Destination tensor info. + * @param[in] info Pooling layer meta-data. + */ + template <typename Typesrc, typename Typedst> + void create_arm_pooling_requant(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info); + + std::unique_ptr<arm_conv::pooling::IPoolingCommon> _kernel_asm{ nullptr }; +}; +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_POOL2D_ASSEMBLY_WRAPPER_KERNEL_H */ diff --git a/src/cpu/kernels/pool2d/neon/fp16.cpp b/src/cpu/kernels/pool2d/neon/fp16.cpp new file mode 100644 index 0000000000..534d24ab49 --- /dev/null +++ b/src/cpu/kernels/pool2d/neon/fp16.cpp @@ -0,0 +1,317 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/cpu/kernels/pool2d/neon/list.h" + +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + +namespace arm_compute +{ +namespace cpu +{ +namespace +{ +void pooling2_f16_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +{ + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_step_x = 8; + + Window window_out = window; + window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator in(src, window_src); + Iterator out(dst0, window_out); + Iterator indices(dst1, window_out); + + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + + int pool_stride_x = 0; + int pool_stride_y = 0; + std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); + + const int pad_right = src->info()->padding().right; + const int pad_left = src->info()->padding().left; + const int pad_horizontal = pad_right + pad_left; + const int in_stride_y = static_cast<int>(src->info()->strides_in_bytes().y()); + const int in_stride_z = static_cast<int>(src->info()->strides_in_bytes().z()); + + execute_window_loop(window_out, [&](const Coordinates & id) + { + const int idx_width = id.y() * pool_stride_x; + const int idx_height = id.z() * pool_stride_y; + const int pool_limit_y = pool_pad_top - idx_height; + const int pool_limit_x = pool_pad_left - idx_width; + + const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y); + const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x); + const int in_x0_offset = (pool_start_x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z()); + const int in_x1_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int> + (src->info()->strides_in_bytes().z()); + const int in_x2_offset = (pool_start_x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int> + (src->info()->strides_in_bytes().z()); + const int in_x3_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int> + (src->info()->strides_in_bytes().z()); + + int x_off = window_start_x; + for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) + { + const auto in_x0_ptr = reinterpret_cast<const float16_t *>(in.ptr() + in_x0_offset) + x_off; + const auto in_x1_ptr = reinterpret_cast<const float16_t *>(in.ptr() + in_x1_offset) + x_off; + const auto in_x2_ptr = reinterpret_cast<const float16_t *>(in.ptr() + in_x2_offset) + x_off; + const auto in_x3_ptr = reinterpret_cast<const float16_t *>(in.ptr() + in_x3_offset) + x_off; + const auto v_x0 = vld1q_f16(in_x0_ptr); + const auto v_x1 = vld1q_f16(in_x1_ptr); + const auto v_x2 = vld1q_f16(in_x2_ptr); + const auto v_x3 = vld1q_f16(in_x3_ptr); + float16x8_t vres = vmaxq_f16(vmaxq_f16(v_x2, v_x3), vmaxq_f16(v_x0, v_x1)); + // Store result + vst1q_f16(reinterpret_cast<float16_t *>(out.ptr()) + x_off, vres); + + const uint32_t offset_base = offset_no_padding<float16_t>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NHWC); + const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float16_t) + x_off; + const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float16_t) - pad_horizontal; + const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) - pad_horizontal * src->info()->tensor_shape()[1]; + const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float16_t) - pad_horizontal; + const uint32x4_t voffset_x0_0 = { offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3 }; + const uint32x4_t voffset_x0_1 = { offset_x0 + 4, offset_x0 + 5, offset_x0 + 6, offset_x0 + 7 }; + const uint16x8_t voffset_x0 = vcombine_u16(vmovn_u32(voffset_x0_0), vmovn_u32(voffset_x0_1)); + const uint32x4_t voffset_x1_0 = { offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3 }; + const uint32x4_t voffset_x1_1 = { offset_x1 + 4, offset_x1 + 5, offset_x1 + 6, offset_x1 + 7 }; + const uint16x8_t voffset_x1 = vcombine_u16(vmovn_u32(voffset_x1_0), vmovn_u32(voffset_x1_1)); + const uint32x4_t voffset_x2_0 = { offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3 }; + const uint32x4_t voffset_x2_1 = { offset_x2 + 4, offset_x2 + 5, offset_x2 + 6, offset_x2 + 7 }; + const uint16x8_t voffset_x2 = vcombine_u16(vmovn_u32(voffset_x2_0), vmovn_u32(voffset_x2_1)); + const uint32x4_t voffset_x3_0 = { offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3 }; + const uint32x4_t voffset_x3_1 = { offset_x3 + 4, offset_x3 + 5, offset_x3 + 6, offset_x3 + 7 }; + const uint16x8_t voffset_x3 = vcombine_u16(vmovn_u32(voffset_x3_0), vmovn_u32(voffset_x3_1)); + const uint16x8_t tmp_indices0 = vbslq_u16(vcgeq_f16(v_x0, v_x1), voffset_x0, voffset_x1); + const uint16x8_t tmp_indices1 = vbslq_u16(vcgeq_f16(v_x2, v_x3), voffset_x2, voffset_x3); + const uint16x8_t tmp_indices2 = vbslq_u16(vcgeq_f16(vmaxq_f16(v_x0, v_x1), vmaxq_f16(v_x2, v_x3)), tmp_indices0, tmp_indices1); + const uint32x4_t tmp_indeces3_0 = vmovl_u16(vget_low_u16(tmp_indices2)); + const uint32x4_t tmp_indeces3_1 = vmovl_u16(vget_high_u16(tmp_indices2)); + // Store indicies + vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off, tmp_indeces3_0); + vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr() + 16) + x_off, tmp_indeces3_1); + } + + // Left-overs loop + for(; x_off < window_end_x; ++x_off) + { + const auto x0 = *(reinterpret_cast<const float16_t *>(in.ptr() + in_x0_offset) + x_off); + const auto x1 = *(reinterpret_cast<const float16_t *>(in.ptr() + in_x1_offset) + x_off); + const auto x2 = *(reinterpret_cast<const float16_t *>(in.ptr() + in_x2_offset) + x_off); + const auto x3 = *(reinterpret_cast<const float16_t *>(in.ptr() + in_x3_offset) + x_off); + float16_t res = std::max(std::max(x2, x3), std::max(x0, x1)); + + // Store result + *(reinterpret_cast<float16_t *>(out.ptr()) + x_off) = res; + + const uint32_t offset_base = offset_no_padding<float16_t>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NHWC); + const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float16_t) + x_off; + const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float16_t) - pad_horizontal; + const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) - pad_horizontal * src->info()->tensor_shape()[1]; + const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float16_t) - pad_horizontal; + const uint32_t tmp_idx0 = (x0 >= x1) ? offset_x0 : offset_x1; + const uint32_t tmp_idx1 = (x2 >= x3) ? offset_x2 : offset_x3; + const uint32_t tmp_idx2 = (std::max(x0, x1) >= std::max(x2, x3)) ? tmp_idx0 : tmp_idx1; + + // Store indices + *(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off) = tmp_idx2; + } + }, + in, out, indices); +} +} + +void poolingMxN_fp16_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +{ + if(pool_info.pool_size == Size2D(2, 2) && pool_info.pool_type == PoolingType::MAX && dst1) + { + pooling2_f16_maxpool_indices(src, dst0, dst1, pool_info, window_src, window); + } + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_step_x = 8; + + Window window_out = window; + window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator in(src, window_src); + Iterator out(dst0, window_out); + + const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width; + const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; + std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); + const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom); + + float16x8_t vres; + + execute_window_loop(window_out, [&](const Coordinates & id) + { + const int idx_width = id.y() * pool_stride_x; + const int idx_height = id.z() * pool_stride_y; + const int pool_limit_y = pool_pad_top - idx_height; + const int pool_limit_x = pool_pad_left - idx_width; + + const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y); + const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y); + const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x); + const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x); + + int x_off = window_start_x; + for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) + { + if(pool_info.pool_type != PoolingType::MAX) + { + // Calculate scale + const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, + pool_stride_y); + const float16x8_t scale_v = vdupq_n_f16(scale); + + // Perform pooling + vres = vdupq_n_f16(0.0f); + for(int y = pool_start_y; y < pool_end_y; ++y) + { + for(int x = pool_start_x; x < pool_end_x; ++x) + { + const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int> + (src->info()->strides_in_bytes().z())) + x_off); + + // Get power of 2 in case of l2 pooling and accumulate + if(pool_info.pool_type == PoolingType::L2) + { + vres = vaddq_f16(vres, vmulq_f16(data, data)); + } + else + { + vres = vaddq_f16(vres, data); + } + } + } + // Divide by scale + vres = vmulq_f16(vres, scale_v); + } + else + { + vres = vdupq_n_f16(std::numeric_limits<float>::lowest()); + + for(int y = pool_start_y; y < pool_end_y; ++y) + { + for(int x = pool_start_x; x < pool_end_x; ++x) + { + const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int> + (src->info()->strides_in_bytes().z())) + x_off); + vres = vmaxq_f16(vres, data); + } + } + } + + // Calculate square-root in case of l2 pooling + if(pool_info.pool_type == PoolingType::L2) + { + float16x8_t sqrt_reciprocal = vrsqrteq_f16(vres); + vres = vmulq_f16(vres, vmulq_f16(vrsqrtsq_f16(vmulq_f16(vres, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal)); + } + + // Store result + vst1q_f16(reinterpret_cast<float16_t *>(out.ptr()) + x_off, vres); + } + + // Left-overs loop + for(; x_off < window_end_x; ++x_off) + { + float16_t res = 0.0f; + + if(pool_info.pool_type != PoolingType::MAX) + { + // Calculate scale + const float16_t scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, + pool_stride_y); + + for(int y = pool_start_y; y < pool_end_y; ++y) + { + for(int x = pool_start_x; x < pool_end_x; ++x) + { + const float data = *(reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int> + (src->info()->strides_in_bytes().z())) + x_off); + + // Get power of 2 in case of l2 pooling and accumulate + if(pool_info.pool_type == PoolingType::L2) + { + res += data * data; + } + else + { + res += data; + } + } + } + + // Divide by scale + res *= scale; + } + else + { + res = std::numeric_limits<float>::lowest(); + for(int y = pool_start_y; y < pool_end_y; ++y) + { + for(int x = pool_start_x; x < pool_end_x; ++x) + { + const float16_t data = *(reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int> + (src->info()->strides_in_bytes().z())) + x_off); + res = std::max(res, data); + } + } + } + + // Calculate square-root in case of l2 pooling + if(pool_info.pool_type == PoolingType::L2) + { + res = std::sqrt(res); + } + + // Store result + *(reinterpret_cast<float16_t *>(out.ptr()) + x_off) = res; + } + }, + in, out); +} +} // namespace cpu +} // namespace arm_compute + +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
\ No newline at end of file diff --git a/src/cpu/kernels/pool2d/neon/fp32.cpp b/src/cpu/kernels/pool2d/neon/fp32.cpp new file mode 100644 index 0000000000..26a32ed9d4 --- /dev/null +++ b/src/cpu/kernels/pool2d/neon/fp32.cpp @@ -0,0 +1,314 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/cpu/kernels/pool2d/neon/list.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace +{ +void pooling2_f32_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +{ + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_step_x = 4; + + Window window_out = window; + window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator in(src, window_src); + Iterator out(dst0, window_out); + Iterator indices(dst1, window_out); + + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + + int pool_stride_x = 0; + int pool_stride_y = 0; + std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); + + float32x4_t vres; + float res; + + const int pad_right = src->info()->padding().right; + const int pad_left = src->info()->padding().left; + const int pad_horizontal = pad_right + pad_left; + const int in_stride_y = static_cast<int>(src->info()->strides_in_bytes().y()); + const int in_stride_z = static_cast<int>(src->info()->strides_in_bytes().z()); + + execute_window_loop(window_out, [&](const Coordinates & id) + { + const int idx_width = id.y() * pool_stride_x; + const int idx_height = id.z() * pool_stride_y; + const int pool_limit_y = pool_pad_top - idx_height; + const int pool_limit_x = pool_pad_left - idx_width; + + const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y); + const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x); + + const int in_x0_offset = (pool_start_x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z()); + const int in_x1_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int> + (src->info()->strides_in_bytes().z()); + const int in_x2_offset = (pool_start_x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int> + (src->info()->strides_in_bytes().z()); + const int in_x3_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int> + (src->info()->strides_in_bytes().z()); + + int x_off = window_start_x; + for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) + { + const auto in_x0_ptr = reinterpret_cast<const float *>(in.ptr() + in_x0_offset); + const auto in_x1_ptr = reinterpret_cast<const float *>(in.ptr() + in_x1_offset); + const auto in_x2_ptr = reinterpret_cast<const float *>(in.ptr() + in_x2_offset); + const auto in_x3_ptr = reinterpret_cast<const float *>(in.ptr() + in_x3_offset); + const auto v_x0 = vld1q_f32(in_x0_ptr + x_off); + const auto v_x1 = vld1q_f32(in_x1_ptr + x_off); + const auto v_x2 = vld1q_f32(in_x2_ptr + x_off); + const auto v_x3 = vld1q_f32(in_x3_ptr + x_off); + vres = vmaxq_f32(vmaxq_f32(v_x2, v_x3), vmaxq_f32(v_x0, v_x1)); + // Store result + vst1q_f32(reinterpret_cast<float *>(out.ptr()) + x_off, vres); + + const uint32_t offset_base = offset_no_padding<float>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NHWC); + const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float) + x_off; + const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float) - pad_horizontal; + const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float) - pad_horizontal * src->info()->tensor_shape()[1]; + const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float) - pad_horizontal; + const uint32x4_t voffset_x0 = { offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3 }; + const uint32x4_t voffset_x1 = { offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3 }; + const uint32x4_t voffset_x2 = { offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3 }; + const uint32x4_t voffset_x3 = { offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3 }; + const uint32x4_t tmp_indices0 = vbslq_u32(vcgeq_f32(v_x0, v_x1), voffset_x0, voffset_x1); + const uint32x4_t tmp_indices1 = vbslq_u32(vcgeq_f32(v_x2, v_x3), voffset_x2, voffset_x3); + const uint32x4_t tmp_indices2 = vbslq_u32(vcgeq_f32(vmaxq_f32(v_x0, v_x1), vmaxq_f32(v_x2, v_x3)), tmp_indices0, tmp_indices1); + + // Store indices + vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off, tmp_indices2); + } + + // Left-overs loop + for(; x_off < window_end_x; ++x_off) + { + const auto x0 = *(reinterpret_cast<const float *>(in.ptr() + in_x0_offset) + x_off); + const auto x1 = *(reinterpret_cast<const float *>(in.ptr() + in_x1_offset) + x_off); + const auto x2 = *(reinterpret_cast<const float *>(in.ptr() + in_x2_offset) + x_off); + const auto x3 = *(reinterpret_cast<const float *>(in.ptr() + in_x3_offset) + x_off); + res = std::max(std::max(x2, x3), std::max(x0, x1)); + + // Store result + *(reinterpret_cast<float *>(out.ptr()) + x_off) = res; + + const uint32_t offset_base = offset_no_padding<float>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NHWC); + const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float) + x_off; + const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float) - pad_horizontal; + const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float) - pad_horizontal * src->info()->tensor_shape()[1]; + const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float) - pad_horizontal; + const uint32_t tmp_idx0 = (x0 >= x1) ? offset_x0 : offset_x1; + const uint32_t tmp_idx1 = (x2 >= x3) ? offset_x2 : offset_x3; + const uint32_t tmp_idx2 = (std::max(x0, x1) >= std::max(x2, x3)) ? tmp_idx0 : tmp_idx1; + + // Store indices + *(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off) = tmp_idx2; + } + }, + in, out, indices); +} +} + +void poolingMxN_fp32_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +{ + if(pool_info.pool_size == Size2D(2, 2) && pool_info.pool_type == PoolingType::MAX && dst1) + { + pooling2_f32_maxpool_indices(src, dst0, dst1, pool_info, window_src, window); + } + else + { + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_step_x = 4; + + Window window_out = window; + window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator in(src, window_src); + Iterator out(dst0, window_out); + + const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width; + const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; + std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); + const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom); + + float32x4_t vres; + + execute_window_loop(window_out, [&](const Coordinates & id) + { + const int idx_width = id.y() * pool_stride_x; + const int idx_height = id.z() * pool_stride_y; + const int pool_limit_y = pool_pad_top - idx_height; + const int pool_limit_x = pool_pad_left - idx_width; + + const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y); + const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y); + const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x); + const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x); + + int x_off = window_start_x; + for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) + { + if(pool_info.pool_type != PoolingType::MAX) + { + // Calculate scale + const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, + pool_stride_y); + const float32x4_t scale_v = vdupq_n_f32(scale); + + // Perform pooling + vres = vdupq_n_f32(0.0f); + + for(int y = pool_start_y; y < pool_end_y; ++y) + { + for(int x = pool_start_x; x < pool_end_x; ++x) + { + const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int> + (src->info()->strides_in_bytes().z())) + x_off); + + // Get power of 2 in case of l2 pooling and accumulate + if(pool_info.pool_type == PoolingType::L2) + { + vres = vmlaq_f32(vres, data, data); + } + else + { + vres = vaddq_f32(vres, data); + } + } + } + // Divide by scale + vres = vmulq_f32(vres, scale_v); + } + else + { + vres = vdupq_n_f32(std::numeric_limits<float>::lowest()); + for(int y = pool_start_y; y < pool_end_y; ++y) + { + for(int x = pool_start_x; x < pool_end_x; ++x) + { + const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int> + (src->info()->strides_in_bytes().z())) + x_off); + vres = vmaxq_f32(vres, data); + } + } + } + + // Calculate square-root in case of l2 pooling + if(pool_info.pool_type == PoolingType::L2) + { + float32x4_t l2_res = { static_cast<float>(sqrt(vgetq_lane_f32(vres, 0))), + static_cast<float>(sqrt(vgetq_lane_f32(vres, 1))), + static_cast<float>(sqrt(vgetq_lane_f32(vres, 2))), + static_cast<float>(sqrt(vgetq_lane_f32(vres, 3))) + }; + vres = l2_res; + } + + // Store result + vst1q_f32(reinterpret_cast<float *>(out.ptr()) + x_off, vres); + } + + // Left-overs loop + for(; x_off < window_end_x; ++x_off) + { + float res = 0.0f; + + if(pool_info.pool_type != PoolingType::MAX) + { + // Calculate scale + const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, + pool_stride_y); + + for(int y = pool_start_y; y < pool_end_y; ++y) + { + for(int x = pool_start_x; x < pool_end_x; ++x) + { + const float data = *(reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int> + (src->info()->strides_in_bytes().z())) + x_off); + + // Get power of 2 in case of l2 pooling and accumulate + if(pool_info.pool_type == PoolingType::L2) + { + res += data * data; + } + else + { + res += data; + } + } + } + + // Divide by scale + res *= scale; + } + else + { + res = std::numeric_limits<float>::lowest(); + for(int y = pool_start_y; y < pool_end_y; ++y) + { + for(int x = pool_start_x; x < pool_end_x; ++x) + { + const float data = *(reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int> + (src->info()->strides_in_bytes().z())) + x_off); + res = std::max(res, data); + } + } + } + + // Calculate square-root in case of l2 pooling + if(pool_info.pool_type == PoolingType::L2) + { + res = std::sqrt(res); + } + + // Store result + *(reinterpret_cast<float *>(out.ptr()) + x_off) = res; + } + }, + in, out); + } +} +} // namespace cpu +} // namespace arm_compute
\ No newline at end of file diff --git a/src/cpu/kernels/pool2d/neon/list.h b/src/cpu/kernels/pool2d/neon/list.h new file mode 100644 index 0000000000..b79323213e --- /dev/null +++ b/src/cpu/kernels/pool2d/neon/list.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_NEON_KERNELS_POOLING_LIST_H +#define SRC_CORE_NEON_KERNELS_POOLING_LIST_H + +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/pool2d/neon/quantized.h" +#include <arm_neon.h> + +namespace arm_compute +{ +namespace cpu +{ +#define DECLARE_POOLING_KERNEL(func_name) \ + void func_name(const ITensor *src0, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &, const Window &window_src, const Window &window) + +DECLARE_POOLING_KERNEL(poolingMxN_qasymm8_neon_nhwc); +DECLARE_POOLING_KERNEL(poolingMxN_qasymm8_signed_neon_nhwc); +DECLARE_POOLING_KERNEL(poolingMxN_fp16_neon_nhwc); +DECLARE_POOLING_KERNEL(poolingMxN_fp32_neon_nhwc); + +#if defined(ENABLE_NCHW_KERNELS) + +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) +DECLARE_POOLING_KERNEL(pooling2_fp16_neon_nchw); +DECLARE_POOLING_KERNEL(pooling3_fp16_neon_nchw); +DECLARE_POOLING_KERNEL(poolingMxN_fp16_neon_nchw); +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ + +DECLARE_POOLING_KERNEL(pooling2_fp32_neon_nchw); +DECLARE_POOLING_KERNEL(pooling3_fp32_neon_nchw); +DECLARE_POOLING_KERNEL(pooling7_fp32_neon_nchw); +DECLARE_POOLING_KERNEL(poolingMxN_fp32_neon_nchw); +#endif /* defined(ENABLE_NCHW_KERNELS) */ + +#undef DECLARE_POOLING_KERNEL + +template <typename T> +inline uint32_t offset_no_padding(uint32_t padded_offset, const Coordinates &id, const ITensorInfo &info, int pool_stride_x, int pool_stride_y, DataLayout data_layout) +{ + const int pad_left = info.padding().left; + const int pad_right = info.padding().right; + const int pad_top = info.padding().top; + const int pad_bottom = info.padding().bottom; + const int in_stride_y = static_cast<int>(info.strides_in_bytes().y()); + const int in_stride_w = static_cast<int>(info.strides_in_bytes()[3]); + const int pad_horiz = pad_left + pad_right; + const int pad_vert = pad_top + pad_bottom; + + if(data_layout == DataLayout::NCHW) + { + const uint32_t offset_base = padded_offset + - sizeof(T) * pad_horiz * id.y() * pool_stride_y /* subtract padding elems per row */ + - pad_top * sizeof(T) /* top padding */ + - sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() - pad_vert * in_stride_y * id.z() /* for each Z plane there are height*pad_right padding elems */ + - in_stride_w * id[3]; + + return offset_base; + } + else + { + const uint32_t offset_base = padded_offset + - sizeof(T) * pad_horiz * id.y() * pool_stride_x // subtract padding elems per row + - pad_top * sizeof(T) // top padding + - sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() * pool_stride_y // for each Z plane there are width*pad_right padding elems + - in_stride_w * id[3]; + + return offset_base; + } +} +} // namespace cpu +} // namespace arm_compute + +#endif // SRC_CORE_NEON_KERNELS_POOLING_LIST_H
\ No newline at end of file diff --git a/src/cpu/kernels/pool2d/neon/nchw/all.cpp b/src/cpu/kernels/pool2d/neon/nchw/all.cpp new file mode 100644 index 0000000000..3ca7701087 --- /dev/null +++ b/src/cpu/kernels/pool2d/neon/nchw/all.cpp @@ -0,0 +1,700 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/cpu/kernels/pool2d/neon/list.h" + +#ifdef ENABLE_NCHW_KERNELS +namespace arm_compute +{ +namespace cpu +{ +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +void pooling3_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +{ + ARM_COMPUTE_UNUSED(dst1); + ARM_COMPUTE_UNUSED(pool_info.pool_type); + ARM_COMPUTE_UNUSED(pool_info.exclude_padding); + + Iterator in(src, window_src); + Iterator out(dst0, window); + + constexpr const int pool_size = 3; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; + std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); + const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom); + + const unsigned char *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))); + const unsigned char *const src_middle_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)); + const unsigned char *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2)); + + execute_window_loop(window, [&](const Coordinates & id) + { + float16x4_t top_data = vld1_f16(reinterpret_cast<const float16_t *>(src_top_ptr + in.offset())); + float16x4_t middle_data = vld1_f16(reinterpret_cast<const float16_t *>(src_middle_ptr + in.offset())); + float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(src_bottom_ptr + in.offset())); + float16x4_t res = {}; + + // Get power of 2 in case of l2 pooling + if(pool_info.pool_type == PoolingType::L2) + { + top_data = vmul_f16(top_data, top_data); + middle_data = vmul_f16(middle_data, middle_data); + bottom_data = vmul_f16(bottom_data, bottom_data); + } + + if(pool_info.pool_type != PoolingType::MAX) + { + // Calculate scale + const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, + pool_stride_y); + const float16x4_t scale_v = vdup_n_f16(scale); + // Perform pooling + const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data); + res = vpadd_f16(vset_lane_f16(0.f, sum_data, 3), sum_data); + res = vmul_f16(vpadd_f16(res, res), scale_v); + } + else + { + const float16x4_t max_data = vmax_f16(vmax_f16(top_data, bottom_data), middle_data); + res = vpmax_f16(vset_lane_f16(-std::numeric_limits<float>::max(), max_data, 3), max_data); + res = vpmax_f16(res, res); + } + + // Calculate square-root in case of l2 pooling + if(pool_info.pool_type == PoolingType::L2) + { + res = vinv_f16(vinvsqrt_f16(res)); + } + + *(reinterpret_cast<float16_t *>(out.ptr())) = vget_lane_f16(res, 0); + }, + in, out); +} + +template <typename T> +inline typename std::enable_if<std::is_same<T, float16_t>::value, float32x2_t>::type +f16_to_f32(float16x4_t in) +{ + float32x2_t out = { static_cast<float>(vget_lane_f16(in, 0)), static_cast<float>(vget_lane_f16(in, 1)) }; + return out; +} +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + +template <typename T> +inline typename std::enable_if<std::is_same<T, float>::value, float32x2_t>::type +f16_to_f32(float32x2_t in) +{ + return in; +} + +template <typename T> +void pooling2_nchw_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +{ + Iterator in(src, window_src); + Iterator out(dst0, window); + Iterator indices(dst1, window); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + int pool_stride_x = 0; + int pool_stride_y = 0; + std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); + const uint8_t *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))); + const uint8_t *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)); + const int pad_left = src->info()->padding().left; + const int pad_right = src->info()->padding().right; + const int in_stride_y = static_cast<int>(src->info()->strides_in_bytes().y()); + + execute_window_loop(window, [&](const Coordinates & id) + { + auto top_data = wrapper::vload(reinterpret_cast<const T *>(src_top_ptr + in.offset())); + auto bottom_data = wrapper::vload(reinterpret_cast<const T *>(src_bottom_ptr + in.offset())); + float32x2_t top_data_f32 = f16_to_f32<T>(top_data); + float32x2_t bottom_data_f32 = f16_to_f32<T>(bottom_data); + + // Calculate max data, compare top first, then bottom, to make sue the first max is recorded. + const float32x2_t max_data_top = vpmax_f32(top_data_f32, top_data_f32); + const float32x2_t max_data_bottom = vpmax_f32(bottom_data_f32, bottom_data_f32); + const float32x2_t max_data = vmax_f32(max_data_top, max_data_bottom); + *(reinterpret_cast<T *>(out.ptr())) = static_cast<T>(vget_lane_f32(max_data, 0)); + + // Calculate max data indice, which will be used in max unpool. + const uint32_t offset_base = offset_no_padding<T>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NCHW); + const uint32_t offset_top = (uint32_t)(offset_base / sizeof(T)); + const uint32_t offset_bottom = offset_top + in_stride_y / sizeof(T) - pad_right - pad_left; + const uint32x2_t voffset_top = { offset_top, offset_top + 1u }; + const uint32x2_t voffset_bottom = { offset_bottom, offset_bottom + 1u }; + const uint32x2_t tmp_indices_top = vbsl_u32(vcge_f32(top_data_f32, vrev64_f32(top_data_f32)), voffset_top, vrev64_u32(voffset_top)); + const uint32x2_t tmp_indices_bottom = vbsl_u32(vcge_f32(bottom_data_f32, vrev64_f32(bottom_data_f32)), voffset_bottom, vrev64_u32(voffset_bottom)); + *(reinterpret_cast<int *>(indices.ptr())) = vget_lane_u32(vbsl_u32(vcge_f32(max_data_top, max_data_bottom), tmp_indices_top, tmp_indices_bottom), 0); + }, + in, out, indices); +} + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +void pooling2_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +{ + if(pool_info.pool_type == PoolingType::MAX && dst1) + { + pooling2_nchw_maxpool_indices<float16_t>(src, dst0, dst1, pool_info, window_src, window); + } + else + { + Iterator in(src, window_src); + Iterator out(dst0, window); + constexpr int pool_size = 2; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + int pool_stride_x, pool_stride_y = 0; + std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); + const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom); + + const unsigned char *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))); + const unsigned char *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)); + + execute_window_loop(window, [&](const Coordinates & id) + { + float16x4_t top_data = vld1_f16(reinterpret_cast<const float16_t *>(src_top_ptr + in.offset())); + float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(src_bottom_ptr + in.offset())); + float16x4_t res = {}; + + // Get power of 2 in case of l2 pooling + if(pool_info.pool_type == PoolingType::L2) + { + top_data = vmul_f16(top_data, top_data); + bottom_data = vmul_f16(bottom_data, bottom_data); + } + + if(pool_info.pool_type != PoolingType::MAX) + { + const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, + pool_stride_y); + const float16x4_t scale_v = vdup_n_f16(scale); + + const float16x4_t sum_data = vadd_f16(top_data, bottom_data); + res = vmul_f16(vpadd_f16(sum_data, sum_data), scale_v); + } + else + { + const float16x4_t max_data = vmax_f16(top_data, bottom_data); + res = vpmax_f16(max_data, max_data); + } + + // Calculate square-root in case of l2 pooling + if(pool_info.pool_type == PoolingType::L2) + { + res = vinv_f16(vinvsqrt_f16(res)); + } + + // Store result + *(reinterpret_cast<float16_t *>(out.ptr())) = vget_lane_f16(res, 0); + }, + in, out); + } +} + +void poolingMxN_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +{ + ARM_COMPUTE_UNUSED(dst1); + Iterator in(src, window_src); + Iterator out(dst0, window); + + const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width; + const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; + std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); + const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom); + + execute_window_loop(window, [&](const Coordinates & id) + { + float16_t res = 0.0f; + float16x8_t vres = vdupq_n_f16(0.0f); + + if(pool_info.pool_type != PoolingType::MAX) + { + // Calculate scale + const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, + pool_stride_y); + + // Perform pooling + + for(int y = 0; y < pool_size_y; ++y) + { + int x = 0; + for(; x <= (pool_size_x - 8); x += 8) + { + const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int> + (src->info()->strides_in_bytes().y()))); + + // Get power of 2 in case of l2 pooling and accumulate + if(pool_info.pool_type == PoolingType::L2) + { + vres = vaddq_f16(vres, vmulq_f16(data, data)); + } + else + { + vres = vaddq_f16(vres, data); + } + } + + // Leftover for loop + for(; x < pool_size_x; ++x) + { + float16_t data = *(reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) + + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y()))); + + // Get power of 2 in case of l2 pooling + if(pool_info.pool_type == PoolingType::L2) + { + data *= data; + } + + res += data; + } + } + + // Reduction + float16x4_t tmp = vpadd_f16(vget_high_f16(vres), vget_low_f16(vres)); + res += vget_lane_f16(tmp, 0); + res += vget_lane_f16(tmp, 1); + res += vget_lane_f16(tmp, 2); + res += vget_lane_f16(tmp, 3); + + // Divide by scale + res *= scale; + } + else + { + float16x8_t vres = vdupq_n_f16(std::numeric_limits<float>::lowest()); + res = std::numeric_limits<float>::lowest(); + + for(int y = 0; y < pool_size_y; ++y) + { + int x = 0; + for(; x <= (pool_size_x - 8); x += 8) + { + const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int> + (src->info()->strides_in_bytes().y()))); + vres = vmaxq_f16(vres, data); + } + + // Leftover for loop + for(; x < pool_size_x; ++x) + { + const float16_t data = *(reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) + + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y()))); + res = std::max(res, data); + } + } + + float16x4_t tmp = vpmax_f16(vget_high_f16(vres), vget_low_f16(vres)); + res = std::max(res, vget_lane_f16(tmp, 0)); + res = std::max(res, vget_lane_f16(tmp, 1)); + res = std::max(res, vget_lane_f16(tmp, 2)); + res = std::max(res, vget_lane_f16(tmp, 3)); + } + + // Calculate square-root in case of l2 pooling + if(pool_info.pool_type == PoolingType::L2) + { + res = std::sqrt(res); + } + + // Store result + *(reinterpret_cast<float16_t *>(out.ptr())) = res; + }, + in, out); +} +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + +void poolingMxN_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +{ + ARM_COMPUTE_UNUSED(dst1); + Iterator in(src, window_src); + Iterator out(dst0, window); + + const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width; + const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; + std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); + const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom); + + execute_window_loop(window, [&](const Coordinates & id) + { + float res = 0.0f; + + if(pool_info.pool_type != PoolingType::MAX) + { + // Calculate scale + const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, + pool_stride_y); + + // Perform pooling + float32x4_t vres = vdupq_n_f32(0.0f); + + for(int y = 0; y < pool_size_y; ++y) + { + int x = 0; + for(; x <= (pool_size_x - 4); x += 4) + { + const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int> + (src->info()->strides_in_bytes().y()))); + + // Get power of 2 in case of l2 pooling and accumulate + if(pool_info.pool_type == PoolingType::L2) + { + vres = vmlaq_f32(vres, data, data); + } + else + { + vres = vaddq_f32(vres, data); + } + } + + // Leftover for loop + for(; x < pool_size_x; ++x) + { + float data = *(reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int> + (src->info()->strides_in_bytes().y()))); + + // Get power of 2 in case of l2 pooling + if(pool_info.pool_type == PoolingType::L2) + { + data *= data; + } + + res += data; + } + } + +#if defined(__aarch64__) + // Reduction operation available on 64 bit architectures only + res += vaddvq_f32(vres); +#else // __aarch64__ + // Reduction + float32x2_t tmp = vpadd_f32(vget_high_f32(vres), vget_low_f32(vres)); + tmp = vpadd_f32(tmp, tmp); + + res += vget_lane_f32(tmp, 0); +#endif // __aarch64__ + // Divide by scale + res *= scale; + } + else + { + float32x4_t vres = vdupq_n_f32(std::numeric_limits<float>::lowest()); + res = std::numeric_limits<float>::lowest(); + + for(int y = 0; y < pool_size_y; ++y) + { + int x = 0; + for(; x <= (pool_size_x - 4); x += 4) + { + const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int> + (src->info()->strides_in_bytes().y()))); + vres = vmaxq_f32(vres, data); + } + + // Leftover for loop + for(; x < pool_size_x; ++x) + { + const float data = *(reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int> + (src->info()->strides_in_bytes().y()))); + res = std::max(res, data); + } + } +#if defined(__aarch64__) + // Reduction operation available on 64 bit architectures only + res = std::max(vmaxvq_f32(vres), res); +#else // __aarch64__ + float32x2_t tmp = vpmax_f32(vget_high_f32(vres), vget_low_f32(vres)); + tmp = vpmax_f32(tmp, tmp); + + res = std::max(res, vget_lane_f32(tmp, 0)); +#endif // __aarch64__ + } + + // Calculate square-root in case of l2 pooling + if(pool_info.pool_type == PoolingType::L2) + { + res = std::sqrt(res); + } + + // Store result + *(reinterpret_cast<float *>(out.ptr())) = res; + }, + in, out); +} + +void pooling2_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +{ + if(pool_info.pool_type == PoolingType::MAX && dst1) + { + pooling2_nchw_maxpool_indices<float>(src, dst0, dst1, pool_info, window_src, window); + } + else + { + Iterator in(src, window_src); + Iterator out(dst0, window); + constexpr int pool_size = 2; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; + std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); + const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom); + + const uint8_t *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))); + const uint8_t *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)); + + execute_window_loop(window, [&](const Coordinates & id) + { + const auto in_top_ptr = reinterpret_cast<const float *>(src_top_ptr + in.offset()); + const auto in_bottom_ptr = reinterpret_cast<const float *>(src_bottom_ptr + in.offset()); + float32x2_t top_data = vld1_f32(in_top_ptr); + float32x2_t bottom_data = vld1_f32(in_bottom_ptr); + float32x2_t res = {}; + float final_res = 0; + // Get power of 2 in case of l2 pooling + if(pool_info.pool_type == PoolingType::L2) + { + top_data = vmul_f32(top_data, top_data); + bottom_data = vmul_f32(bottom_data, bottom_data); + } + + if(pool_info.pool_type != PoolingType::MAX) + { + // Calculate scale + float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, + pool_stride_y); + const float32x2_t scale_v = vdup_n_f32(scale); + + // Perform pooling + const float32x2_t sum_data = vadd_f32(top_data, bottom_data); + res = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v); + } + else + { + const float32x2_t max_data = vmax_f32(top_data, bottom_data); + res = vpmax_f32(max_data, max_data); + } + final_res = vget_lane_f32(res, 0); + + // Calculate square-root in case of l2 pooling + if(pool_info.pool_type == PoolingType::L2) + { + final_res = sqrt(final_res); + } + + // Store result + *(reinterpret_cast<float *>(out.ptr())) = final_res; + }, + in, out); + } +} + +void pooling3_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +{ + ARM_COMPUTE_UNUSED(dst1); + Iterator in(src, window_src); + Iterator out(dst0, window); + + constexpr const int pool_size = 3; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; + std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); + const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom); + + const uint8_t *const src_top_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))); + const uint8_t *const src_middle_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)); + const uint8_t *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2)); + + execute_window_loop(window, [&](const Coordinates & id) + { + float32x4_t top_data = vld1q_f32(reinterpret_cast<const float *>(src_top_ptr + in.offset())); + float32x4_t middle_data = vld1q_f32(reinterpret_cast<const float *>(src_middle_ptr + in.offset())); + float32x4_t bottom_data = vld1q_f32(reinterpret_cast<const float *>(src_bottom_ptr + in.offset())); + float32x2_t res = {}; + float final_res = 0; + + // Get power of 2 in case of l2 pooling + if(pool_info.pool_type == PoolingType::L2) + { + top_data = vmulq_f32(top_data, top_data); + middle_data = vmulq_f32(middle_data, middle_data); + bottom_data = vmulq_f32(bottom_data, bottom_data); + } + + if(pool_info.pool_type != PoolingType::MAX) + { + // Calculate scale + float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, + pool_stride_y); + const float32x2_t scale_v = vdup_n_f32(scale); + + // Perform pooling + const float32x4_t sum_data = vaddq_f32(vaddq_f32(top_data, bottom_data), middle_data); + res = vpadd_f32(vget_high_f32(vsetq_lane_f32(0.f, sum_data, 3)), vget_low_f32(sum_data)); + res = vmul_f32(vpadd_f32(res, res), scale_v); + } + else + { + const float32x4_t max_data = vmaxq_f32(vmaxq_f32(top_data, bottom_data), middle_data); + res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data, 3)), vget_low_f32(max_data)); + res = vpmax_f32(res, res); + } + final_res = vget_lane_f32(res, 0); + + // Calculate square-root in case of l2 pooling + if(pool_info.pool_type == PoolingType::L2) + { + final_res = sqrt(final_res); + } + + // Store result + *(reinterpret_cast<float *>(out.ptr())) = final_res; + }, + in, out); +} + +void pooling7_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +{ + ARM_COMPUTE_UNUSED(dst1); + Iterator in(src, window_src); + Iterator out(dst0, window); + + constexpr const int pool_size = 7; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; + std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); + const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom); + + std::array<const uint8_t *, pool_size> src_ptrs{ {} }; + for(int i = 0; i < pool_size; ++i) + { + src_ptrs[i] = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + i)); + } + + execute_window_loop(window, [&](const Coordinates & id) + { + float32x2_t res = {}; + float final_res = 0.f; + if(pool_info.pool_type != PoolingType::MAX) + { + // Calculate scale + float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, + pool_stride_y); + const float32x2_t scale_v = vdup_n_f32(scale); + + // Perform pooling + float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(src_ptrs[0] + in.offset())); + // Get power of 2 in case of l2 pooling + if(pool_info.pool_type == PoolingType::L2) + { + data.val[0] = vmulq_f32(data.val[0], data.val[0]); + data.val[1] = vmulq_f32(data.val[1], data.val[1]); + } + float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3)); + for(int i = 1; i < pool_size; ++i) + { + data = vld2q_f32(reinterpret_cast<const float *>(src_ptrs[i] + in.offset())); + // Get power of 2 in case of l2 pooling + if(pool_info.pool_type == PoolingType::L2) + { + data.val[0] = vmulq_f32(data.val[0], data.val[0]); + data.val[1] = vmulq_f32(data.val[1], data.val[1]); + } + sum_data = vaddq_f32(sum_data, data.val[0]); + sum_data = vaddq_f32(sum_data, vsetq_lane_f32(0.f, data.val[1], 3)); + } + res = vpadd_f32(vget_high_f32(sum_data), vget_low_f32(sum_data)); + res = vmul_f32(vpadd_f32(res, res), scale_v); + } + else + { + float32x4x2_t max_data = vld2q_f32(reinterpret_cast<const float *>(src_ptrs[0] + in.offset())); + for(int i = 1; i < pool_size; ++i) + { + const float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(src_ptrs[i] + in.offset())); + max_data = vmax2q_f32(max_data, data); + } + res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data.val[1], 3)), vget_low_f32(max_data.val[1])); + res = vpmax_f32(res, vpmax_f32(vget_high_f32(max_data.val[0]), vget_low_f32(max_data.val[0]))); + res = vpmax_f32(res, res); + } + final_res = vget_lane_f32(res, 0); + + // Calculate square-root in case of l2 pooling + if(pool_info.pool_type == PoolingType::L2) + { + final_res = sqrt(final_res); + } + + // Store result + *(reinterpret_cast<float *>(out.ptr())) = final_res; + }, + in, out); +} +} // namespace cpu +} // namespace arm_compute + +#endif // ENABLE_NCHW_KERNELS
\ No newline at end of file diff --git a/src/cpu/kernels/pool2d/neon/qasymm8.cpp b/src/cpu/kernels/pool2d/neon/qasymm8.cpp new file mode 100644 index 0000000000..7f8841edd8 --- /dev/null +++ b/src/cpu/kernels/pool2d/neon/qasymm8.cpp @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/cpu/kernels/pool2d/neon/list.h" + +namespace arm_compute +{ +namespace cpu +{ +void poolingMxN_qasymm8_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +{ + poolingMxN_q8_neon_nhwc<uint8_t>(src, dst0, dst1, pool_info, window_src, window); +} +} // namespace cpu +} // namespace arm_compute
\ No newline at end of file diff --git a/src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp b/src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp new file mode 100644 index 0000000000..8643651f27 --- /dev/null +++ b/src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/cpu/kernels/pool2d/neon/list.h" + +namespace arm_compute +{ +namespace cpu +{ +void poolingMxN_qasymm8_signed_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +{ + poolingMxN_q8_neon_nhwc<int8_t>(src, dst0, dst1, pool_info, window_src, window); +} +} // namespace cpu +} // namespace arm_compute
\ No newline at end of file diff --git a/src/cpu/kernels/pool2d/neon/quantized.h b/src/cpu/kernels/pool2d/neon/quantized.h new file mode 100644 index 0000000000..a16960a205 --- /dev/null +++ b/src/cpu/kernels/pool2d/neon/quantized.h @@ -0,0 +1,863 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_NEON_KERNELS_QUANTIZED_H +#define SRC_CORE_NEON_KERNELS_QUANTIZED_H + +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include <arm_neon.h> + +namespace arm_compute +{ +namespace cpu +{ +template <typename T> +inline typename std::enable_if<std::is_same<T, int8_t>::value, int8_t>::type +quantize(float val, const UniformQuantizationInfo &info) +{ + return quantize_qasymm8_signed(val, info); +} + +template <typename T> +inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8_t>::type +quantize(float val, const UniformQuantizationInfo &info) +{ + return quantize_qasymm8(val, info); +} + +template <typename T> +inline T vcvtq_q32_f32(float32x4_t values); + +template <> +inline uint32x4_t vcvtq_q32_f32(float32x4_t values) +{ + return vcvtq_u32_f32(values); +} + +template <> +inline int32x4_t vcvtq_q32_f32(float32x4_t values) +{ + return vcvtq_s32_f32(values); +} + +template <typename T> +inline float32x4_t vcvtq_f32_q32(T values); + +template <> +inline float32x4_t vcvtq_f32_q32(uint32x4_t values) +{ + return vcvtq_f32_u32(values); +} + +template <> +inline float32x4_t vcvtq_f32_q32(int32x4_t values) +{ + return vcvtq_f32_s32(values); +} + +template <typename Tout> +inline Tout vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset); + +template <> +inline uint8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset) +{ + const float new_scale = quant_rescale / scale_pooling; + return vquantize(acc, UniformQuantizationInfo(new_scale, new_offset)); +} + +template <> +inline int8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset) +{ + const float new_scale = quant_rescale / scale_pooling; + return vquantize_signed(acc, UniformQuantizationInfo(new_scale, new_offset)); +} + +template <typename Tin, typename Tout> +inline Tout vrequantize_pooling(Tin vec1, Tin vec2, const UniformQuantizationInfo &requant_qinfo); + +template <> +inline uint8x16_t vrequantize_pooling(uint8x8_t vec1, uint8x8_t vec2, const UniformQuantizationInfo &requant_qinfo) +{ + const float32x4x4_t acc = + { + { + vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec1))))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec1))))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec2))))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec2))))), + } + }; + return vquantize(acc, requant_qinfo); +} + +template <> +inline int8x16_t vrequantize_pooling(int8x8_t vec1, int8x8_t vec2, const UniformQuantizationInfo &requant_qinfo) +{ + const float32x4x4_t acc = + { + { + vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec1))))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec1))))), + vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec2))))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec2))))), + } + }; + return vquantize_signed(acc, requant_qinfo); +} + +template <typename T> +inline T vrequantize_pooling(T &vec, const UniformQuantizationInfo &requant_qinfo); + +template <> +inline uint8x8_t vrequantize_pooling(uint8x8_t &vec, const UniformQuantizationInfo &requant_qinfo) +{ + const float32x4x2_t acc = + { + { + vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec))))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec))))), + } + }; + return vquantize(acc, requant_qinfo); +} + +template <> +inline int8x8_t vrequantize_pooling(int8x8_t &vec, const UniformQuantizationInfo &requant_qinfo) +{ + const float32x4x2_t acc = + { + { + vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec))))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec))))), + } + }; + return vquantize_signed(acc, requant_qinfo); +} + +inline float calculate_avg_scale(bool exclude_padding, DataLayout data_layout, const Coordinates &id, const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h, + const int pad_x, const int pad_y, const int stride_x, const int stride_y) +{ + const unsigned int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const unsigned int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + + int start_x = id[idx_width] * stride_x - pad_x; + int start_y = id[idx_height] * stride_y - pad_y; + + const int end_x = std::min(start_x + pool_size_x, upper_bound_w); + const int end_y = std::min(start_y + pool_size_y, upper_bound_h); + if(exclude_padding) + { + start_x = std::max(0, start_x); + start_y = std::max(0, start_y); + } + return 1.f / ((end_y - start_y) * (end_x - start_x)); +} + +template <typename T> +void poolingMxN_q8_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +{ + ARM_COMPUTE_UNUSED(dst1); + + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_step_x = 16; + const int window_half_step_x = window_step_x / 2; + + Window window_out = window; + window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator in(src, window_src); + Iterator out(dst0, window_out); + + using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type; + using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type; + using q16_t = typename wrapper::traits::promote_t<T>; + using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type; + using q32_t = typename wrapper::traits::promote_t<q16_t>; + using q32x4_t = typename wrapper::traits::neon_vector<q32_t, 4>::type; + + const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width; + const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + + int pool_stride_x = 0; + int pool_stride_y = 0; + std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); + const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom); + + const float32x4_t half_scale_v = vdupq_n_f32(0.5f); + const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo dst_qinfo = dst0->info()->quantization_info().uniform(); + + const float quant_rescale = dst_qinfo.scale / src_qinfo.scale; + // "new_offset" doesn't have to consider the "half_scale_v" in its computation + // With a requantization performed in a single step there won't be uncertainties introduced + const int32_t new_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / quant_rescale); + + const float requant_scale = dst_qinfo.scale / src_qinfo.scale; + const int32_t requant_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale); + const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset); + + execute_window_loop(window_out, [&](const Coordinates & id) + { + const int idx_width = id.y() * pool_stride_x; + const int idx_height = id.z() * pool_stride_y; + const int pool_limit_y = pool_pad_top - idx_height; + const int pool_limit_x = pool_pad_left - idx_width; + + const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y); + const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y); + const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x); + const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x); + + int x_off = window_start_x; + for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) + { + if(pool_info.pool_type != PoolingType::MAX) + { + q32x4_t vres1 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{}); + q32x4_t vres2 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{}); + q32x4_t vres3 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{}); + q32x4_t vres4 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{}); + + // Calculate scale + const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, + pool_stride_y); + + // Perform pooling + for(int y = pool_start_y; y < pool_end_y; ++y) + { + for(int x = pool_start_x; x < pool_end_x; ++x) + { + const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int> + (src->info()->strides_in_bytes().z())) + x_off); + + const q16x8_t data_q16 = wrapper::vmovl(wrapper::vgetlow(data)); + const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data)); + vres1 = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16))); + vres2 = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16))); + vres3 = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16))); + vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16))); + } + } + + if(src_qinfo != dst_qinfo) + { + const float32x4x4_t vres = + { + { + vcvtq_f32_q32(vres1), + vcvtq_f32_q32(vres2), + vcvtq_f32_q32(vres3), + vcvtq_f32_q32(vres4), + } + }; + const auto requantized_dst = vrequantize_pooling_with_scale<q8x16_t>(vres, quant_rescale, scale, new_offset); + // Store result + wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, wrapper::vgetlow(requantized_dst)); + wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, wrapper::vgethigh(requantized_dst)); + } + else + { + const float32x4_t scale_v = vdupq_n_f32(scale); + // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero + vres1 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v)); + vres2 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v)); + vres3 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v)); + vres4 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v)); + + const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2))); + const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4))); + // Store result + wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, res1); + wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, res2); + } + } + else + { + q8x16_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_128_tag{}); + + for(int y = pool_start_y; y < pool_end_y; ++y) + { + for(int x = pool_start_x; x < pool_end_x; ++x) + { + const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int> + (src->info()->strides_in_bytes().z())) + x_off); + vres = wrapper::vmax(vres, data); + } + } + + // Store result + wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(vres), wrapper::vgethigh(vres), + requant_qinfo) : + vres); + } + } + + if(pool_info.pool_type == PoolingType::MAX) + { + for(; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x) + { + q8x8_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_64_tag{}); + for(int y = pool_start_y; y < pool_end_y; ++y) + { + for(int x = pool_start_x; x < pool_end_x; ++x) + { + const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int> + (src->info()->strides_in_bytes().z())) + x_off); + vres = wrapper::vmax(vres, data); + } + } + + // Store result + wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, + (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t>(vres, requant_qinfo) : vres); + } + } + + // Left-overs loop + for(; x_off < window_end_x; ++x_off) + { + if(pool_info.pool_type != PoolingType::MAX) + { + q32_t res = static_cast<q32_t>(0.f); + + // Calculate scale + const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, + pool_stride_y); + + // Perform pooling + for(int y = pool_start_y; y < pool_end_y; ++y) + { + for(int x = pool_start_x; x < pool_end_x; ++x) + { + const T data = *(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int> + (src->info()->strides_in_bytes().z())) + x_off); + res += data; + } + } + + if(src_qinfo != dst_qinfo) + { + const float res_f = static_cast<float>(res); + const float new_scale = quant_rescale / scale; + const auto requantized_dst = quantize<T>(res_f, UniformQuantizationInfo(new_scale, new_offset)); + + // Store result + *(reinterpret_cast<T *>(out.ptr()) + x_off) = requantized_dst; + } + else + { + // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero + res = static_cast<T>(0.5f + static_cast<float>(res) * scale); + + // Store result + *(reinterpret_cast<T *>(out.ptr()) + x_off) = res; + } + } + else + { + T res = std::numeric_limits<T>::min(); + + for(int y = pool_start_y; y < pool_end_y; ++y) + { + for(int x = pool_start_x; x < pool_end_x; ++x) + { + const T data = *(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int> + (src->info()->strides_in_bytes().z())) + x_off); + res = std::max(res, data); + } + } + + // Store result + if(src_qinfo != dst_qinfo) + { + const float res_f = static_cast<float>(res); + *(reinterpret_cast<T *>(out.ptr()) + x_off) = quantize<T>(res_f, requant_qinfo); + } + else + { + *(reinterpret_cast<T *>(out.ptr()) + x_off) = res; + } + } + } + + }, + in, out); +} + +#if defined(ENABLE_NCHW_KERNELS) +template <typename T, typename TVec> +inline void scale_vector_q16x8(bool exclude_padding, TVec &v, const Coordinates &id, int id_offset, int step, + const int pool_size, const int upper_bound_w, const int upper_bound_h, + const int pad_x, const int pad_y, const int stride_x, const int stride_y) +{ + int start_x = (id.x() + id_offset) * stride_x - pad_x; + int start_y = id.y() * stride_y - pad_y; + const int end_y = std::min(start_y + pool_size, upper_bound_h); + if(exclude_padding) + { + start_y = std::max(0, start_y); + } + + std::array<T, 8> elems = + { + { + wrapper::vgetlane(v, 0), + wrapper::vgetlane(v, 1), + wrapper::vgetlane(v, 2), + wrapper::vgetlane(v, 3), + wrapper::vgetlane(v, 4), + wrapper::vgetlane(v, 5), + wrapper::vgetlane(v, 6), + wrapper::vgetlane(v, 7), + } + }; + + for(auto &el : elems) + { + int c_start_x = start_x; + const int end_x = std::min(c_start_x + pool_size, upper_bound_w); + if(exclude_padding) + { + c_start_x = std::max(0, c_start_x); + } + float scale = 1.f / ((end_y - start_y) * (end_x - c_start_x)); + el *= scale; + start_x += step * stride_x; + } + + v = wrapper::vsetlane(elems[0], v, 0); + v = wrapper::vsetlane(elems[1], v, 1); + v = wrapper::vsetlane(elems[2], v, 2); + v = wrapper::vsetlane(elems[3], v, 3); + v = wrapper::vsetlane(elems[4], v, 4); + v = wrapper::vsetlane(elems[5], v, 5); + v = wrapper::vsetlane(elems[6], v, 6); + v = wrapper::vsetlane(elems[7], v, 7); +} + +template <typename T> +void pooling2_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +{ + ARM_COMPUTE_UNUSED(dst1); + Iterator in(src, window_src); + Iterator out(dst0, window); + + /** SIMD vector types */ + using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type; + using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type; + using q8x8x2_t = typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x2_t, int8x8x2_t>::type; + using q16_t = typename wrapper::traits::promote_t<T>; + using q16x4_t = typename wrapper::traits::neon_vector<q16_t, 4>::type; + using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type; + using q16x8x2_t = typename wrapper::traits::neon_vector<q16_t, 16>::type; + + constexpr int pool_size = 2; + int pool_stride_x = 0; + int pool_stride_y = 0; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); + const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom); + + const T *const src_top_ptr = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)))); + const T *const src_bottom_ptr = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1))); + + const int scale_step_x = (pool_stride_x == 1) ? 2 : 1; + + const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo dst_qinfo = dst0->info()->quantization_info().uniform(); + const bool have_different_qinfo = src_qinfo != dst_qinfo; + + const float requant_scale = dst_qinfo.scale / src_qinfo.scale; + const int32_t requant_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale); + const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset); + + execute_window_loop(window, [&](const Coordinates & id) + { + const auto top_data = wrapper::vloadq(src_top_ptr + in.offset()); + const auto bottom_data = wrapper::vloadq(src_bottom_ptr + in.offset()); + q8x8_t lower_res = {}; + q8x8_t upper_res = {}; + + if(pool_info.pool_type != PoolingType::MAX) + { + const q16x8x2_t top_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data)) } }; + const q16x8x2_t bottom_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data)) } }; + + // Add rows + const q16x8x2_t vrsum = + { + { + wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]), + wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]), + } + }; + + // Pair-wise add row data + const q16x4_t vpsum_1 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[0]), wrapper::vgethigh(vrsum.val[0])); + const q16x4_t vpsum_2 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[1]), wrapper::vgethigh(vrsum.val[1])); + + q16x8_t res_lower = wrapper::vcombine(vpsum_1, vpsum_2); + + // Scale lower result + scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res_lower, id, 0, scale_step_x, + pool_size, upper_bound_w, upper_bound_h, + pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + lower_res = wrapper::vmovn(res_lower); + + // Compute upper result for stride_x == 1 + if(pool_stride_x == 1) + { + // Shifted row sum + const q16x8x2_t vrsum_shifted = + { + { + wrapper::vext_1(vrsum.val[0], vrsum.val[1]), + wrapper::vext_1(vrsum.val[1], vrsum.val[1]) + } + }; + + // Pair-wise add shifted row + q16x8_t res_upper = wrapper::vcombine( + wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[0]), wrapper::vgethigh(vrsum_shifted.val[0])), + wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[1]), wrapper::vgethigh(vrsum_shifted.val[1]))); + + // Scale upper result + scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res_upper, id, 1, 2, + pool_size, upper_bound_w, upper_bound_h, + pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + upper_res = wrapper::vmovn(res_upper); + } + } + else + { + const q8x16_t max_data = wrapper::vmax(top_data, bottom_data); + lower_res = wrapper::vpmax(wrapper::vgetlow(max_data), wrapper::vgethigh(max_data)); + if(pool_stride_x == 1) + { + const q8x16_t max_data_shifted = wrapper::vext_1(max_data, max_data); + upper_res = wrapper::vpmax(wrapper::vgetlow(max_data_shifted), wrapper::vgethigh(max_data_shifted)); + } + } + + if(have_different_qinfo) + { + const auto requantized_dst = vrequantize_pooling<q8x8_t, q8x16_t>(lower_res, upper_res, requant_qinfo); + lower_res = wrapper::vgetlow(requantized_dst); + upper_res = wrapper::vgethigh(requantized_dst); + } + + // Store result + if(pool_stride_x == 1) + { + const q8x8x2_t res = { { lower_res, upper_res } }; + wrapper::vstore(reinterpret_cast<T *>(out.ptr()), res); + } + else + { + wrapper::vstore(reinterpret_cast<T *>(out.ptr()), lower_res); + } + }, + in, out); +} + +template <typename T> +void pooling3_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +{ + ARM_COMPUTE_UNUSED(dst1); + Iterator in(src, window_src); + Iterator out(dst0, window); + + /** SIMD vector types */ + using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type; + using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type; + using q8x8x2_t = typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x2_t, int8x8x2_t>::type; + using q16_t = typename wrapper::traits::promote_t<T>; + using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type; + using q16x8x2_t = typename wrapper::traits::neon_vector<q16_t, 16>::type; + + constexpr int pool_size = 3; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; + std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); + const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom); + + const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo &dst_qinfo = dst0->info()->quantization_info().uniform(); + + const float requant_scale = dst_qinfo.scale / src_qinfo.scale; + const int32_t requant_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale); + const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset); + + const T *const src_top_ptr = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)))); + const T *const src_middle_ptr = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1))); + const T *const src_bottom_ptr = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2))); + + execute_window_loop(window, [&](const Coordinates & id) + { + const auto top_data = wrapper::vloadq(src_top_ptr + in.offset()); + const auto middle_data = wrapper::vloadq(src_middle_ptr + in.offset()); + const auto bottom_data = wrapper::vloadq(src_bottom_ptr + in.offset()); + q8x8_t fres = {}; + q8x16_t fqres = {}; + + if(pool_info.pool_type == PoolingType::AVG) + { + // Convert data to u16 + const q16x8x2_t top_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data)) } }; + const q16x8x2_t middle_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(middle_data)), wrapper::vmovl(wrapper::vgethigh(middle_data)) } }; + const q16x8x2_t bottom_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data)) } }; + + // Calculate row sums + const q16x8x2_t vrsum = + { + { + wrapper::vadd(wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]), middle_data_q16.val[0]), + wrapper::vadd(wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]), middle_data_q16.val[1]), + } + }; + const q16x8x2_t vrsum_shifted_1 = + { + { + wrapper::vext_1(vrsum.val[0], vrsum.val[1]), + wrapper::vext_1(vrsum.val[1], vrsum.val[1]) + } + }; + const q16x8x2_t vrsum_shifted_2 = + { + { + wrapper::vext_2(vrsum.val[0], vrsum.val[1]), + wrapper::vext_2(vrsum.val[1], vrsum.val[1]) + } + }; + // Calculate final sum + q16x8x2_t final_sum = + { + { + wrapper::vadd(wrapper::vadd(vrsum.val[0], vrsum_shifted_1.val[0]), vrsum_shifted_2.val[0]), + wrapper::vadd(wrapper::vadd(vrsum.val[1], vrsum_shifted_1.val[1]), vrsum_shifted_2.val[1]), + } + }; + if(pool_stride_x == 2) + { + q16x8_t res = + { + wrapper::vgetlane(final_sum.val[0], 0), + wrapper::vgetlane(final_sum.val[0], 2), + wrapper::vgetlane(final_sum.val[0], 4), + wrapper::vgetlane(final_sum.val[0], 6), + wrapper::vgetlane(final_sum.val[1], 0), + wrapper::vgetlane(final_sum.val[1], 2), + wrapper::vgetlane(final_sum.val[1], 4), + wrapper::vgetlane(final_sum.val[1], 6), + }; + + scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res, id, 0, 1, + pool_size, upper_bound_w, upper_bound_h, + pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + fres = wrapper::vmovn(res); + } + else + { + // Scale lower result + scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, final_sum.val[0], id, 0, 1, + pool_size, upper_bound_w, upper_bound_h, + pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + // Scale lower result + scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, final_sum.val[1], id, 8, 1, + pool_size, upper_bound_w, upper_bound_h, + pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y); + fqres = wrapper::vcombine(wrapper::vmovn(final_sum.val[0]), wrapper::vmovn(final_sum.val[1])); + } + } + else + { + const q8x16_t max_data = wrapper::vmax(wrapper::vmax(top_data, bottom_data), middle_data); + const q8x16_t max_data_shift1 = wrapper::vext_1(max_data, max_data); + const q8x16_t max_data_shift2 = wrapper::vext_2(max_data, max_data); + const q8x16_t final_max = wrapper::vmax(wrapper::vmax(max_data, max_data_shift1), max_data_shift2); + + if(pool_stride_x == 2) + { + const q8x8x2_t table = { { wrapper::vgetlow(final_max), wrapper::vgethigh(final_max) } }; + static const q8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 }; + fres = wrapper::vtbl(table, lookup_val); + } + else + { + fqres = final_max; + } + } + + // Store result + if(pool_stride_x == 1) + { + if(src_qinfo != dst_qinfo) + { + fqres = vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(fqres), wrapper::vgethigh(fqres), requant_qinfo); + } + wrapper::vstore(reinterpret_cast<T *>(out.ptr()), fqres); + } + else + { + if(src_qinfo != dst_qinfo) + { + fres = vrequantize_pooling<q8x8_t>(fres, requant_qinfo); + } + wrapper::vstore(reinterpret_cast<T *>(out.ptr()), fres); + } + }, + in, out); +} + +template <typename T> +void poolingMxN_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window) +{ + ARM_COMPUTE_UNUSED(dst1); + Iterator in(src, window_src); + Iterator out(dst0, window); + + /** SIMD vector types */ + using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type; + using q16_t = typename wrapper::traits::promote_t<T>; + using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type; + using q32_t = typename wrapper::traits::promote_t<q16_t>; + using q32x4_t = typename wrapper::traits::neon_vector<q32_t, 4>::type; + + const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width; + const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height; + const int pool_pad_right = pool_info.pad_stride_info.pad_right(); + const int pool_pad_top = pool_info.pad_stride_info.pad_top(); + const int pool_pad_left = pool_info.pad_stride_info.pad_left(); + const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom(); + int pool_stride_x = 0; + int pool_stride_y = 0; + std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride(); + const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right); + const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom); + + const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo &dst_qinfo = dst0->info()->quantization_info().uniform(); + + execute_window_loop(window, [&](const Coordinates & id) + { + T res = std::numeric_limits<T>::min(); + + if(pool_info.pool_type != PoolingType::MAX) + { + q32x4_t vres = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{}); + q32_t sres = 0; + + // Calculate scale + const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, + pool_stride_y); + + // Perform pooling + for(int y = 0; y < pool_size_y; ++y) + { + int x = 0; + for(; x <= (pool_size_x - 8); x += 8) + { + const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int> + (src->info()->strides_in_bytes().y()))); + + const q16x8_t data_q16 = wrapper::vmovl(data); + vres = wrapper::vadd(vres, wrapper::vaddl(wrapper::vgethigh(data_q16), wrapper::vgetlow(data_q16))); + } + + // Leftover for loop + for(; x < pool_size_x; ++x) + { + T data = *(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int> + (src->info()->strides_in_bytes().y()))); + sres += data; + } + } + + // Reduction + const auto tmp = wrapper::vpadd(wrapper::vgethigh(vres), wrapper::vgetlow(vres)); + sres += wrapper::vgetlane(tmp, 0) + wrapper::vgetlane(tmp, 1); + + // Divide by scale + res = static_cast<T>(support::cpp11::round(sres * scale)); + } + else + { + q8x8_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_64_tag{}); + + for(int y = 0; y < pool_size_y; ++y) + { + int x = 0; + for(; x <= (pool_size_x - 8); x += 8) + { + const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int> + (src->info()->strides_in_bytes().y()))); + vres = wrapper::vmax(vres, data); + } + // Leftover for loop + for(; x < pool_size_x; ++x) + { + const T data = *(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) + (y - pool_pad_top) * static_cast<int> + (src->info()->strides_in_bytes().y()))); + res = std::max(res, data); + } + } + + // Reduce max + vres = wrapper::vpmax(vres, vres); + vres = wrapper::vpmax(vres, vres); + vres = wrapper::vpmax(vres, vres); + + // Get max value + res = std::max(res, wrapper::vgetlane(vres, 0)); + } + // Store result + res = (src_qinfo != dst_qinfo) ? Qasymm8QuantizationHelper<T>::quantize(Qasymm8QuantizationHelper<T>::dequantize(res, src_qinfo), dst_qinfo) : res; + *(reinterpret_cast<T *>(out.ptr())) = res; + }, + in, out); +} +#endif /* defined(ENABLE_NCHW_KERNELS) */ +} // namespace cpu +} // namespace arm_compute + +#endif // SRC_CORE_NEON_KERNELS_QUANTIZED_H diff --git a/src/cpu/kernels/scale/neon/fp16.cpp b/src/cpu/kernels/scale/neon/fp16.cpp new file mode 100644 index 0000000000..0ad66cab1c --- /dev/null +++ b/src/cpu/kernels/scale/neon/fp16.cpp @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensorPack.h" +#include "arm_compute/core/Window.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/ScaleHelpers.h" +#include "src/core/utils/ScaleUtils.h" +#include "support/Rounding.h" + +#include <arm_neon.h> +#include <cmath> +#include <cstddef> + +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + +namespace arm_compute +{ +namespace +{ +void fp16_neon_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, + float sampling_offset, bool align_corners, const Window &window) +{ + const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; + const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; + const size_t in_stride_wc = in_stride_w * in_stride_c; + const size_t in_dim_h = src->info()->dimension(2); + + // Compute the ratio between source height and destination height + const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners); + const auto window_start_x = static_cast<int32_t>(window.x().start()); + const auto window_end_x = static_cast<int32_t>(window.x().end()); + const int window_step_x = 8; + + Window win(window); + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + Iterator out(dst, win); + + const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); + const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; + + execute_window_loop(win, [&](const Coordinates & id) + { + const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; + const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); + const int offset_row = in_hi * in_stride_wc; + int32_t x = window_start_x; + const float16_t *in_ptr = reinterpret_cast<const float16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]); + + for(; x <= window_end_x - window_step_x; x += window_step_x) + { + wrapper::vstore(reinterpret_cast<float16_t *>(out.ptr()) + x, + wrapper::vloadq(in_ptr + offset + offset_row + x)); + } + for(; x < window_end_x; ++x) + { + *(reinterpret_cast<float16_t *>(out.ptr()) + x) = *(in_ptr + offset + offset_row + x); + } + }, + out); +} + +void fp16_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, + BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, + bool align_corners, const Window &window) +{ + // Compute the ratio between source height and destination height + const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); + + Iterator out(dst, window); + const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; + const int in_dim_w = src->info()->dimension(1); + const int in_dim_h = src->info()->dimension(2); + const int in_stride_wc = in_stride_c * (in_dim_w + src->info()->padding().top + src->info()->padding().bottom); + + // Don't increment in Y and Z direction for the input tensor + // A pointer to the start of this plane is needed as base for the precomputed offsets + Window win_in(window); + win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + win_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + Iterator in(src, win_in); + + if(border_mode == BorderMode::CONSTANT) + { + using ConstType = typename std::conditional<std::is_same<float16_t, float16_t>::value, half, float16_t>::type; + + const float16_t const_border_value = static_cast<float16_t>(constant_border_value.get<ConstType>()); + execute_window_loop(window, [&](const Coordinates & id) + { + const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z()))); + const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); + const float16_t *in_ptr = reinterpret_cast<const float16_t *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc; + + const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value; + const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value; + const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value; + const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_c + in_stride_wc) : const_border_value; + + *reinterpret_cast<float16_t *>(out.ptr()) = static_cast<float16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); + }, + in, out); + } + else if(border_mode == BorderMode::REPLICATE) + { + execute_window_loop(window, [&](const Coordinates & id) + { + const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z()))); + const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); + + auto clamped_w = utility::clamp<int>(offset, 0, in_dim_w - 1); + auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1); + auto clamped_h = utility::clamp<int>(in_hi, 0, in_dim_h - 1); + auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1); + + const auto a00 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc); + const auto a01 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc); + const auto a10 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc); + const auto a11 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc); + + *reinterpret_cast<float16_t *>(out.ptr()) = static_cast<float16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); + }, + in, out); + } + else + { + ARM_COMPUTE_ERROR("Not implemented"); + } +} +} +namespace cpu +{ +void fp16_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, + InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, + bool align_corners, const Window &window) +{ + if(policy == InterpolationPolicy::BILINEAR) + { + fp16_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); + } + else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) + { + fp16_neon_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); + } +} +} // namespace cpu +} // namespace arm_compute + +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
\ No newline at end of file diff --git a/src/cpu/kernels/scale/neon/integer.cpp b/src/cpu/kernels/scale/neon/integer.cpp new file mode 100644 index 0000000000..a2359aac94 --- /dev/null +++ b/src/cpu/kernels/scale/neon/integer.cpp @@ -0,0 +1,293 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensorPack.h" +#include "arm_compute/core/Window.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/ScaleHelpers.h" +#include "src/core/utils/ScaleUtils.h" +#include "support/Rounding.h" + +#include <arm_neon.h> +#include <cmath> +#include <cstddef> + +namespace arm_compute +{ +namespace +{ +void u8_neon_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, + float sampling_offset, bool align_corners, const Window &window) +{ + const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; + const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; + const size_t in_stride_wc = in_stride_w * in_stride_c; + const size_t in_dim_h = src->info()->dimension(2); + + // Compute the ratio between source height and destination height + const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners); + const auto window_start_x = static_cast<int32_t>(window.x().start()); + const auto window_end_x = static_cast<int32_t>(window.x().end()); + const int window_step_x = 16; + + Window win(window); + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + Iterator out(dst, win); + + const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); + const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; + + execute_window_loop(win, [&](const Coordinates & id) + { + const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; + const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); + const int offset_row = in_hi * in_stride_wc; + int32_t x = window_start_x; + const uint8_t *in_ptr = reinterpret_cast<const uint8_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]); + + for(; x <= window_end_x - window_step_x; x += window_step_x) + { + wrapper::vstore(reinterpret_cast<uint8_t *>(out.ptr()) + x, + wrapper::vloadq(in_ptr + offset + offset_row + x)); + } + for(; x < window_end_x; ++x) + { + *(reinterpret_cast<uint8_t *>(out.ptr()) + x) = *(in_ptr + offset + offset_row + x); + } + }, + out); +} + +void u8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, + BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, + bool align_corners, const Window &window) +{ + // Compute the ratio between source height and destination height + const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); + + Iterator out(dst, window); + const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; + const int in_dim_w = src->info()->dimension(1); + const int in_dim_h = src->info()->dimension(2); + const int in_stride_wc = in_stride_c * (in_dim_w + src->info()->padding().top + src->info()->padding().bottom); + + // Don't increment in Y and Z direction for the input tensor + // A pointer to the start of this plane is needed as base for the precomputed offsets + Window win_in(window); + win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + win_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + Iterator in(src, win_in); + + if(border_mode == BorderMode::CONSTANT) + { + const uint8_t const_border_value = static_cast<uint8_t>(constant_border_value.get<uint8_t>()); + execute_window_loop(window, [&](const Coordinates & id) + { + const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z()))); + const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); + const uint8_t *in_ptr = reinterpret_cast<const uint8_t *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc; + + const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value; + const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value; + const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value; + const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_c + in_stride_wc) : const_border_value; + + *reinterpret_cast<uint8_t *>(out.ptr()) = static_cast<uint8_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); + }, + in, out); + } + else if(border_mode == BorderMode::REPLICATE) + { + execute_window_loop(window, [&](const Coordinates & id) + { + const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z()))); + const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); + + auto clamped_w = utility::clamp<int>(offset, 0, in_dim_w - 1); + auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1); + auto clamped_h = utility::clamp<int>(in_hi, 0, in_dim_h - 1); + auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1); + + const auto a00 = *(reinterpret_cast<const uint8_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc); + const auto a01 = *(reinterpret_cast<const uint8_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc); + const auto a10 = *(reinterpret_cast<const uint8_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc); + const auto a11 = *(reinterpret_cast<const uint8_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc); + + *reinterpret_cast<uint8_t *>(out.ptr()) = static_cast<uint8_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); + }, + in, out); + } + else + { + ARM_COMPUTE_ERROR("Not implemented"); + } +} + +void s16_neon_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, + float sampling_offset, bool align_corners, const Window &window) +{ + const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; + const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; + const size_t in_stride_wc = in_stride_w * in_stride_c; + const size_t in_dim_h = src->info()->dimension(2); + + // Compute the ratio between source height and destination height + const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners); + const auto window_start_x = static_cast<int32_t>(window.x().start()); + const auto window_end_x = static_cast<int32_t>(window.x().end()); + const int window_step_x = 8; + + Window win(window); + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + Iterator out(dst, win); + + const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); + const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; + + execute_window_loop(win, [&](const Coordinates & id) + { + const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; + const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); + const int offset_row = in_hi * in_stride_wc; + int32_t x = window_start_x; + const int16_t *in_ptr = reinterpret_cast<const int16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]); + + for(; x <= window_end_x - window_step_x; x += window_step_x) + { + wrapper::vstore(reinterpret_cast<int16_t *>(out.ptr()) + x, + wrapper::vloadq(in_ptr + offset + offset_row + x)); + } + for(; x < window_end_x; ++x) + { + *(reinterpret_cast<int16_t *>(out.ptr()) + x) = *(in_ptr + offset + offset_row + x); + } + }, + out); +} + +void s16_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, + BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, + bool align_corners, const Window &window) +{ + // Compute the ratio between source height and destination height + const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); + + Iterator out(dst, window); + const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; + const int in_dim_w = src->info()->dimension(1); + const int in_dim_h = src->info()->dimension(2); + const int in_stride_wc = in_stride_c * (in_dim_w + src->info()->padding().top + src->info()->padding().bottom); + + // Don't increment in Y and Z direction for the input tensor + // A pointer to the start of this plane is needed as base for the precomputed offsets + Window win_in(window); + win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + win_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + Iterator in(src, win_in); + + if(border_mode == BorderMode::CONSTANT) + { + const int16_t const_border_value = static_cast<int16_t>(constant_border_value.get<int16_t>()); + execute_window_loop(window, [&](const Coordinates & id) + { + const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z()))); + const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); + const int16_t *in_ptr = reinterpret_cast<const int16_t *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc; + + const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value; + const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value; + const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value; + const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_c + in_stride_wc) : const_border_value; + + *reinterpret_cast<int16_t *>(out.ptr()) = static_cast<int16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); + }, + in, out); + } + else if(border_mode == BorderMode::REPLICATE) + { + execute_window_loop(window, [&](const Coordinates & id) + { + const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z()))); + const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); + + auto clamped_w = utility::clamp<int>(offset, 0, in_dim_w - 1); + auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1); + auto clamped_h = utility::clamp<int>(in_hi, 0, in_dim_h - 1); + auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1); + + const auto a00 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc); + const auto a01 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc); + const auto a10 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc); + const auto a11 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc); + + *reinterpret_cast<int16_t *>(out.ptr()) = static_cast<int16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); + }, + in, out); + } + else + { + ARM_COMPUTE_ERROR("Not implemented"); + } +} +} +namespace cpu +{ +void u8_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, + InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, + bool align_corners, const Window &window) +{ + if(policy == InterpolationPolicy::BILINEAR) + { + u8_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); + } + else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) + { + u8_neon_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); + } +} + +void s16_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, + InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, + bool align_corners, const Window &window) +{ + if(policy == InterpolationPolicy::BILINEAR) + { + s16_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); + } + else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) + { + s16_neon_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); + } +} +} // namespace cpu +} // namespace arm_compute
\ No newline at end of file diff --git a/src/cpu/kernels/scale/neon/list.h b/src/cpu/kernels/scale/neon/list.h new file mode 100644 index 0000000000..c91242f5b2 --- /dev/null +++ b/src/cpu/kernels/scale/neon/list.h @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_NEON_KERNELS_SCALE_LIST_H +#define SRC_CORE_NEON_KERNELS_SCALE_LIST_H + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensorPack.h" +#include "arm_compute/core/Window.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/ScaleHelpers.h" +#include "src/core/utils/ScaleUtils.h" +#include "support/Rounding.h" + +namespace arm_compute +{ +namespace cpu +{ +#define DECLARE_SCALE_KERNEL(func_name) \ + void func_name(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, \ + InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, \ + bool align_corners, const Window &window) + +DECLARE_SCALE_KERNEL(qasymm8_neon_scale); +DECLARE_SCALE_KERNEL(qasymm8_signed_neon_scale); + +#undef DECLARE_SCALE_KERNEL + +template <typename T> +void nearest_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, float sampling_offset, + bool align_corners, const Window &window) +{ + const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; + const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; + const size_t in_stride_wc = in_stride_w * in_stride_c; + const size_t in_dim_h = src->info()->dimension(2); + + // Compute the ratio between source height and destination height + const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners); + const auto window_start_x = static_cast<int32_t>(window.x().start()); + const auto window_end_x = static_cast<int32_t>(window.x().end()); + const int window_step_x = 16 / sizeof(T); + + Window win(window); + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + Iterator out(dst, win); + + const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); + const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; + + execute_window_loop(win, [&](const Coordinates & id) + { + const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; + const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); + const int offset_row = in_hi * in_stride_wc; + int32_t x = window_start_x; + const T *in_ptr = reinterpret_cast<const T *>(in_ptr_start + in_stride_bytes_hwc * id[3]); + + for(; x <= window_end_x - window_step_x; x += window_step_x) + { + wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x, + wrapper::vloadq(in_ptr + offset + offset_row + x)); + } + for(; x < window_end_x; ++x) + { + *(reinterpret_cast<T *>(out.ptr()) + x) = *(in_ptr + offset + offset_row + x); + } + }, + out); +} + +template <typename T> +void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, + BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, + bool align_corners, const Window &window) +{ + // Compute the ratio between source height and destination height + const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); + + Iterator out(dst, window); + const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; + const int in_dim_w = src->info()->dimension(1); + const int in_dim_h = src->info()->dimension(2); + const int in_stride_wc = in_stride_c * (in_dim_w + src->info()->padding().top + src->info()->padding().bottom); + + // Don't increment in Y and Z direction for the input tensor + // A pointer to the start of this plane is needed as base for the precomputed offsets + Window win_in(window); + win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + win_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + Iterator in(src, win_in); + + if(border_mode == BorderMode::CONSTANT) + { +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + using ConstType = typename std::conditional<std::is_same<T, float16_t>::value, half, T>::type; +#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + using ConstType = T; +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + const T const_border_value = static_cast<T>(constant_border_value.get<ConstType>()); + execute_window_loop(window, [&](const Coordinates & id) + { + const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z()))); + const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); + const T *in_ptr = reinterpret_cast<const T *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc; + + const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value; + const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value; + const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value; + const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_c + in_stride_wc) : const_border_value; + + *reinterpret_cast<T *>(out.ptr()) = static_cast<T>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); + }, + in, out); + } + else if(border_mode == BorderMode::REPLICATE) + { + execute_window_loop(window, [&](const Coordinates & id) + { + const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z()))); + const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); + + auto clamped_w = utility::clamp<int>(offset, 0, in_dim_w - 1); + auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1); + auto clamped_h = utility::clamp<int>(in_hi, 0, in_dim_h - 1); + auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1); + + const auto a00 = *(reinterpret_cast<const T *>(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc); + const auto a01 = *(reinterpret_cast<const T *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc); + const auto a10 = *(reinterpret_cast<const T *>(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc); + const auto a11 = *(reinterpret_cast<const T *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc); + + *reinterpret_cast<T *>(out.ptr()) = static_cast<T>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); + }, + in, out); + } + else + { + ARM_COMPUTE_ERROR("Not implemented"); + } +} + +template <typename T> +void common_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, + InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, + bool align_corners, const Window &window) +{ + if(policy == InterpolationPolicy::BILINEAR) + { + bilinear_neon_scale<T>(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); + } + else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) + { + nearest_neon_scale<T>(src, dst, offsets, sampling_offset, align_corners, window); + } +} +} // namespace cpu +} // namespace arm_compute + +#endif /* SRC_CORE_NEON_KERNELS_SCALE_LIST_H */ diff --git a/src/cpu/kernels/scale/neon/qasymm8.cpp b/src/cpu/kernels/scale/neon/qasymm8.cpp new file mode 100644 index 0000000000..fb52752690 --- /dev/null +++ b/src/cpu/kernels/scale/neon/qasymm8.cpp @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/scale/neon/list.h" + +namespace arm_compute +{ +namespace +{ +void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, + BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, + bool align_corners, const Window &window) +{ + // Data layout is NHWC + // Compute the ratio between source height and destination height + const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); + Window win_off; + win_off.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_off.set(Window::DimY, Window::Dimension(0, 0, 0)); + + // Don't increment in X and Y direction for the input tensor + // A pointer to the start of this plane is needed as base for the precomputed offsets + Window win_in(window); + win_in.set(1, Window::Dimension(0, 0, 0)); + win_in.set(2, Window::Dimension(0, 0, 0)); + + for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) + { + win_off.set(d, Window::Dimension(0, 0, 0)); + } + + Iterator in(src, win_in); + Iterator out(dst, window); + + const int32_t in_dim_w = src->info()->dimension(1); + const int32_t in_dim_h = src->info()->dimension(2); + const int32_t stride_w = src->info()->strides_in_bytes()[1]; + const int32_t stride_h = src->info()->strides_in_bytes()[2]; + + const UniformQuantizationInfo iq_info = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); + + if(border_mode == BorderMode::CONSTANT) + { + const uint8_t const_border_value = static_cast<uint8_t>(constant_border_value.get<uint8_t>()); + execute_window_loop(window, [&](const Coordinates & id) + { + const int32_t index_h = std::floor((id[2] + sampling_offset) * hr - sampling_offset); + const int32_t index_w = *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[1], id[2])))); + const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[1], id[2])))); + const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[1], id[2])))); + const auto pixel_row_ptr = reinterpret_cast<const uint8_t *>(in.ptr()); + + const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) ? + (*(pixel_row_ptr + index_w * stride_w + index_h * stride_h)) : + const_border_value; + const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) ? + (*(pixel_row_ptr + (index_w + 1) * stride_w + index_h * stride_h)) : + const_border_value; + const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1) ? + (*(pixel_row_ptr + index_w * stride_w + (index_h + 1) * stride_h)) : + const_border_value; + const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1) ? + (*(pixel_row_ptr + (index_w + 1) * stride_w + (index_h + 1) * stride_h)) : + const_border_value; + + const float inp00 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a00, iq_info); + const float inp01 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a01, iq_info); + const float inp10 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a10, iq_info); + const float inp11 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a11, iq_info); + *reinterpret_cast<uint8_t *>(out.ptr()) = Qasymm8QuantizationHelper<uint8_t>::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); + }, + in, out); + } + else if(border_mode == BorderMode::REPLICATE) + { + execute_window_loop(window, [&](const Coordinates & id) + { + const int index_h = std::floor((id[2] + sampling_offset) * hr - sampling_offset); + const int32_t index_w = *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[1], id[2])))); + const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[1], id[2])))); + const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[1], id[2])))); + const auto pixel_row_ptr = reinterpret_cast<const uint8_t *>(in.ptr()); + + auto clamped_w = utility::clamp<int>(index_w, 0, in_dim_w - 1); + auto clamped_w1 = utility::clamp<int>(index_w + 1, 0, in_dim_w - 1); + auto clamped_h = utility::clamp<int>(index_h, 0, in_dim_h - 1); + auto clamped_h1 = utility::clamp<int>(index_h + 1, 0, in_dim_h - 1); + + const auto a00 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h * stride_h); + const auto a01 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h * stride_h); + const auto a10 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h1 * stride_h); + const auto a11 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h1 * stride_h); + + const float inp00 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a00, iq_info); + const float inp01 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a01, iq_info); + const float inp10 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a10, iq_info); + const float inp11 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a11, iq_info); + *reinterpret_cast<uint8_t *>(out.ptr()) = Qasymm8QuantizationHelper<uint8_t>::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); + }, + in, out); + } + else + { + ARM_COMPUTE_ERROR("Not implemented"); + } +} +} +namespace cpu +{ +void qasymm8_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, + InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, + bool align_corners, const Window &window) +{ + if(policy == InterpolationPolicy::BILINEAR) + { + qasymm8_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); + } + else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) + { + nearest_neon_scale<uint8_t>(src, dst, offsets, sampling_offset, align_corners, window); + } +} +} // namespace cpu +} // namespace arm_compute
\ No newline at end of file diff --git a/src/cpu/kernels/scale/neon/qasymm8_signed.cpp b/src/cpu/kernels/scale/neon/qasymm8_signed.cpp new file mode 100644 index 0000000000..706bcee954 --- /dev/null +++ b/src/cpu/kernels/scale/neon/qasymm8_signed.cpp @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/kernels/scale/neon/list.h" + +namespace arm_compute +{ +namespace +{ +void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, + BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, + bool align_corners, const Window &window) +{ + // Data layout is NHWC + // Compute the ratio between source height and destination height + const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); + Window win_off; + win_off.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_off.set(Window::DimY, Window::Dimension(0, 0, 0)); + + // Don't increment in X and Y direction for the input tensor + // A pointer to the start of this plane is needed as base for the precomputed offsets + Window win_in(window); + win_in.set(1, Window::Dimension(0, 0, 0)); + win_in.set(2, Window::Dimension(0, 0, 0)); + + for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) + { + win_off.set(d, Window::Dimension(0, 0, 0)); + } + + Iterator in(src, win_in); + Iterator out(dst, window); + + const int32_t in_dim_w = src->info()->dimension(1); + const int32_t in_dim_h = src->info()->dimension(2); + const int32_t stride_w = src->info()->strides_in_bytes()[1]; + const int32_t stride_h = src->info()->strides_in_bytes()[2]; + + const UniformQuantizationInfo iq_info = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); + + if(border_mode == BorderMode::CONSTANT) + { + const int8_t const_border_value = static_cast<int8_t>(constant_border_value.get<int8_t>()); + execute_window_loop(window, [&](const Coordinates & id) + { + const int32_t index_h = std::floor((id[2] + sampling_offset) * hr - sampling_offset); + const int32_t index_w = *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[1], id[2])))); + const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[1], id[2])))); + const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[1], id[2])))); + const auto pixel_row_ptr = reinterpret_cast<const int8_t *>(in.ptr()); + + const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) ? + (*(pixel_row_ptr + index_w * stride_w + index_h * stride_h)) : + const_border_value; + const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) ? + (*(pixel_row_ptr + (index_w + 1) * stride_w + index_h * stride_h)) : + const_border_value; + const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1) ? + (*(pixel_row_ptr + index_w * stride_w + (index_h + 1) * stride_h)) : + const_border_value; + const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1) ? + (*(pixel_row_ptr + (index_w + 1) * stride_w + (index_h + 1) * stride_h)) : + const_border_value; + + const float inp00 = Qasymm8QuantizationHelper<int8_t>::dequantize(a00, iq_info); + const float inp01 = Qasymm8QuantizationHelper<int8_t>::dequantize(a01, iq_info); + const float inp10 = Qasymm8QuantizationHelper<int8_t>::dequantize(a10, iq_info); + const float inp11 = Qasymm8QuantizationHelper<int8_t>::dequantize(a11, iq_info); + *reinterpret_cast<int8_t *>(out.ptr()) = Qasymm8QuantizationHelper<int8_t>::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); + }, + in, out); + } + else if(border_mode == BorderMode::REPLICATE) + { + execute_window_loop(window, [&](const Coordinates & id) + { + const int index_h = std::floor((id[2] + sampling_offset) * hr - sampling_offset); + const int32_t index_w = *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[1], id[2])))); + const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[1], id[2])))); + const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[1], id[2])))); + const auto pixel_row_ptr = reinterpret_cast<const int8_t *>(in.ptr()); + + auto clamped_w = utility::clamp<int>(index_w, 0, in_dim_w - 1); + auto clamped_w1 = utility::clamp<int>(index_w + 1, 0, in_dim_w - 1); + auto clamped_h = utility::clamp<int>(index_h, 0, in_dim_h - 1); + auto clamped_h1 = utility::clamp<int>(index_h + 1, 0, in_dim_h - 1); + + const auto a00 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h * stride_h); + const auto a01 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h * stride_h); + const auto a10 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h1 * stride_h); + const auto a11 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h1 * stride_h); + + const float inp00 = Qasymm8QuantizationHelper<int8_t>::dequantize(a00, iq_info); + const float inp01 = Qasymm8QuantizationHelper<int8_t>::dequantize(a01, iq_info); + const float inp10 = Qasymm8QuantizationHelper<int8_t>::dequantize(a10, iq_info); + const float inp11 = Qasymm8QuantizationHelper<int8_t>::dequantize(a11, iq_info); + *reinterpret_cast<int8_t *>(out.ptr()) = Qasymm8QuantizationHelper<int8_t>::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); + }, + in, out); + } + else + { + ARM_COMPUTE_ERROR("Not implemented"); + } +} +} +namespace cpu +{ +void qasymm8_signed_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, + InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, + bool align_corners, const Window &window) +{ + if(policy == InterpolationPolicy::BILINEAR) + { + qasymm8_signed_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); + } + else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) + { + nearest_neon_scale<int8_t>(src, dst, offsets, sampling_offset, align_corners, window); + } +} +} // namespace cpu +} // namespace arm_compute
\ No newline at end of file diff --git a/src/cpu/kernels/scale/sve/fp16.cpp b/src/cpu/kernels/scale/sve/fp16.cpp new file mode 100644 index 0000000000..76e7735b8a --- /dev/null +++ b/src/cpu/kernels/scale/sve/fp16.cpp @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if defined(ARM_COMPUTE_ENABLE_SVE) +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensorPack.h" +#include "arm_compute/core/Window.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/ScaleHelpers.h" +#include "src/core/utils/ScaleUtils.h" +#include "support/Rounding.h" + +#include <arm_sve.h> +#include <cmath> +#include <cstddef> + +namespace arm_compute +{ +namespace +{ +void fp16_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, + float sampling_offset, bool align_corners, const Window &window) +{ + const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; + const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; + const size_t in_stride_wc = in_stride_w * in_stride_c; + const size_t in_dim_h = src->info()->dimension(2); + + // Compute the ratio between source height and destination height + const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners); + const auto window_start_x = static_cast<int32_t>(window.x().start()); + const auto window_end_x = static_cast<int32_t>(window.x().end()); + + Window win(window); + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + Iterator out(dst, win); + + const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); + const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; + + execute_window_loop(win, [&](const Coordinates & id) + { + const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; + const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); + const int offset_row = in_hi * in_stride_wc; + const auto in_ptr = reinterpret_cast<const float16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]); + const auto out_ptr = reinterpret_cast<float16_t *>(out.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + svbool_t pg = svwhilelt_b16(x, window_end_x); + do + { + // Store results + svst1_f16(pg, out_ptr + x, svld1_f16(pg, in_ptr + offset + offset_row + x)); + + x += svcntw(); + pg = svwhilelt_b16(x, window_end_x); + } + while(svptest_any(svptrue_b16(), pg)); + }, + out); +} + +void fp16_sve_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, + BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, + bool align_corners, const Window &window) +{ + // Compute the ratio between source height and destination height + const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); + + Iterator out(dst, window); + const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; + const int in_dim_w = src->info()->dimension(1); + const int in_dim_h = src->info()->dimension(2); + const int in_stride_wc = in_stride_c * (in_dim_w + src->info()->padding().top + src->info()->padding().bottom); + + // Don't increment in Y and Z direction for the input tensor + // A pointer to the start of this plane is needed as base for the precomputed offsets + Window win_in(window); + win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + win_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + Iterator in(src, win_in); + + if(border_mode == BorderMode::CONSTANT) + { + using ConstType = typename std::conditional<std::is_same<float16_t, float16_t>::value, half, float16_t>::type; + + const float16_t const_border_value = static_cast<float16_t>(constant_border_value.get<ConstType>()); + execute_window_loop(window, [&](const Coordinates & id) + { + const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z()))); + const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); + const float16_t *in_ptr = reinterpret_cast<const float16_t *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc; + + const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value; + const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value; + const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value; + const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_c + in_stride_wc) : const_border_value; + + *reinterpret_cast<float16_t *>(out.ptr()) = static_cast<float16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); + }, + in, out); + } + else if(border_mode == BorderMode::REPLICATE) + { + execute_window_loop(window, [&](const Coordinates & id) + { + const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z()))); + const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); + + auto clamped_w = utility::clamp<int>(offset, 0, in_dim_w - 1); + auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1); + auto clamped_h = utility::clamp<int>(in_hi, 0, in_dim_h - 1); + auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1); + + const auto a00 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc); + const auto a01 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc); + const auto a10 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc); + const auto a11 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc); + + *reinterpret_cast<float16_t *>(out.ptr()) = static_cast<float16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); + }, + in, out); + } + else + { + ARM_COMPUTE_ERROR("Not implemented"); + } +} +} +namespace cpu +{ +void fp16_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, + InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, + bool align_corners, const Window &window) +{ + if(policy == InterpolationPolicy::BILINEAR) + { + fp16_sve_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); + } + else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) + { + fp16_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); + } +} +} // namespace cpu +} // namespace arm_compute + +#endif // ARM_COMPUTE_ENABLE_SVE
\ No newline at end of file diff --git a/src/cpu/kernels/scale/sve/fp32.cpp b/src/cpu/kernels/scale/sve/fp32.cpp new file mode 100644 index 0000000000..030e109cdf --- /dev/null +++ b/src/cpu/kernels/scale/sve/fp32.cpp @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(ARM_COMPUTE_ENABLE_SVE) +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensorPack.h" +#include "arm_compute/core/Window.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/ScaleHelpers.h" +#include "src/core/utils/ScaleUtils.h" +#include "support/Rounding.h" + +#include <cmath> +#include <cstddef> + +#include <arm_sve.h> + +namespace arm_compute +{ +namespace +{ +void fp32_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, + float sampling_offset, bool align_corners, const Window &window) +{ + const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; + const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; + const size_t in_stride_wc = in_stride_w * in_stride_c; + const size_t in_dim_h = src->info()->dimension(2); + + // Compute the ratio between source height and destination height + const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners); + const auto window_start_x = static_cast<int32_t>(window.x().start()); + const auto window_end_x = static_cast<int32_t>(window.x().end()); + + Window win(window); + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + Iterator out(dst, win); + + const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); + const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; + + execute_window_loop(win, [&](const Coordinates & id) + { + const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; + const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); + const int offset_row = in_hi * in_stride_wc; + const auto in_ptr = reinterpret_cast<const float *>(in_ptr_start + in_stride_bytes_hwc * id[3]); + const auto out_ptr = reinterpret_cast<float *>(out.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + svbool_t pg = svwhilelt_b32(x, window_end_x); + do + { + // Store results + svst1_f32(pg, out_ptr + x, svld1_f32(pg, in_ptr + offset + offset_row + x)); + + x += svcntw(); + pg = svwhilelt_b32(x, window_end_x); + } + while(svptest_any(svptrue_b32(), pg)); + }, + out); +} + +void fp32_sve_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, + BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, + bool align_corners, const Window &window) +{ + // Compute the ratio between source height and destination height + const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); + + Iterator out(dst, window); + const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; + const int in_dim_w = src->info()->dimension(1); + const int in_dim_h = src->info()->dimension(2); + const int in_stride_wc = in_stride_c * (in_dim_w + src->info()->padding().top + src->info()->padding().bottom); + + // Don't increment in Y and Z direction for the input tensor + // A pointer to the start of this plane is needed as base for the precomputed offsets + Window win_in(window); + win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + win_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + Iterator in(src, win_in); + + if(border_mode == BorderMode::CONSTANT) + { + const float const_border_value = static_cast<float>(constant_border_value.get<float>()); + execute_window_loop(window, [&](const Coordinates & id) + { + const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z()))); + const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); + const float *in_ptr = reinterpret_cast<const float *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc; + + const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value; + const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value; + const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value; + const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_c + in_stride_wc) : const_border_value; + + *reinterpret_cast<float *>(out.ptr()) = static_cast<float>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); + }, + in, out); + } + else if(border_mode == BorderMode::REPLICATE) + { + execute_window_loop(window, [&](const Coordinates & id) + { + const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z()))); + const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); + + auto clamped_w = utility::clamp<int>(offset, 0, in_dim_w - 1); + auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1); + auto clamped_h = utility::clamp<int>(in_hi, 0, in_dim_h - 1); + auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1); + + const auto a00 = *(reinterpret_cast<const float *>(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc); + const auto a01 = *(reinterpret_cast<const float *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc); + const auto a10 = *(reinterpret_cast<const float *>(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc); + const auto a11 = *(reinterpret_cast<const float *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc); + + *reinterpret_cast<float *>(out.ptr()) = static_cast<float>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); + }, + in, out); + } + else + { + ARM_COMPUTE_ERROR("Not implemented"); + } +} +} +namespace cpu +{ +void fp32_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, + InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, + bool align_corners, const Window &window) +{ + if(policy == InterpolationPolicy::BILINEAR) + { + fp32_sve_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); + } + else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) + { + fp32_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); + } +} +} // namespace cpu +} // namespace arm_compute + +#endif // ARM_COMPUTE_ENABLE_SVE
\ No newline at end of file diff --git a/src/cpu/kernels/scale/sve/integer.cpp b/src/cpu/kernels/scale/sve/integer.cpp new file mode 100644 index 0000000000..486c674612 --- /dev/null +++ b/src/cpu/kernels/scale/sve/integer.cpp @@ -0,0 +1,300 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(ARM_COMPUTE_ENABLE_SVE) +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensorPack.h" +#include "arm_compute/core/Window.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/ScaleHelpers.h" +#include "src/core/utils/ScaleUtils.h" +#include "support/Rounding.h" + +#include <arm_sve.h> +#include <cmath> +#include <cstddef> + +namespace arm_compute +{ +namespace +{ +void u8_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, + float sampling_offset, bool align_corners, const Window &window) +{ + const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; + const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; + const size_t in_stride_wc = in_stride_w * in_stride_c; + const size_t in_dim_h = src->info()->dimension(2); + + // Compute the ratio between source height and destination height + const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners); + const auto window_start_x = static_cast<int32_t>(window.x().start()); + const auto window_end_x = static_cast<int32_t>(window.x().end()); + + Window win(window); + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + Iterator out(dst, win); + + const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); + const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; + + execute_window_loop(win, [&](const Coordinates & id) + { + const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; + const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); + const int offset_row = in_hi * in_stride_wc; + const auto in_ptr = reinterpret_cast<const uint8_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]); + const auto out_ptr = reinterpret_cast<uint8_t *>(out.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + svbool_t pg = svwhilelt_b8(x, window_end_x); + do + { + // Store results + svst1_u8(pg, out_ptr + x, svld1_u8(pg, in_ptr + offset + offset_row + x)); + + x += svcntw(); + pg = svwhilelt_b8(x, window_end_x); + } + while(svptest_any(svptrue_b8(), pg)); + }, + out); +} + +void u8_sve_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, + BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, + bool align_corners, const Window &window) +{ + // Compute the ratio between source height and destination height + const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); + + Iterator out(dst, window); + const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; + const int in_dim_w = src->info()->dimension(1); + const int in_dim_h = src->info()->dimension(2); + const int in_stride_wc = in_stride_c * (in_dim_w + src->info()->padding().top + src->info()->padding().bottom); + + // Don't increment in Y and Z direction for the input tensor + // A pointer to the start of this plane is needed as base for the precomputed offsets + Window win_in(window); + win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + win_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + Iterator in(src, win_in); + + if(border_mode == BorderMode::CONSTANT) + { + const uint8_t const_border_value = static_cast<uint8_t>(constant_border_value.get<uint8_t>()); + execute_window_loop(window, [&](const Coordinates & id) + { + const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z()))); + const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); + const uint8_t *in_ptr = reinterpret_cast<const uint8_t *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc; + + const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value; + const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value; + const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value; + const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_c + in_stride_wc) : const_border_value; + + *reinterpret_cast<uint8_t *>(out.ptr()) = static_cast<uint8_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); + }, + in, out); + } + else if(border_mode == BorderMode::REPLICATE) + { + execute_window_loop(window, [&](const Coordinates & id) + { + const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z()))); + const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); + + auto clamped_w = utility::clamp<int>(offset, 0, in_dim_w - 1); + auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1); + auto clamped_h = utility::clamp<int>(in_hi, 0, in_dim_h - 1); + auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1); + + const auto a00 = *(reinterpret_cast<const uint8_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc); + const auto a01 = *(reinterpret_cast<const uint8_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc); + const auto a10 = *(reinterpret_cast<const uint8_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc); + const auto a11 = *(reinterpret_cast<const uint8_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc); + + *reinterpret_cast<uint8_t *>(out.ptr()) = static_cast<uint8_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); + }, + in, out); + } + else + { + ARM_COMPUTE_ERROR("Not implemented"); + } +} + +void s16_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, + float sampling_offset, bool align_corners, const Window &window) +{ + const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; + const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; + const size_t in_stride_wc = in_stride_w * in_stride_c; + const size_t in_dim_h = src->info()->dimension(2); + + // Compute the ratio between source height and destination height + const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners); + const auto window_start_x = static_cast<int32_t>(window.x().start()); + const auto window_end_x = static_cast<int32_t>(window.x().end()); + + Window win(window); + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + Iterator out(dst, win); + + const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); + const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; + + execute_window_loop(win, [&](const Coordinates & id) + { + const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; + const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); + const int offset_row = in_hi * in_stride_wc; + const auto in_ptr = reinterpret_cast<const int16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]); + const auto out_ptr = reinterpret_cast<int16_t *>(out.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + svbool_t pg = svwhilelt_b16(x, window_end_x); + do + { + // Store results + svst1_s16(pg, out_ptr + x, svld1_s16(pg, in_ptr + offset + offset_row + x)); + + x += svcntw(); + pg = svwhilelt_b16(x, window_end_x); + } + while(svptest_any(svptrue_b16(), pg)); + }, + out); +} + +void s16_sve_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, + BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, + bool align_corners, const Window &window) +{ + // Compute the ratio between source height and destination height + const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners); + + Iterator out(dst, window); + const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; + const int in_dim_w = src->info()->dimension(1); + const int in_dim_h = src->info()->dimension(2); + const int in_stride_wc = in_stride_c * (in_dim_w + src->info()->padding().top + src->info()->padding().bottom); + + // Don't increment in Y and Z direction for the input tensor + // A pointer to the start of this plane is needed as base for the precomputed offsets + Window win_in(window); + win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + win_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + Iterator in(src, win_in); + + if(border_mode == BorderMode::CONSTANT) + { + const int16_t const_border_value = static_cast<int16_t>(constant_border_value.get<int16_t>()); + execute_window_loop(window, [&](const Coordinates & id) + { + const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z()))); + const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); + const int16_t *in_ptr = reinterpret_cast<const int16_t *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc; + + const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value; + const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value; + const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value; + const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_c + in_stride_wc) : const_border_value; + + *reinterpret_cast<int16_t *>(out.ptr()) = static_cast<int16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); + }, + in, out); + } + else if(border_mode == BorderMode::REPLICATE) + { + execute_window_loop(window, [&](const Coordinates & id) + { + const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z()))); + const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z()))); + const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset); + + auto clamped_w = utility::clamp<int>(offset, 0, in_dim_w - 1); + auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1); + auto clamped_h = utility::clamp<int>(in_hi, 0, in_dim_h - 1); + auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1); + + const auto a00 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc); + const auto a01 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc); + const auto a10 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc); + const auto a11 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc); + + *reinterpret_cast<int16_t *>(out.ptr()) = static_cast<int16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val)); + }, + in, out); + } + else + { + ARM_COMPUTE_ERROR("Not implemented"); + } +} +} +namespace cpu +{ +void u8_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, + InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, + bool align_corners, const Window &window) +{ + if(policy == InterpolationPolicy::BILINEAR) + { + u8_sve_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); + } + else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) + { + u8_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); + } +} + +void s16_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, + InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, + bool align_corners, const Window &window) +{ + if(policy == InterpolationPolicy::BILINEAR) + { + s16_sve_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); + } + else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) + { + s16_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); + } +} +} // namespace cpu +} // namespace arm_compute + +#endif // ARM_COMPUTE_ENABLE_SVE
\ No newline at end of file diff --git a/src/cpu/kernels/scale/sve/list.h b/src/cpu/kernels/scale/sve/list.h new file mode 100644 index 0000000000..b9c3a10a78 --- /dev/null +++ b/src/cpu/kernels/scale/sve/list.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_SVE_KERNELS_SCALE_LIST_H +#define SRC_CORE_SVE_KERNELS_SCALE_LIST_H + +namespace arm_compute +{ +namespace cpu +{ +#define DECLARE_SCALE_KERNEL(func_name) \ + void func_name(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, \ + InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, \ + bool align_corners, const Window &window) + +DECLARE_SCALE_KERNEL(fp16_sve_scale); +DECLARE_SCALE_KERNEL(fp32_sve_scale); +DECLARE_SCALE_KERNEL(s16_sve_scale); +DECLARE_SCALE_KERNEL(u8_sve_scale); +DECLARE_SCALE_KERNEL(qasymm8_sve_scale); +DECLARE_SCALE_KERNEL(qasymm8_signed_sve_scale); + +#undef DECLARE_SCALE_KERNEL +} // namespace cpu +} // namespace arm_compute + +#endif /* SRC_CORE_SVE_KERNELS_SCALE_LIST_H */ diff --git a/src/cpu/kernels/scale/sve/qasymm8.cpp b/src/cpu/kernels/scale/sve/qasymm8.cpp new file mode 100644 index 0000000000..c9122ad40b --- /dev/null +++ b/src/cpu/kernels/scale/sve/qasymm8.cpp @@ -0,0 +1,207 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(ARM_COMPUTE_ENABLE_SVE) +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensorPack.h" +#include "arm_compute/core/Window.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/ScaleHelpers.h" +#include "src/core/helpers/ScaleHelpers.h" +#include "src/core/utils/ScaleUtils.h" +#include "support/Rounding.h" + +#include <arm_sve.h> +#include <cmath> +#include <cstddef> + +namespace arm_compute +{ +namespace +{ +void qasymm8_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, + float sampling_offset, bool align_corners, const Window &window) +{ + const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; + const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; + const size_t in_stride_wc = in_stride_w * in_stride_c; + const size_t in_dim_h = src->info()->dimension(2); + + // Compute the ratio between source height and destination height + const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners); + const auto window_start_x = static_cast<int32_t>(window.x().start()); + const auto window_end_x = static_cast<int32_t>(window.x().end()); + + Window win(window); + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + Iterator out(dst, win); + + const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); + const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; + + execute_window_loop(win, [&](const Coordinates & id) + { + const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; + const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); + const int offset_row = in_hi * in_stride_wc; + const auto in_ptr = reinterpret_cast<const uint8_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]); + const auto out_ptr = reinterpret_cast<uint8_t *>(out.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + svbool_t pg = svwhilelt_b8(x, window_end_x); + do + { + // Store results + svst1_u8(pg, out_ptr + x, svld1_u8(pg, in_ptr + offset + offset_row + x)); + + x += svcntw(); + pg = svwhilelt_b8(x, window_end_x); + } + while(svptest_any(svptrue_b8(), pg)); + }, + out); +} + +void qasymm8_sve_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, + BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, + bool align_corners, const Window &window) +{ + // Data layout is NHWC + const int idx_width = 1; + const int idx_height = 2; + + // Compute the ratio between source height and destination height + const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(idx_height), dst->info()->dimension(idx_height), align_corners); + Window win_off; + win_off.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_off.set(Window::DimY, Window::Dimension(0, 0, 0)); + + // Don't increment in X and Y direction for the input tensor + // A pointer to the start of this plane is needed as base for the precomputed offsets + Window win_in(window); + win_in.set(idx_width, Window::Dimension(0, 0, 0)); + win_in.set(idx_height, Window::Dimension(0, 0, 0)); + + for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) + { + win_off.set(d, Window::Dimension(0, 0, 0)); + } + + Iterator in(src, win_in); + Iterator out(dst, window); + + const int32_t in_dim_w = src->info()->dimension(idx_width); + const int32_t in_dim_h = src->info()->dimension(idx_height); + const int32_t stride_w = src->info()->strides_in_bytes()[idx_width]; + const int32_t stride_h = src->info()->strides_in_bytes()[idx_height]; + + const UniformQuantizationInfo iq_info = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); + + if(border_mode == BorderMode::CONSTANT) + { + const uint8_t const_border_value = static_cast<uint8_t>(constant_border_value.get<uint8_t>()); + execute_window_loop(window, [&](const Coordinates & id) + { + const int32_t index_h = std::floor((id[idx_height] + sampling_offset) * hr - sampling_offset); + const int32_t index_w = *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); + const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); + const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); + const auto pixel_row_ptr = reinterpret_cast<const uint8_t *>(in.ptr()); + + const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) ? + (*(pixel_row_ptr + index_w * stride_w + index_h * stride_h)) : + const_border_value; + const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) ? + (*(pixel_row_ptr + (index_w + 1) * stride_w + index_h * stride_h)) : + const_border_value; + const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1) ? + (*(pixel_row_ptr + index_w * stride_w + (index_h + 1) * stride_h)) : + const_border_value; + const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1) ? + (*(pixel_row_ptr + (index_w + 1) * stride_w + (index_h + 1) * stride_h)) : + const_border_value; + + const float inp00 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a00, iq_info); + const float inp01 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a01, iq_info); + const float inp10 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a10, iq_info); + const float inp11 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a11, iq_info); + *reinterpret_cast<uint8_t *>(out.ptr()) = Qasymm8QuantizationHelper<uint8_t>::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); + }, + in, out); + } + else if(border_mode == BorderMode::REPLICATE) + { + execute_window_loop(window, [&](const Coordinates & id) + { + const int index_h = std::floor((id[idx_height] + sampling_offset) * hr - sampling_offset); + const int32_t index_w = *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); + const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); + const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); + const auto pixel_row_ptr = reinterpret_cast<const uint8_t *>(in.ptr()); + + auto clamped_w = utility::clamp<int>(index_w, 0, in_dim_w - 1); + auto clamped_w1 = utility::clamp<int>(index_w + 1, 0, in_dim_w - 1); + auto clamped_h = utility::clamp<int>(index_h, 0, in_dim_h - 1); + auto clamped_h1 = utility::clamp<int>(index_h + 1, 0, in_dim_h - 1); + + const auto a00 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h * stride_h); + const auto a01 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h * stride_h); + const auto a10 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h1 * stride_h); + const auto a11 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h1 * stride_h); + + const float inp00 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a00, iq_info); + const float inp01 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a01, iq_info); + const float inp10 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a10, iq_info); + const float inp11 = Qasymm8QuantizationHelper<uint8_t>::dequantize(a11, iq_info); + *reinterpret_cast<uint8_t *>(out.ptr()) = Qasymm8QuantizationHelper<uint8_t>::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); + }, + in, out); + } + else + { + ARM_COMPUTE_ERROR("Not implemented"); + } +} +} +namespace cpu +{ +void qasymm8_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, + InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, + bool align_corners, const Window &window) +{ + if(policy == InterpolationPolicy::BILINEAR) + { + qasymm8_sve_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); + } + else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) + { + qasymm8_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); + } +} +} // namespace cpu +} // namespace arm_compute + +#endif // defined(ARM_COMPUTE_ENABLE_SVE)
\ No newline at end of file diff --git a/src/cpu/kernels/scale/sve/qasymm8_signed.cpp b/src/cpu/kernels/scale/sve/qasymm8_signed.cpp new file mode 100644 index 0000000000..0843e61fd4 --- /dev/null +++ b/src/cpu/kernels/scale/sve/qasymm8_signed.cpp @@ -0,0 +1,207 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(ARM_COMPUTE_ENABLE_SVE) +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensorPack.h" +#include "arm_compute/core/Window.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/ScaleHelpers.h" +#include "src/core/helpers/ScaleHelpers.h" +#include "src/core/utils/ScaleUtils.h" +#include "support/Rounding.h" + +#include <arm_sve.h> +#include <cmath> +#include <cstddef> + +namespace arm_compute +{ +namespace +{ +void qasymm8_signed_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets, + float sampling_offset, bool align_corners, const Window &window) +{ + const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right; + const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom; + const size_t in_stride_wc = in_stride_w * in_stride_c; + const size_t in_dim_h = src->info()->dimension(2); + + // Compute the ratio between source height and destination height + const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners); + const auto window_start_x = static_cast<int32_t>(window.x().start()); + const auto window_end_x = static_cast<int32_t>(window.x().end()); + + Window win(window); + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + Iterator out(dst, win); + + const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes(); + const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3]; + + execute_window_loop(win, [&](const Coordinates & id) + { + const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c; + const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr)); + const int offset_row = in_hi * in_stride_wc; + const auto in_ptr = reinterpret_cast<const int8_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]); + const auto out_ptr = reinterpret_cast<int8_t *>(out.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + svbool_t pg = svwhilelt_b8(x, window_end_x); + do + { + // Store results + svst1_s8(pg, out_ptr + x, svld1_s8(pg, in_ptr + offset + offset_row + x)); + + x += svcntw(); + pg = svwhilelt_b8(x, window_end_x); + } + while(svptest_any(svptrue_b8(), pg)); + }, + out); +} + +void qasymm8_signed_sve_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, + BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, + bool align_corners, const Window &window) +{ + // Data layout is NHWC + const int idx_width = 1; + const int idx_height = 2; + + // Compute the ratio between source height and destination height + const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(idx_height), dst->info()->dimension(idx_height), align_corners); + Window win_off; + win_off.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_off.set(Window::DimY, Window::Dimension(0, 0, 0)); + + // Don't increment in X and Y direction for the input tensor + // A pointer to the start of this plane is needed as base for the precomputed offsets + Window win_in(window); + win_in.set(idx_width, Window::Dimension(0, 0, 0)); + win_in.set(idx_height, Window::Dimension(0, 0, 0)); + + for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d) + { + win_off.set(d, Window::Dimension(0, 0, 0)); + } + + Iterator in(src, win_in); + Iterator out(dst, window); + + const int32_t in_dim_w = src->info()->dimension(idx_width); + const int32_t in_dim_h = src->info()->dimension(idx_height); + const int32_t stride_w = src->info()->strides_in_bytes()[idx_width]; + const int32_t stride_h = src->info()->strides_in_bytes()[idx_height]; + + const UniformQuantizationInfo iq_info = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); + + if(border_mode == BorderMode::CONSTANT) + { + const int8_t const_border_value = static_cast<int8_t>(constant_border_value.get<int8_t>()); + execute_window_loop(window, [&](const Coordinates & id) + { + const int32_t index_h = std::floor((id[idx_height] + sampling_offset) * hr - sampling_offset); + const int32_t index_w = *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); + const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); + const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); + const auto pixel_row_ptr = reinterpret_cast<const int8_t *>(in.ptr()); + + const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) ? + (*(pixel_row_ptr + index_w * stride_w + index_h * stride_h)) : + const_border_value; + const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) ? + (*(pixel_row_ptr + (index_w + 1) * stride_w + index_h * stride_h)) : + const_border_value; + const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1) ? + (*(pixel_row_ptr + index_w * stride_w + (index_h + 1) * stride_h)) : + const_border_value; + const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1) ? + (*(pixel_row_ptr + (index_w + 1) * stride_w + (index_h + 1) * stride_h)) : + const_border_value; + + const float inp00 = Qasymm8QuantizationHelper<int8_t>::dequantize(a00, iq_info); + const float inp01 = Qasymm8QuantizationHelper<int8_t>::dequantize(a01, iq_info); + const float inp10 = Qasymm8QuantizationHelper<int8_t>::dequantize(a10, iq_info); + const float inp11 = Qasymm8QuantizationHelper<int8_t>::dequantize(a11, iq_info); + *reinterpret_cast<int8_t *>(out.ptr()) = Qasymm8QuantizationHelper<int8_t>::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); + }, + in, out); + } + else if(border_mode == BorderMode::REPLICATE) + { + execute_window_loop(window, [&](const Coordinates & id) + { + const int index_h = std::floor((id[idx_height] + sampling_offset) * hr - sampling_offset); + const int32_t index_w = *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); + const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); + const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height])))); + const auto pixel_row_ptr = reinterpret_cast<const int8_t *>(in.ptr()); + + auto clamped_w = utility::clamp<int>(index_w, 0, in_dim_w - 1); + auto clamped_w1 = utility::clamp<int>(index_w + 1, 0, in_dim_w - 1); + auto clamped_h = utility::clamp<int>(index_h, 0, in_dim_h - 1); + auto clamped_h1 = utility::clamp<int>(index_h + 1, 0, in_dim_h - 1); + + const auto a00 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h * stride_h); + const auto a01 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h * stride_h); + const auto a10 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h1 * stride_h); + const auto a11 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h1 * stride_h); + + const float inp00 = Qasymm8QuantizationHelper<int8_t>::dequantize(a00, iq_info); + const float inp01 = Qasymm8QuantizationHelper<int8_t>::dequantize(a01, iq_info); + const float inp10 = Qasymm8QuantizationHelper<int8_t>::dequantize(a10, iq_info); + const float inp11 = Qasymm8QuantizationHelper<int8_t>::dequantize(a11, iq_info); + *reinterpret_cast<int8_t *>(out.ptr()) = Qasymm8QuantizationHelper<int8_t>::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info); + }, + in, out); + } + else + { + ARM_COMPUTE_ERROR("Not implemented"); + } +} +} +namespace cpu +{ +void qasymm8_signed_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, + InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, + bool align_corners, const Window &window) +{ + if(policy == InterpolationPolicy::BILINEAR) + { + qasymm8_signed_sve_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window); + } + else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) + { + qasymm8_signed_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window); + } +} +} // namespace cpu +} // namespace arm_compute + +#endif // ARM_COMPUTE_ENABLE_SVE
\ No newline at end of file diff --git a/src/cpu/kernels/softmax/impl/neon/list.h b/src/cpu/kernels/softmax/impl/neon/list.h new file mode 100644 index 0000000000..5ebee31272 --- /dev/null +++ b/src/cpu/kernels/softmax/impl/neon/list.h @@ -0,0 +1,388 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H +#define SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H + +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "support/SaturateCast.h" + +namespace arm_compute +{ +namespace cpu +{ +template <typename T> +void neon_logits_1d_max(const ITensor *in, ITensor *out, const Window &window) +{ + /** SIMD vector tag type. */ + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; + + constexpr int window_step_x = 16 / sizeof(T); + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + Window win{ window }; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + Iterator input(in, win); + Iterator output(out, win); + + const int sum_stages = log2(window_step_x / 2); + execute_window_loop(win, [&](const Coordinates &) + { + // Get pointers + const auto in_ptr = reinterpret_cast<const T *>(input.ptr()); + const auto out_ptr = reinterpret_cast<T *>(output.ptr()); + + // Init max value + auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{}); + int x = window_start_x; + + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto current_value = wrapper::vloadq(in_ptr + x); + vec_max = wrapper::vmax(vec_max, current_value); + } + auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max)); + + for(int i = 0; i < sum_stages; ++i) + { + carry_max = wrapper::vpmax(carry_max, carry_max); + } + T max_val = wrapper::vgetlane(carry_max, 0); + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + max_val = *(in_ptr + x) > max_val ? *(in_ptr + x) : max_val; + } + + *out_ptr = max_val; + }, + input, output); +} + +template <typename T> +void neon_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp, + ITensor *out, float beta, bool is_log, const Window &window) +{ + static_assert(std::is_same<T, qasymm8_t>::value + || std::is_same<T, qasymm8_signed_t>::value, + "quantized type should be either qasymm8_t or qasymm8_signed_t."); + + const int start_x = in->info()->valid_region().anchor.x(); + const int input_width = in->info()->valid_region().shape.x(); + + const float scale_beta = -beta * in->info()->quantization_info().uniform().scale; + const auto scale_beta_vec = vdupq_n_f32(scale_beta); + + Iterator in_it(in, window); + Iterator max_it(max, window); + Iterator out_it(out, window); + constexpr int vec_size = 16; + + execute_window_loop(window, [&](const Coordinates &) + { + /* Get pointers */ + const auto in_ptr = reinterpret_cast<const T *>(in_it.ptr()) + start_x; + const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x; + const auto tmp_ptr = reinterpret_cast<float *>(tmp); + + float sum{}; + float sum_inversed{}; + + /* Compute exponentials and sum */ + { + /* Get max value */ + const auto max_val = *reinterpret_cast<const T *>(max_it.ptr()); + const auto vec_max = wrapper::vdup_n(max_val, wrapper::traits::vector_128_tag{}); + + /* Init sum to zero */ + float32x4x4_t vec_sum = + { + vdupq_n_f32(0.f), + vdupq_n_f32(0.f), + vdupq_n_f32(0.f), + vdupq_n_f32(0.f), + }; + + /* Loop over row and compute exponentials and sum */ + int x = 0; + for(; x <= (input_width - vec_size); x += vec_size) + { + auto vec_elements = wrapper::vloadq(in_ptr + x); + vec_elements = wrapper::vqsub(vec_max, vec_elements); + auto vec_elements_flt = convert_int_to_float<float32x4x4_t>(vec_elements); + + if(is_log) + { + vec_elements_flt.val[0] = vmulq_f32(vec_elements_flt.val[0], scale_beta_vec); + vec_elements_flt.val[1] = vmulq_f32(vec_elements_flt.val[1], scale_beta_vec); + vec_elements_flt.val[2] = vmulq_f32(vec_elements_flt.val[2], scale_beta_vec); + vec_elements_flt.val[3] = vmulq_f32(vec_elements_flt.val[3], scale_beta_vec); + vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vexpq_f32(vec_elements_flt.val[0])); + vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vexpq_f32(vec_elements_flt.val[1])); + vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vexpq_f32(vec_elements_flt.val[2])); + vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vexpq_f32(vec_elements_flt.val[3])); + } + else + { + vec_elements_flt.val[0] = vexpq_f32(vmulq_f32(vec_elements_flt.val[0], scale_beta_vec)); + vec_elements_flt.val[1] = vexpq_f32(vmulq_f32(vec_elements_flt.val[1], scale_beta_vec)); + vec_elements_flt.val[2] = vexpq_f32(vmulq_f32(vec_elements_flt.val[2], scale_beta_vec)); + vec_elements_flt.val[3] = vexpq_f32(vmulq_f32(vec_elements_flt.val[3], scale_beta_vec)); + vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vec_elements_flt.val[0]); + vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vec_elements_flt.val[1]); + vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vec_elements_flt.val[2]); + vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vec_elements_flt.val[3]); + } + + vst4q_f32(tmp_ptr + x, vec_elements_flt); + } + + /* Reduce sum */ + const auto sum_16_byte = vaddq_f32(vaddq_f32(vec_sum.val[0], vec_sum.val[1]), vaddq_f32(vec_sum.val[2], vec_sum.val[3])); + auto sum_res = vpadd_f32(vget_high_f32(sum_16_byte), vget_low_f32(sum_16_byte)); + sum_res = vpadd_f32(sum_res, sum_res); + sum = wrapper::vgetlane(sum_res, 0); + + /* Run remaining elements */ + for(; x < input_width; ++x) + { + float element{}; + if(is_log) + { + element = (max_val - in_ptr[x]) * scale_beta; + sum += std::exp(element); + } + else + { + element = std::exp((max_val - in_ptr[x]) * scale_beta); + sum += element; + } + + tmp_ptr[x] = element; + } + + if(!is_log) + { + sum_inversed = 256.f / sum; + } + else + { + sum = std::log(sum); + } + } + + /* Normalize exponentials */ + { + constexpr bool is_qasymm8_signed = std::is_same<T, qasymm8_signed_t>::value; + /* Loop over row and compute softmax */ + int x = 0; + for(; x <= (input_width - vec_size); x += vec_size) + { + using int_vec_type = wrapper::traits::neon_vector_t<T, 16>; + float32x4x4_t vec_in = vld4q_f32(tmp_ptr + x); + int_vec_type normalized_value{}; + if(is_log) + { + const float32x4x4_t sub = + { + vsubq_f32(vec_in.val[0], vdupq_n_f32(sum)), + vsubq_f32(vec_in.val[1], vdupq_n_f32(sum)), + vsubq_f32(vec_in.val[2], vdupq_n_f32(sum)), + vsubq_f32(vec_in.val[3], vdupq_n_f32(sum)), + }; + normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(sub); + } + else + { + float32x4x4_t mul = + { + vmulq_f32(vec_in.val[0], vdupq_n_f32(sum_inversed)), + vmulq_f32(vec_in.val[1], vdupq_n_f32(sum_inversed)), + vmulq_f32(vec_in.val[2], vdupq_n_f32(sum_inversed)), + vmulq_f32(vec_in.val[3], vdupq_n_f32(sum_inversed)), + }; + + if(is_qasymm8_signed) + { + const auto offset_vec = wrapper::vdup_n(128.f, wrapper::traits::vector_128_tag{}); + mul.val[0] = wrapper::vsub(mul.val[0], offset_vec); + mul.val[1] = wrapper::vsub(mul.val[1], offset_vec); + mul.val[2] = wrapper::vsub(mul.val[2], offset_vec); + mul.val[3] = wrapper::vsub(mul.val[3], offset_vec); + } + + normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(mul); + } + wrapper::vstore(out_ptr + x, normalized_value); + } + /* Run remaining elements */ + for(; x < input_width; ++x) + { + if(is_log) + { + out_ptr[x] = utils::cast::saturate_cast<T>(tmp_ptr[x] - sum); + } + else + { + out_ptr[x] = utils::cast::saturate_cast<T>((tmp_ptr[x] * sum_inversed) - (is_qasymm8_signed ? 128.f : 0)); + } + } + } + }, + in_it, max_it, out_it); +} + +template <typename T> +void neon_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, + ITensor *out, const float beta, bool is_log, const Window &window) +{ + const int start_x = in->info()->valid_region().anchor.x(); + const int input_width = in->info()->valid_region().shape.x(); + + Iterator in_it(in, window); + Iterator max_it(max, window); + Iterator out_it(out, window); + + /** SIMD vector tag type. */ + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; + + constexpr int vec_size = 16 / sizeof(T); + const int sum_stages = log2(vec_size / 2); + + execute_window_loop(window, [&](const Coordinates &) + { + /* Get pointers */ + const auto in_ptr = reinterpret_cast<const T *>(in_it.ptr()) + start_x; + const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x; + const auto tmp_ptr = reinterpret_cast<T *>(tmp); + + T sum{}; + T sum_inversed{}; + + /* Compute exponentials and sum */ + { + /* Get max value */ + const auto max_val = *reinterpret_cast<const T *>(max_it.ptr()); + const auto vec_max = wrapper::vdup_n(max_val, ExactTagType{}); + + /* Init sum to zero */ + auto vec_sum = wrapper::vdup_n(static_cast<T>(0), ExactTagType{}); + + /* Loop over row and compute exponentials and sum */ + int x = 0; + for(; x <= (input_width - vec_size); x += vec_size) + { + auto vec_elements = wrapper::vloadq(in_ptr + x); + vec_elements = wrapper::vsub(vec_elements, vec_max); + if(is_log) + { + vec_elements = wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{})); + vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements)); + } + else + { + vec_elements = wrapper::vexpq(wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{}))); + vec_sum = wrapper::vadd(vec_sum, vec_elements); + } + wrapper::vstore(tmp_ptr + x, vec_elements); + } + + /* Reduce sum */ + auto sum_res = wrapper::vpadd(wrapper::vgethigh(vec_sum), wrapper::vgetlow(vec_sum)); + for(int i = 0; i < sum_stages; ++i) + { + sum_res = wrapper::vpadd(sum_res, sum_res); + } + sum = wrapper::vgetlane(sum_res, 0); + + /* Run remaining elements */ + for(; x < input_width; ++x) + { + T element{}; + + if(is_log) + { + element = (in_ptr[x] - max_val) * beta; + sum += std::exp(element); + } + else + { + element = std::exp((in_ptr[x] - max_val) * beta); + sum += element; + } + tmp_ptr[x] = element; + } + + if(!is_log) + { + sum_inversed = T(1) / sum; + } + else + { + sum = static_cast<T>(std::log(sum)); + } + } + + /* Normalize exponentials */ + { + /* Loop over row and compute softmax */ + int x = 0; + for(; x <= (input_width - vec_size); x += vec_size) + { + auto vec_in = wrapper::vloadq(tmp_ptr + x); + auto normalized_value = wrapper::vdup_n(static_cast<T>(0), ExactTagType{}); + if(is_log) + { + normalized_value = wrapper::vsub(vec_in, wrapper::vdup_n(static_cast<T>(sum), ExactTagType{})); + } + else + { + normalized_value = wrapper::vmul(vec_in, wrapper::vdup_n(static_cast<T>(sum_inversed), ExactTagType{})); + } + wrapper::vstore(out_ptr + x, normalized_value); + } + /* Run remaining elements */ + for(; x < input_width; ++x) + { + if(is_log) + { + out_ptr[x] = tmp_ptr[x] - sum; + } + else + { + out_ptr[x] = tmp_ptr[x] * sum_inversed; + } + } + } + }, + in_it, max_it, out_it); +} + +} // namespace cpu +} // namespace arm_compute + +#endif /* SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H */ diff --git a/src/cpu/kernels/softmax/impl/sve/impl.cpp b/src/cpu/kernels/softmax/impl/sve/impl.cpp new file mode 100644 index 0000000000..7a577fd565 --- /dev/null +++ b/src/cpu/kernels/softmax/impl/sve/impl.cpp @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(ARM_COMPUTE_ENABLE_SVE) +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" +#include "arm_compute/core/utils/misc/Traits.h" +#include "src/core/NEON/SVEMath.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" +#include <arm_sve.h> + +namespace arm_compute +{ +namespace cpu +{ +template <typename ScalarType> +void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window) +{ + const auto all_true_pg = wrapper::svptrue<ScalarType>(); + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + + Window win{ window }; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + Iterator input(in, win); + Iterator output(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + // Get pointers + const auto in_ptr = reinterpret_cast<const ScalarType *>(input.ptr()); + const auto out_ptr = reinterpret_cast<ScalarType *>(output.ptr()); + + // Init max value + auto vec_max = wrapper::svdup_n(support::cpp11::lowest<ScalarType>()); + + int x = window_start_x; + svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); + do + { + const auto current_value = svld1(pg, in_ptr + x); + vec_max = svmax_m(pg, vec_max, current_value); + + x += wrapper::svcnt<ScalarType>(); + pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); + } + while(svptest_any(all_true_pg, pg)); + + auto max_val = svmaxv(all_true_pg, vec_max); + + *out_ptr = max_val; + }, + input, output); +} + +template <typename ScalarType> +void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, + ITensor *out, const float beta, bool is_log, const Window &window) +{ + const int start_x = in->info()->valid_region().anchor.x(); + const int input_width = in->info()->valid_region().shape.x(); + + Iterator in_it(in, window); + Iterator max_it(max, window); + Iterator out_it(out, window); + + const auto all_true_pg = wrapper::svptrue<ScalarType>(); + + execute_window_loop(window, [&](const Coordinates &) + { + /* Get pointers */ + const auto in_ptr = reinterpret_cast<const ScalarType *>(in_it.ptr()) + start_x; + const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr()) + start_x; + const auto tmp_ptr = reinterpret_cast<ScalarType *>(tmp); + + ScalarType sum{ 0 }; + + /* Compute exponentials and sum */ + { + /* Get max value */ + const auto max_val = *reinterpret_cast<const ScalarType *>(max_it.ptr()); + const auto vec_max = wrapper::svdup_n(max_val); + + /* Init sum to zero */ + auto vec_sum = wrapper::svdup_n(static_cast<ScalarType>(0)); + + /* Loop over row and compute exponentials and sum */ + int x = 0; + svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width); + do + { + auto vec_elements = svld1(pg, in_ptr + x); + vec_elements = svsub_z(pg, vec_elements, vec_max); + if(is_log) + { + vec_elements = svmul_z(pg, vec_elements, wrapper::svdup_n(static_cast<ScalarType>(beta))); + vec_sum = svadd_m(pg, vec_sum, wrapper::svexp_z(pg, vec_elements)); + } + else + { + vec_elements = wrapper::svexp_z(pg, svmul_z(pg, vec_elements, wrapper::svdup_n(static_cast<ScalarType>(beta)))); + vec_sum = svadd_m(pg, vec_sum, vec_elements); + } + svst1(pg, tmp_ptr + x, vec_elements); + + x += wrapper::svcnt<ScalarType>(); + pg = wrapper::svwhilelt<ScalarType>(x, input_width); + } + while(svptest_any(all_true_pg, pg)); + + /* Reduce sum */ + sum = svaddv(all_true_pg, vec_sum); + + if(is_log) + { + sum = static_cast<ScalarType>(std::log(sum)); + } + else + { + sum = ScalarType(1) / sum; + } + } + + /* Normalize exponentials */ + { + /* Loop over row and compute softmax */ + int x = 0; + svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width); + do + { + auto vec_in = svld1(pg, tmp_ptr + x); + auto normalized_value = wrapper::svdup_n(static_cast<ScalarType>(0)); + if(is_log) + { + normalized_value = svsub_z(pg, vec_in, wrapper::svdup_n(static_cast<ScalarType>(sum))); + } + else + { + normalized_value = svmul_z(pg, vec_in, wrapper::svdup_n(static_cast<ScalarType>(sum))); + } + svst1(pg, out_ptr + x, normalized_value); + + x += wrapper::svcnt<ScalarType>(); + pg = wrapper::svwhilelt<ScalarType>(x, input_width); + } + while(svptest_any(all_true_pg, pg)); + } + }, + in_it, max_it, out_it); +} + +template void sve_logits_1d_max<float>(const ITensor *in, ITensor *out, const Window &window); +template void sve_logits_1d_max<float16_t>(const ITensor *in, ITensor *out, const Window &window); +template void sve_logits_1d_max<qasymm8_t>(const ITensor *in, ITensor *out, const Window &window); +template void sve_logits_1d_max<qasymm8_signed_t>(const ITensor *in, ITensor *out, const Window &window); + +template void sve_softmax_logits_1d_float<float>(const ITensor *in, const ITensor *max, void *const tmp, + ITensor *out, const float beta, bool is_log, const Window &window); +template void sve_softmax_logits_1d_float<float16_t>(const ITensor *in, const ITensor *max, void *const tmp, + ITensor *out, const float beta, bool is_log, const Window &window); +} // namespace cpu +} // namespace arm_compute +#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ diff --git a/src/cpu/kernels/softmax/impl/sve/list.h b/src/cpu/kernels/softmax/impl/sve/list.h new file mode 100644 index 0000000000..b4e1e1b186 --- /dev/null +++ b/src/cpu/kernels/softmax/impl/sve/list.h @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_SVE_KERNELS_SOFTMAX_LIST_H +#define SRC_CORE_SVE_KERNELS_SOFTMAX_LIST_H + +#if defined(ARM_COMPUTE_ENABLE_SVE) +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" +#include "src/core/NEON/SVEMath.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" +#include <arm_sve.h> + +namespace arm_compute +{ +namespace cpu +{ +template <typename ScalarType> +void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window); + +template <typename ScalarType> +void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, + ITensor *out, const float beta, bool is_log, const Window &window); + +#if defined(ARM_COMPUTE_ENABLE_SVE2) +template <typename ScalarType> +void sve_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp, + ITensor *out, float beta, bool is_log, const Window &window) +{ + const int start_x = in->info()->valid_region().anchor.x(); + const int input_width = in->info()->valid_region().shape.x(); + + const float scale_beta = -beta * in->info()->quantization_info().uniform().scale; + const auto scale_beta_vec = svdup_n_f32(scale_beta); + + Iterator in_it(in, window); + Iterator max_it(max, window); + Iterator out_it(out, window); + const auto all_true_pg = wrapper::svptrue<ScalarType>(); + using SVEType = typename wrapper::traits::sve_vector<ScalarType>::type; + + const int inc_1 = static_cast<int>(svcntw()); + const int inc_2 = static_cast<int>(2 * svcntw()); + const int inc_3 = static_cast<int>(3 * svcntw()); + + execute_window_loop(window, [&](const Coordinates &) + { + /* Get pointers */ + const auto in_ptr = reinterpret_cast<const ScalarType *>(in_it.ptr()) + start_x; + const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr()) + start_x; + const auto tmp_ptr = reinterpret_cast<float *>(tmp); + + float sum{}; + + /* Compute exponentials and sum */ + { + /* Get max value */ + const auto max_val = *reinterpret_cast<const ScalarType *>(max_it.ptr()); + const auto vec_max = wrapper::svdup_n(max_val); + + /* Init sum to zero */ + auto vec_sum_0 = svdup_n_f32(0.f); + auto vec_sum_1 = svdup_n_f32(0.f); + auto vec_sum_2 = svdup_n_f32(0.f); + auto vec_sum_3 = svdup_n_f32(0.f); + + /* Loop over row and compute exponentials and sum */ + int x = 0; + svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width); + svbool_t pg_0 = svunpklo(svunpklo(pg)); + svbool_t pg_1 = svunpkhi(svunpklo(pg)); + svbool_t pg_2 = svunpklo(svunpkhi(pg)); + svbool_t pg_3 = svunpkhi(svunpkhi(pg)); + do + { + auto vec_elements = svld1(pg, in_ptr + x); + vec_elements = svsub_z(pg, vec_max, vec_elements); + + auto vec_elements_flt_0 = svcvt_f32_z(pg_0, svunpklo(svunpklo(vec_elements))); + auto vec_elements_flt_1 = svcvt_f32_z(pg_1, svunpkhi(svunpklo(vec_elements))); + auto vec_elements_flt_2 = svcvt_f32_z(pg_2, svunpklo(svunpkhi(vec_elements))); + auto vec_elements_flt_3 = svcvt_f32_z(pg_3, svunpkhi(svunpkhi(vec_elements))); + + if(is_log) + { + vec_elements_flt_0 = svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec); + vec_elements_flt_1 = svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec); + vec_elements_flt_2 = svmul_f32_z(pg_2, vec_elements_flt_2, scale_beta_vec); + vec_elements_flt_3 = svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec); + vec_sum_0 = svadd_f32_m(pg_0, vec_sum_0, svexp_f32_z(pg_0, vec_elements_flt_0)); + vec_sum_1 = svadd_f32_m(pg_1, vec_sum_1, svexp_f32_z(pg_1, vec_elements_flt_1)); + vec_sum_2 = svadd_f32_m(pg_2, vec_sum_2, svexp_f32_z(pg_2, vec_elements_flt_2)); + vec_sum_3 = svadd_f32_m(pg_3, vec_sum_3, svexp_f32_z(pg_3, vec_elements_flt_3)); + } + else + { + vec_elements_flt_0 = svexp_f32_z(pg_0, svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec)); + vec_elements_flt_1 = svexp_f32_z(pg_1, svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec)); + vec_elements_flt_2 = svexp_f32_z(pg_2, svmul_f32_z(pg_2, vec_elements_flt_2, scale_beta_vec)); + vec_elements_flt_3 = svexp_f32_z(pg_3, svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec)); + vec_sum_0 = svadd_f32_m(pg_0, vec_sum_0, vec_elements_flt_0); + vec_sum_1 = svadd_f32_m(pg_1, vec_sum_1, vec_elements_flt_1); + vec_sum_2 = svadd_f32_m(pg_2, vec_sum_2, vec_elements_flt_2); + vec_sum_3 = svadd_f32_m(pg_3, vec_sum_3, vec_elements_flt_3); + } + + svst1_f32(pg_0, tmp_ptr + x, vec_elements_flt_0); + svst1_f32(pg_1, tmp_ptr + x + inc_1, vec_elements_flt_1); + svst1_f32(pg_2, tmp_ptr + x + inc_2, vec_elements_flt_2); + svst1_f32(pg_3, tmp_ptr + x + inc_3, vec_elements_flt_3); + + x += wrapper::svcnt<ScalarType>(); + pg = wrapper::svwhilelt<ScalarType>(x, input_width); + pg_0 = svunpklo(svunpklo(pg)); + pg_1 = svunpkhi(svunpklo(pg)); + pg_2 = svunpklo(svunpkhi(pg)); + pg_3 = svunpkhi(svunpkhi(pg)); + } + while(svptest_any(all_true_pg, pg)); + + /* Reduce sum */ + const auto vec_sum = svadd_f32_z(all_true_pg, svadd_f32_z(all_true_pg, vec_sum_0, vec_sum_1), svadd_f32_z(all_true_pg, vec_sum_2, vec_sum_3)); + sum = svaddv_f32(all_true_pg, vec_sum); + + /* Run remaining elements */ + x = 0; + if(is_log) + { + sum = std::log(sum); + } + else + { + sum = 256.f / sum; + } + } + + /* Normalize exponentials */ + { + constexpr bool is_qasymm8_signed = std::is_same<ScalarType, qasymm8_signed_t>::value; + /* Loop over row and compute softmax */ + int x = 0; + svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width); + svbool_t pg_0 = svunpklo(svunpklo(pg)); + svbool_t pg_1 = svunpkhi(svunpklo(pg)); + svbool_t pg_2 = svunpklo(svunpkhi(pg)); + svbool_t pg_3 = svunpkhi(svunpkhi(pg)); + do + { + auto vec_in_0 = svld1_f32(pg_0, tmp_ptr + x); + auto vec_in_1 = svld1_f32(pg_1, tmp_ptr + x + inc_1); + auto vec_in_2 = svld1_f32(pg_2, tmp_ptr + x + inc_2); + auto vec_in_3 = svld1_f32(pg_3, tmp_ptr + x + inc_3); + + svfloat32_t res_0{}; + svfloat32_t res_1{}; + svfloat32_t res_2{}; + svfloat32_t res_3{}; + + if(is_log) + { + res_0 = svsub_f32_z(pg_0, vec_in_0, svdup_n_f32(sum)); + res_1 = svsub_f32_z(pg_1, vec_in_1, svdup_n_f32(sum)); + res_2 = svsub_f32_z(pg_2, vec_in_2, svdup_n_f32(sum)); + res_3 = svsub_f32_z(pg_3, vec_in_3, svdup_n_f32(sum)); + } + else + { + res_0 = svmul_f32_z(pg_0, vec_in_0, svdup_n_f32(sum)); + res_1 = svmul_f32_z(pg_1, vec_in_1, svdup_n_f32(sum)); + res_2 = svmul_f32_z(pg_2, vec_in_2, svdup_n_f32(sum)); + res_3 = svmul_f32_z(pg_3, vec_in_3, svdup_n_f32(sum)); + + if(is_qasymm8_signed) + { + const auto offset_vec = svdup_n_f32(128.f); + res_0 = svsub_z(pg_0, vec_in_0, offset_vec); + res_1 = svsub_z(pg_1, vec_in_1, offset_vec); + res_2 = svsub_z(pg_2, vec_in_2, offset_vec); + res_3 = svsub_z(pg_3, vec_in_3, offset_vec); + } + } + + // Store value + const auto out = convert_float_to_int<SVEType>(res_0, res_1, res_2, res_3); + svst1(pg, out_ptr + x, out); + x += wrapper::svcnt<ScalarType>(); + pg = wrapper::svwhilelt<ScalarType>(x, input_width); + pg_0 = svunpklo(svunpklo(pg)); + pg_1 = svunpkhi(svunpklo(pg)); + pg_2 = svunpklo(svunpkhi(pg)); + pg_3 = svunpkhi(svunpkhi(pg)); + } + while(svptest_any(all_true_pg, pg)); + } + }, + in_it, max_it, out_it); +} +#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */ +} // namespace cpu +} // namespace arm_compute +#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */ + +#endif /* SRC_CORE_SVE_KERNELS_SOFTMAX_LIST_H */ diff --git a/src/cpu/kernels/sub/neon/list.h b/src/cpu/kernels/sub/neon/list.h new file mode 100644 index 0000000000..ac1346001a --- /dev/null +++ b/src/cpu/kernels/sub/neon/list.h @@ -0,0 +1,159 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_NEON_KERNELS_SUB_LIST_H +#define SRC_CORE_NEON_KERNELS_SUB_LIST_H + +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" +#include "src/core/NEON/wrapper/wrapper.h" + +namespace arm_compute +{ +namespace cpu +{ +#define DECLARE_SUB_KERNEL(func_name) \ + void func_name(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) + +DECLARE_SUB_KERNEL(sub_qasymm8_neon); +DECLARE_SUB_KERNEL(sub_qasymm8_signed_neon); +DECLARE_SUB_KERNEL(sub_qsymm16_neon); + +#undef DECLARE_SUB_KERNEL + +template <typename T> +void sub_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +{ + /** SIMD vector tag type. */ + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>; + + bool is_sat = policy == ConvertPolicy::SATURATE; + + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + constexpr int window_step_x = 16 / sizeof(T); + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); + + Iterator input1(src0, window.broadcast_if_dimension_le_one(src0->info()->tensor_shape())); + Iterator input2(src1, window.broadcast_if_dimension_le_one(src1->info()->tensor_shape())); + Iterator output(dst, window); + + if(is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0; + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(dst, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<T *>(output.ptr()); + + const T broadcast_value = *reinterpret_cast<const T *>(broadcast_input.ptr()); + const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{}); + + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); + auto res = is_sat ? wrapper::vqsub(broadcast_value_vec, non_broadcast_v) : wrapper::vsub(broadcast_value_vec, non_broadcast_v); + if(is_broadcast_input_2) + { + res = wrapper::vmul(res, wrapper::vdup_n(static_cast<T>(-1), ExactTagType{})); + } + wrapper::vstore(output_ptr + x, res); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const auto non_broadcast_v = *(non_broadcast_input_ptr + x); + auto res = is_sat ? wrapper::sub_sat(broadcast_value, non_broadcast_v) : broadcast_value - non_broadcast_v; + if(is_broadcast_input_2) + { + res = static_cast<T>(-1) * res; + } + + *(output_ptr + x) = res; + } + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src0, input1_win); + Iterator input2(src1, input2_win); + Iterator output(dst, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<T *>(output.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto val1 = wrapper::vloadq(input1_ptr + x); + const auto val2 = wrapper::vloadq(input2_ptr + x); + const auto res = is_sat ? wrapper::vqsub(val1, val2) : wrapper::vsub(val1, val2); + wrapper::vstore(output_ptr + x, res); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const auto val1 = *(input1_ptr + x); + const auto val2 = *(input2_ptr + x); + *(output_ptr + x) = is_sat ? wrapper::sub_sat(val1, val2) : val1 - val2; + } + }, + input1, input2, output); + } +} +} // namespace cpu +} // namespace arm_compute +#endif // SRC_CORE_NEON_KERNELS_SUB_LIST_H diff --git a/src/cpu/kernels/sub/neon/qasymm8.cpp b/src/cpu/kernels/sub/neon/qasymm8.cpp new file mode 100644 index 0000000000..8f4cd8bdbb --- /dev/null +++ b/src/cpu/kernels/sub/neon/qasymm8.cpp @@ -0,0 +1,230 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" +#include "src/core/helpers/WindowHelpers.h" + +namespace arm_compute +{ +namespace cpu +{ +void sub_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +{ + ARM_COMPUTE_UNUSED(policy); + + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); + + const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform(); + const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); + + const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale); + const float32x4_t voffseto = vdupq_n_f32(oq_info.offset); + + if(is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0; + const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); + const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); + const float32x4_t vscale1 = is_broadcast_input_2 ? vdupq_n_f32(iq1_info.scale) : vdupq_n_f32(iq2_info.scale); + const float32x4_t vscale2 = is_broadcast_input_2 ? vdupq_n_f32(iq2_info.scale) : vdupq_n_f32(iq1_info.scale); + const int32x4_t voffset1 = is_broadcast_input_2 ? vdupq_n_s32(iq1_info.offset) : vdupq_n_s32(iq2_info.offset); + const int32x4_t voffset2 = is_broadcast_input_2 ? vdupq_n_s32(iq2_info.offset) : vdupq_n_s32(iq1_info.offset); + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(dst, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); + + const auto broadcast_value = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr()); + const auto broadcast_value_vec = wrapper::vdup_n(static_cast<uint8_t>(broadcast_value), wrapper::traits::vector_128_tag{}); + + const float32x4x4_t bf = + { + { + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(broadcast_value_vec))))), voffset2)), vscale2), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(broadcast_value_vec))))), voffset2)), vscale2), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(broadcast_value_vec))))), voffset2)), vscale2), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(broadcast_value_vec))))), voffset2)), vscale2), + } + }; + + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = wrapper::vloadq(non_broadcast_input_ptr + x); + + const float32x4x4_t af = + { + { + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1), + } + }; + + const int32x4x4_t rf = + { + { +#ifdef __aarch64_ + vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), + vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), + vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[2], af.val[2]) : vsubq_f32(af.val[2], bf.val[2]), invvscaleo)), + vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[3], af.val[3]) : vsubq_f32(af.val[3], bf.val[3]), invvscaleo)), +#else //__aarch64__ + vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), + vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), + vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[2], af.val[2]) : vsubq_f32(af.val[2], bf.val[2]), invvscaleo)), + vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[3], af.val[3]) : vsubq_f32(af.val[3], bf.val[3]), invvscaleo)), +#endif //__aarch64__ + } + }; + + const auto pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); + const auto pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3]))); + wrapper::vstore(output_ptr + x, wrapper::vcombine(pa, pb)); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x) - non_broadcast_qinfo.offset) * non_broadcast_qinfo.scale; + const float bfs = static_cast<int32_t>(broadcast_value - broadcast_qinfo.offset) * broadcast_qinfo.scale; + *(output_ptr + x) = quantize_qasymm8(is_broadcast_input_2 ? afs - bfs : bfs - afs, dst->info()->quantization_info()); + } + }, + broadcast_input, non_broadcast_input, output); + } + else + { + const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale); + const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale); + const int32x4_t voffset1 = vdupq_n_s32(iq1_info.offset); + const int32x4_t voffset2 = vdupq_n_s32(iq2_info.offset); + + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src0, input1_win); + Iterator input2(src1, input2_win); + Iterator output(dst, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = wrapper::vloadq(input1_ptr + x); + const auto b = wrapper::vloadq(input2_ptr + x); + + const float32x4x4_t af = + { + { + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1), + } + }; + + const float32x4x4_t bf = + { + { + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(b))))), voffset2)), vscale2), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(b))))), voffset2)), vscale2), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(b))))), voffset2)), vscale2), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(b))))), voffset2)), vscale2), + } + }; + + const int32x4x4_t rf = + { + { +#ifdef __aarch64__ + vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), + vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), + vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[2], bf.val[2]), invvscaleo)), + vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[3], bf.val[3]), invvscaleo)), +#else //__aarch64__ + vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), + vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), + vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[2], bf.val[2]), invvscaleo)), + vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[3], bf.val[3]), invvscaleo)), +#endif //__aarch64__ + } + }; + + const auto pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); + const auto pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3]))); + wrapper::vstore(output_ptr + x, wrapper::vcombine(pa, pb)); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const float afs = static_cast<int32_t>((*(input1_ptr + x)) - iq1_info.offset) * iq1_info.scale; + const float bfs = static_cast<int32_t>((*(input2_ptr + x)) - iq2_info.offset) * iq2_info.scale; + + *(output_ptr + x) = quantize_qasymm8((afs - bfs), dst->info()->quantization_info()); + } + }, + input1, input2, output); + } +} + +} // namespace cpu +} // namespace arm_compute
\ No newline at end of file diff --git a/src/cpu/kernels/sub/neon/qasymm8_signed.cpp b/src/cpu/kernels/sub/neon/qasymm8_signed.cpp new file mode 100644 index 0000000000..2c9e411743 --- /dev/null +++ b/src/cpu/kernels/sub/neon/qasymm8_signed.cpp @@ -0,0 +1,229 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" +#include "src/core/helpers/WindowHelpers.h" + +namespace arm_compute +{ +namespace cpu +{ +void sub_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +{ + ARM_COMPUTE_UNUSED(policy); + + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const int window_step_x = 16; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); + + const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform(); + const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); + + const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale); + const float32x4_t voffseto = vdupq_n_f32(oq_info.offset); + + if(is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0; + const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); + const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); + const float32x4_t vscale1 = is_broadcast_input_2 ? vdupq_n_f32(iq1_info.scale) : vdupq_n_f32(iq2_info.scale); + const float32x4_t vscale2 = is_broadcast_input_2 ? vdupq_n_f32(iq2_info.scale) : vdupq_n_f32(iq1_info.scale); + const int32x4_t voffset1 = is_broadcast_input_2 ? vdupq_n_s32(iq1_info.offset) : vdupq_n_s32(iq2_info.offset); + const int32x4_t voffset2 = is_broadcast_input_2 ? vdupq_n_s32(iq2_info.offset) : vdupq_n_s32(iq1_info.offset); + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(dst, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); + + const auto broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr()); + const auto broadcast_value_vec = wrapper::vdup_n(static_cast<int8_t>(broadcast_value), wrapper::traits::vector_128_tag{}); + + const float32x4x4_t bf = + { + { + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(broadcast_value_vec))))), voffset2)), vscale2), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(broadcast_value_vec))))), voffset2)), vscale2), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(broadcast_value_vec))))), voffset2)), vscale2), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(broadcast_value_vec))))), voffset2)), vscale2), + } + }; + + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = wrapper::vloadq(non_broadcast_input_ptr + x); + + const float32x4x4_t af = + { + { + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1), + } + }; + + const int32x4x4_t rf = + { + { +#ifdef __aarch64_ + vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), + vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), + vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[2], af.val[2]) : vsubq_f32(af.val[2], bf.val[2]), invvscaleo)), + vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[3], af.val[3]) : vsubq_f32(af.val[3], bf.val[3]), invvscaleo)), +#else //__aarch64__ + vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), + vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), + vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[2], af.val[2]) : vsubq_f32(af.val[2], bf.val[2]), invvscaleo)), + vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[3], af.val[3]) : vsubq_f32(af.val[3], bf.val[3]), invvscaleo)), +#endif //__aarch64__ + } + }; + + const auto pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); + const auto pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3]))); + wrapper::vstore(output_ptr + x, wrapper::vcombine(pa, pb)); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x) - non_broadcast_qinfo.offset) * non_broadcast_qinfo.scale; + const float bfs = static_cast<int32_t>(broadcast_value - broadcast_qinfo.offset) * broadcast_qinfo.scale; + *(output_ptr + x) = quantize_qasymm8_signed(is_broadcast_input_2 ? afs - bfs : bfs - afs, dst->info()->quantization_info()); + } + }, + broadcast_input, non_broadcast_input, output); + } + else + { + const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale); + const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale); + const int32x4_t voffset1 = vdupq_n_s32(iq1_info.offset); + const int32x4_t voffset2 = vdupq_n_s32(iq2_info.offset); + + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src0, input1_win); + Iterator input2(src1, input2_win); + Iterator output(dst, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto a = wrapper::vloadq(input1_ptr + x); + const auto b = wrapper::vloadq(input2_ptr + x); + + const float32x4x4_t af = + { + { + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1), + } + }; + + const float32x4x4_t bf = + { + { + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(b))))), voffset2)), vscale2), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(b))))), voffset2)), vscale2), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(b))))), voffset2)), vscale2), + vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(b))))), voffset2)), vscale2), + } + }; + + const int32x4x4_t rf = + { + { +#ifdef __aarch64__ + vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), + vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), + vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[2], bf.val[2]), invvscaleo)), + vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[3], bf.val[3]), invvscaleo)), +#else //__aarch64__ + vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), + vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), + vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[2], bf.val[2]), invvscaleo)), + vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[3], bf.val[3]), invvscaleo)), +#endif //__aarch64__ + } + }; + + const auto pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]))); + const auto pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3]))); + wrapper::vstore(output_ptr + x, wrapper::vcombine(pa, pb)); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const float afs = static_cast<int32_t>((*(input1_ptr + x)) - iq1_info.offset) * iq1_info.scale; + const float bfs = static_cast<int32_t>((*(input2_ptr + x)) - iq2_info.offset) * iq2_info.scale; + + *(output_ptr + x) = quantize_qasymm8_signed((afs - bfs), dst->info()->quantization_info()); + } + }, + input1, input2, output); + } +} +} // namespace cpu +} // namespace arm_compute
\ No newline at end of file diff --git a/src/cpu/kernels/sub/neon/qsymm16.cpp b/src/cpu/kernels/sub/neon/qsymm16.cpp new file mode 100644 index 0000000000..4dfdc0e78c --- /dev/null +++ b/src/cpu/kernels/sub/neon/qsymm16.cpp @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" +#include "src/core/helpers/WindowHelpers.h" + +namespace arm_compute +{ +namespace cpu +{ +void sub_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +{ + ARM_COMPUTE_UNUSED(policy); + + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const int window_step_x = 8; + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); + + const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform(); + const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); + + const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale); + const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale); + const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale); + + if(is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0; + const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); + const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(dst, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast<const int16_t *>(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr()); + + const int16_t broadcast_value = *reinterpret_cast<const int16_t *>(broadcast_input.ptr()); + const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value); + + const float32x4x2_t bf = + { + { + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale2), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale2), + } + }; + const float bfs = static_cast<int32_t>(broadcast_value) * broadcast_qinfo.scale; + + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int16x8_t a = vld1q_s16(non_broadcast_input_ptr + x); + const float32x4x2_t af = + { + { + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1), + } + }; + + const int32x4x4_t rf = + { + { +#ifdef __aarch64__ + vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), + vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), +#else //__aarch64__ + vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), + vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), +#endif //__aarch64__ + } + }; + + const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])); + vst1q_s16(output_ptr + x, pa); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.scale; + *(output_ptr + x) = quantize_qsymm16(is_broadcast_input_2 ? (bfs - afs) : (afs - bfs), oq_info); + } + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src0, input1_win); + Iterator input2(src1, input2_win); + Iterator output(dst, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr()); + const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr()); + + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const int16x8_t a = vld1q_s16(input1_ptr + x); + const int16x8_t b = vld1q_s16(input2_ptr + x); + + const float32x4x2_t af = + { + { + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1), + } + }; + + const float32x4x2_t bf = + { + { + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(b))), vscale2), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(b))), vscale2), + } + }; + + const int32x4x2_t rf = + { + { +#ifdef __aarch64__ + vcvtnq_s32_f32(vmulq_f32(vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), + vcvtnq_s32_f32(vmulq_f32(vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), +#else //__aarch64__ + vcvtq_s32_f32(vmulq_f32(vsubq_f32(af.val[0], bf.val[0]), invvscaleo)), + vcvtq_s32_f32(vmulq_f32(vsubq_f32(af.val[1], bf.val[1]), invvscaleo)), +#endif //__aarch64__ + } + }; + + const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])); + vst1q_s16(output_ptr + x, pa); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const float afs = static_cast<int32_t>((*(input1_ptr + x))) * iq1_info.scale; + const float bfs = static_cast<int32_t>((*(input2_ptr + x))) * iq2_info.scale; + *(output_ptr + x) = quantize_qsymm16((afs - bfs), dst->info()->quantization_info()); + } + }, + input1, input2, output); + } +} +} // namespace cpu +} // namespace arm_compute
\ No newline at end of file |