From afd38f0c617d6f89b2b4532c6c44f116617e2b6f Mon Sep 17 00:00:00 2001 From: Felix Thomasmathibalan Date: Wed, 27 Sep 2023 17:46:17 +0100 Subject: Apply clang-format on repository Code is formatted as per a revised clang format configuration file(not part of this delivery). Version 14.0.6 is used. Exclusion List: - files with .cl extension - files that are not strictly C/C++ (e.g. Android.bp, Sconscript ...) And the following directories - compute_kernel_writer/validation/ - tests/ - include/ - src/core/NEON/kernels/convolution/ - src/core/NEON/kernels/arm_gemm/ - src/core/NEON/kernels/arm_conv/ - data/ There will be a follow up for formatting of .cl files and the files under tests/ and compute_kernel_writer/validation/. Signed-off-by: Felix Thomasmathibalan Change-Id: Ib7eb1fcf4e7537b9feaefcfc15098a804a3fde0a Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10391 Benchmark: Arm Jenkins Tested-by: Arm Jenkins Reviewed-by: Gunes Bayir --- .../kernels/NEBatchNormalizationLayerKernel.cpp | 302 +-- .../NEON/kernels/NEBatchNormalizationLayerKernel.h | 21 +- .../NEON/kernels/NEBatchToSpaceLayerKernel.cpp | 127 +- src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h | 13 +- src/core/NEON/kernels/NEBitwiseAndKernel.cpp | 17 +- src/core/NEON/kernels/NEBitwiseNotKernel.cpp | 14 +- src/core/NEON/kernels/NEBitwiseOrKernel.cpp | 18 +- src/core/NEON/kernels/NEBitwiseXorKernel.cpp | 18 +- .../NEON/kernels/NEBoundingBoxTransformKernel.cpp | 68 +- .../NEON/kernels/NEBoundingBoxTransformKernel.h | 8 +- .../NEON/kernels/NEChannelShuffleLayerKernel.cpp | 97 +- src/core/NEON/kernels/NECol2ImKernel.h | 4 +- src/core/NEON/kernels/NECropKernel.cpp | 238 +-- src/core/NEON/kernels/NECropKernel.h | 19 +- .../NEON/kernels/NEDepthToSpaceLayerKernel.cpp | 76 +- src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp | 149 +- src/core/NEON/kernels/NEFFTDigitReverseKernel.h | 6 +- src/core/NEON/kernels/NEFFTRadixStageKernel.cpp | 594 +++--- src/core/NEON/kernels/NEFFTRadixStageKernel.h | 14 +- src/core/NEON/kernels/NEFFTScaleKernel.cpp | 21 +- src/core/NEON/kernels/NEFFTScaleKernel.h | 4 +- src/core/NEON/kernels/NEFillBorderKernel.cpp | 225 ++- src/core/NEON/kernels/NEFillBorderKernel.h | 11 +- .../kernels/NEFuseBatchNormalizationKernel.cpp | 244 +-- .../NEON/kernels/NEFuseBatchNormalizationKernel.h | 39 +- src/core/NEON/kernels/NEGatherKernel.cpp | 80 +- src/core/NEON/kernels/NEGatherKernel.h | 5 +- .../kernels/NEGenerateProposalsLayerKernel.cpp | 48 +- .../NEON/kernels/NEGenerateProposalsLayerKernel.h | 2 +- .../kernels/NEInstanceNormalizationLayerKernel.cpp | 57 +- .../kernels/NEInstanceNormalizationLayerKernel.h | 8 +- src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp | 59 +- src/core/NEON/kernels/NEL2NormalizeLayerKernel.h | 3 +- src/core/NEON/kernels/NELogicalKernel.cpp | 91 +- src/core/NEON/kernels/NELogicalKernel.h | 5 +- .../kernels/NEMeanStdDevNormalizationKernel.cpp | 54 +- .../NEON/kernels/NENormalizationLayerKernel.cpp | 144 +- src/core/NEON/kernels/NENormalizationLayerKernel.h | 8 +- src/core/NEON/kernels/NEPadLayerKernel.cpp | 106 +- src/core/NEON/kernels/NEPadLayerKernel.h | 13 +- src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp | 166 +- src/core/NEON/kernels/NEPriorBoxLayerKernel.h | 14 +- .../kernels/NEQLSTMLayerNormalizationKernel.cpp | 118 +- .../NEON/kernels/NEQLSTMLayerNormalizationKernel.h | 33 +- src/core/NEON/kernels/NEROIAlignLayerKernel.cpp | 79 +- src/core/NEON/kernels/NEROIAlignLayerKernel.h | 5 +- src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp | 85 +- 
src/core/NEON/kernels/NEROIPoolingLayerKernel.h | 8 +- src/core/NEON/kernels/NERangeKernel.cpp | 90 +- src/core/NEON/kernels/NERangeKernel.h | 1 + .../NEON/kernels/NEReductionOperationKernel.cpp | 1955 ++++++++++---------- src/core/NEON/kernels/NEReductionOperationKernel.h | 3 +- src/core/NEON/kernels/NEReorderKernel.cpp | 70 +- src/core/NEON/kernels/NEReorderKernel.h | 33 +- src/core/NEON/kernels/NEReorgLayerKernel.cpp | 56 +- src/core/NEON/kernels/NEReverseKernel.cpp | 98 +- src/core/NEON/kernels/NEReverseKernel.h | 3 +- src/core/NEON/kernels/NESelectKernel.cpp | 156 +- src/core/NEON/kernels/NESelectKernel.h | 2 +- .../NEON/kernels/NESpaceToBatchLayerKernel.cpp | 161 +- src/core/NEON/kernels/NESpaceToBatchLayerKernel.h | 20 +- .../NEON/kernels/NESpaceToDepthLayerKernel.cpp | 59 +- src/core/NEON/kernels/NESpaceToDepthLayerKernel.h | 1 + src/core/NEON/kernels/NEStackLayerKernel.cpp | 55 +- src/core/NEON/kernels/NEStackLayerKernel.h | 10 +- src/core/NEON/kernels/NEStridedSliceKernel.cpp | 115 +- src/core/NEON/kernels/NEStridedSliceKernel.h | 23 +- src/core/NEON/kernels/NETileKernel.cpp | 47 +- src/core/NEON/kernels/assembly/depthwise.hpp | 270 +-- .../NEON/kernels/assembly/depthwise_common.hpp | 106 +- src/core/NEON/kernels/assembly/pool_common.hpp | 71 +- src/core/NEON/kernels/assembly/pooling.hpp | 210 ++- src/core/NEON/kernels/assembly/premultiply.hpp | 17 +- src/core/NEON/kernels/assembly/winograd.hpp | 181 +- .../kernels/batchnormalization/impl/NEON/fp16.cpp | 166 +- .../kernels/batchnormalization/impl/NEON/fp32.cpp | 166 +- .../kernels/batchnormalization/impl/SVE/fp16.cpp | 115 +- .../kernels/batchnormalization/impl/SVE/fp32.cpp | 115 +- .../NEON/kernels/batchnormalization/impl/list.h | 6 +- .../kernels/detail/NEActivationFunctionDetail.h | 7 +- .../NEON/kernels/detail/NEColorConvertHelper.inl | 735 ++++---- .../NEON/kernels/detail/NEDirectConvolution3x3.h | 80 +- .../kernels/detail/NEDirectConvolutionDetail.h | 507 ++--- 83 files changed, 4997 insertions(+), 4251 deletions(-) (limited to 'src/core/NEON/kernels') diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp index 108b199df7..deb89996a9 100644 --- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp +++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp @@ -28,18 +28,17 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + +#include "src/core/common/Registrars.h" #include "src/core/CPP/Validate.h" -#include "src/core/NEON/NEFixedPoint.h" -#include "src/core/NEON/NEMath.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - +#include "src/core/NEON/kernels/batchnormalization/impl/list.h" #include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/NEON/kernels/batchnormalization/impl/list.h" -#include "src/core/common/Registrars.h" - #include namespace arm_compute @@ -52,8 +51,15 @@ struct BatchNormalizationSelectorData const CPUInfo &ci; }; using BatchNormalizationSelectorPtr = std::add_pointer::type; -using BatchNormalizationKernelPtr = std::add_pointer::type; +using BatchNormalizationKernelPtr = std::add_pointer::type; struct BatchNormalizationKernel { @@ -62,41 +68,32 @@ struct BatchNormalizationKernel BatchNormalizationKernelPtr ukernel; }; -static const BatchNormalizationKernel 
available_kernels[] = -{ +static const BatchNormalizationKernel available_kernels[] = { #if defined(ARM_COMPUTE_ENABLE_SVE) - { - "sve_fp16_batch_normalization", - [](const BatchNormalizationSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_sve(); }, - REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_batch_normalization) - }, - { - "sve_fp32_batch_normalization", - [](const BatchNormalizationSelectorData & data) { return data.dt == DataType::F32 && data.ci.has_sve(); }, - REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_batch_normalization) - }, + {"sve_fp16_batch_normalization", + [](const BatchNormalizationSelectorData &data) { return data.dt == DataType::F16 && data.ci.has_sve(); }, + REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_batch_normalization)}, + {"sve_fp32_batch_normalization", + [](const BatchNormalizationSelectorData &data) { return data.dt == DataType::F32 && data.ci.has_sve(); }, + REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_batch_normalization)}, #endif /* !defined(ARM_COMPUTE_ENABLE_SVE) */ #if defined(ARM_COMPUTE_ENABLE_NEON) #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - { - "neon_fp16_batch_normalization", - [](const BatchNormalizationSelectorData & data) { return data.dt == DataType::F16; }, - REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_batch_normalization) - }, + {"neon_fp16_batch_normalization", + [](const BatchNormalizationSelectorData &data) { return data.dt == DataType::F16; }, + REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_batch_normalization)}, #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - { - "neon_fp32_batch_normalization", - [](const BatchNormalizationSelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_batch_normalization) - }, + {"neon_fp32_batch_normalization", + [](const BatchNormalizationSelectorData &data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_batch_normalization)}, #endif /* !defined(ARM_COMPUTE_ENABLE_NEON) */ }; const BatchNormalizationKernel *get_implementation(const BatchNormalizationSelectorData &data) { - for(const auto &uk : available_kernels) + for (const auto &uk : available_kernels) { - if(uk.is_selected(data)) + if (uk.is_selected(data)) { return &uk; } @@ -104,25 +101,31 @@ const BatchNormalizationKernel *get_implementation(const BatchNormalizationSelec return nullptr; } -Status -validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *var, - const ITensorInfo *beta, const ITensorInfo *gamma, float epsilon, ActivationLayerInfo act_info) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *mean, + const ITensorInfo *var, + const ITensorInfo *beta, + const ITensorInfo *gamma, + float epsilon, + ActivationLayerInfo act_info) { ARM_COMPUTE_UNUSED(epsilon); - const auto *uk = get_implementation(BatchNormalizationSelectorData{ input->data_type(), CPUInfo::get() }); + const auto *uk = get_implementation(BatchNormalizationSelectorData{input->data_type(), CPUInfo::get()}); ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); - if(act_info.enabled()) + if (act_info.enabled()) { ActivationLayerInfo::ActivationFunction act = act_info.activation(); - ARM_COMPUTE_RETURN_ERROR_ON(act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::RELU - && act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU - && act != 
ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU); + ARM_COMPUTE_RETURN_ERROR_ON(act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::RELU && + act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && + act != + ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU); ARM_COMPUTE_RETURN_ERROR_ON(act_info.b() > act_info.a()); } - if(nullptr != output) + if (nullptr != output) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); @@ -131,17 +134,18 @@ validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const IT ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, var); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, var); - if(beta != nullptr) + if (beta != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, beta); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, beta); } - if(gamma != nullptr) + if (gamma != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, gamma); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, gamma); } - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)) != mean->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(get_data_layout_dimension_index( + input->data_layout(), DataLayoutDimension::CHANNEL)) != mean->dimension(0)); return Status{}; } @@ -169,10 +173,12 @@ void NEBatchNormalizationLayerKernel::batch_normalization_nchw(const Window &win // Only compute denominator and constants once per feature map. int slice = -1; - const auto input_mean = reinterpret_cast(_mean->ptr_to_element(Coordinates(0, 0))); - const auto input_var = reinterpret_cast(_var->ptr_to_element(Coordinates(0, 0))); - const auto input_gamma = (_gamma != nullptr) ? reinterpret_cast(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; - const auto input_beta = (_beta != nullptr) ? reinterpret_cast(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr; + const auto input_mean = reinterpret_cast(_mean->ptr_to_element(Coordinates(0, 0))); + const auto input_var = reinterpret_cast(_var->ptr_to_element(Coordinates(0, 0))); + const auto input_gamma = + (_gamma != nullptr) ? reinterpret_cast(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; + const auto input_beta = + (_beta != nullptr) ? 
reinterpret_cast(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr; T mean = static_cast(0); T var = static_cast(0); @@ -186,80 +192,83 @@ void NEBatchNormalizationLayerKernel::batch_normalization_nchw(const Window &win auto beta_vec = wrapper::vdup_n(beta, ExactTagType{}); auto denominator_vec = wrapper::vdup_n(denominator, ExactTagType{}); const auto epsilon_vec = wrapper::vdup_n(static_cast(_epsilon), ExactTagType{}); - execute_window_loop(win_to_use, [&](const Coordinates & id) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - if(slice != id.z()) + execute_window_loop( + win_to_use, + [&](const Coordinates &id) { - mean = input_mean[id.z()]; - var = input_var[id.z()]; - mean_vec = wrapper::vdup_n(mean, ExactTagType{}); - var_vec = wrapper::vdup_n(var, ExactTagType{}); - if(input_gamma != nullptr) - { - gamma = input_gamma[id.z()]; - gamma_vec = wrapper::vdup_n(gamma, ExactTagType{}); - } - if(input_beta != nullptr) + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + if (slice != id.z()) { - beta = input_beta[id.z()]; - beta_vec = wrapper::vdup_n(beta, ExactTagType{}); + mean = input_mean[id.z()]; + var = input_var[id.z()]; + mean_vec = wrapper::vdup_n(mean, ExactTagType{}); + var_vec = wrapper::vdup_n(var, ExactTagType{}); + if (input_gamma != nullptr) + { + gamma = input_gamma[id.z()]; + gamma_vec = wrapper::vdup_n(gamma, ExactTagType{}); + } + if (input_beta != nullptr) + { + beta = input_beta[id.z()]; + beta_vec = wrapper::vdup_n(beta, ExactTagType{}); + } + + // Calculate denominator + denominator_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)); + denominator = wrapper::vgetlane(denominator_vec, 0); + slice = id.z(); } - // Calculate denominator - denominator_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)); - denominator = wrapper::vgetlane(denominator_vec, 0); - slice = id.z(); - } - - // Perform core calculations using vector operations - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - // Calculate x bar - const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec); - const auto x_bar = wrapper::vmul(numerator, denominator_vec); - auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec); - - // Perform fused activation - if(fused_activation) + // Perform core calculations using vector operations + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - activation_functor(res); + // Calculate x bar + const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec); + const auto x_bar = wrapper::vmul(numerator, denominator_vec); + auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec); + + // Perform fused activation + if (fused_activation) + { + activation_functor(res); + } + + // Store results + wrapper::vstore(output_ptr + x, res); } - // Store results - wrapper::vstore(output_ptr + x, res); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const T numerator = input_ptr[x] - mean; - const T x_bar = numerator * denominator; - T res = beta + x_bar * gamma; - - // Perform fused activation - if(fused_activation) + // Compute left-over elements + for (; x < window_end_x; ++x) { - activation_functor(res); + const T numerator = input_ptr[x] - mean; + const T x_bar = numerator * denominator; + T res = beta + x_bar * gamma; + + // Perform fused activation + if (fused_activation) + { + 
activation_functor(res); + } + + // Store results + *(output_ptr + x) = res; } - - // Store results - *(output_ptr + x) = res; - } - }, - input, output); + }, + input, output); } void NEBatchNormalizationLayerKernel::configure_non_fused() { - switch(_input->info()->data_type()) + switch (_input->info()->data_type()) { #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: - _func = &NEBatchNormalizationLayerKernel::batch_normalization_nchw>; + _func = &NEBatchNormalizationLayerKernel::batch_normalization_nchw>; break; #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F32: @@ -274,23 +283,25 @@ void NEBatchNormalizationLayerKernel::configure_non_fused() void NEBatchNormalizationLayerKernel::configure_fused() { // NCHW Fused Batched Normalization with activation functions : FP32 - static std::map bn_fused_map_f32_nchw = - { - { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw> }, - { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw> }, - { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw> } - }; + static std::map bn_fused_map_f32_nchw = { + {ActivationLayerInfo::ActivationFunction::RELU, + &NEBatchNormalizationLayerKernel::batch_normalization_nchw>}, + {ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, + &NEBatchNormalizationLayerKernel::batch_normalization_nchw>}, + {ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + &NEBatchNormalizationLayerKernel::batch_normalization_nchw>}}; #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC // NCHW Fused Batched Normalization with activation functions : FP16 - static std::map bn_fused_map_f16_nchw = - { - { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw> }, - { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw> }, - { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw> } - }; + static std::map bn_fused_map_f16_nchw = { + {ActivationLayerInfo::ActivationFunction::RELU, + &NEBatchNormalizationLayerKernel::batch_normalization_nchw>}, + {ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, + &NEBatchNormalizationLayerKernel::batch_normalization_nchw>}, + {ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + &NEBatchNormalizationLayerKernel::batch_normalization_nchw>}}; #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - switch(_input->info()->data_type()) + switch (_input->info()->data_type()) { #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: @@ -307,22 +318,32 @@ void NEBatchNormalizationLayerKernel::configure_fused() } NEBatchNormalizationLayerKernel::NEBatchNormalizationLayerKernel() - : _func(nullptr), _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _gamma(nullptr), _beta(nullptr), _epsilon(), _act_info() + : _func(nullptr), + _input(nullptr), + _output(nullptr), + _mean(nullptr), + _var(nullptr), + _gamma(nullptr), + _beta(nullptr), + _epsilon(), + _act_info() { } -void NEBatchNormalizationLayerKernel::configure(ITensor *input, ITensor *output, - const ITensor *mean, const ITensor *var, - const ITensor *beta, const ITensor *gamma, - float epsilon, ActivationLayerInfo act_info) +void NEBatchNormalizationLayerKernel::configure(ITensor *input, + ITensor *output, + const ITensor *mean, + const ITensor *var, + const 
ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, mean, var); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, - mean->info(), var->info(), - (beta != nullptr) ? beta->info() : nullptr, - (gamma != nullptr) ? gamma->info() : nullptr, - epsilon, act_info)); + mean->info(), var->info(), (beta != nullptr) ? beta->info() : nullptr, + (gamma != nullptr) ? gamma->info() : nullptr, epsilon, act_info)); _input = input; _output = input; @@ -334,16 +355,16 @@ void NEBatchNormalizationLayerKernel::configure(ITensor *input, ITensor *output, _act_info = act_info; const bool run_in_place = (output == nullptr) || (output == input); - if(!run_in_place) + if (!run_in_place) { _output = output; } // Configure activation function to run const bool is_nchw = _input->info()->data_layout() == DataLayout::NCHW; - if(is_nchw) + if (is_nchw) { - if(_act_info.enabled()) + if (_act_info.enabled()) { configure_fused(); } @@ -357,17 +378,21 @@ void NEBatchNormalizationLayerKernel::configure(ITensor *input, ITensor *output, Window win = calculate_max_window(*input->info(), Steps()); INEKernel::configure(win); - if(output != nullptr) + if (output != nullptr) { // Output auto initialization if not yet initialized auto_init_if_empty(*output->info(), *input->info()->clone()); } } -Status NEBatchNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, - const ITensorInfo *mean, const ITensorInfo *var, - const ITensorInfo *beta, const ITensorInfo *gamma, - float epsilon, ActivationLayerInfo act_info) +Status NEBatchNormalizationLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *mean, + const ITensorInfo *var, + const ITensorInfo *beta, + const ITensorInfo *gamma, + float epsilon, + ActivationLayerInfo act_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, var, beta, gamma, epsilon, act_info)); @@ -382,13 +407,14 @@ void NEBatchNormalizationLayerKernel::run(const Window &window, const ThreadInfo ARM_COMPUTE_ERROR_ON(_func == nullptr && _input->info()->data_layout() == DataLayout::NCHW); const bool is_nchw = _input->info()->data_layout() == DataLayout::NCHW; - if(is_nchw) + if (is_nchw) { (this->*_func)(window); } else { - const auto *uk = get_implementation(BatchNormalizationSelectorData{ _input->info()->data_type(), CPUInfo::get() }); + const auto *uk = + get_implementation(BatchNormalizationSelectorData{_input->info()->data_type(), CPUInfo::get()}); uk->ukernel(_input, _output, _mean, _var, _beta, _gamma, _epsilon, _act_info, window); } } diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h index 0551ace30c..2e8ff0dc9a 100644 --- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h +++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_NEBATCHNORMALIZATIONLAYERKERNEL_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/NEON/INEKernel.h" namespace arm_compute @@ -68,7 +69,13 @@ public: * @param[in] epsilon (Optional) Small value to avoid division with zero. Default value is 0.001f. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. 
*/ - void configure(ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta = nullptr, const ITensor *gamma = nullptr, float epsilon = 0.001f, + void configure(ITensor *input, + ITensor *output, + const ITensor *mean, + const ITensor *var, + const ITensor *beta = nullptr, + const ITensor *gamma = nullptr, + float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEBatchNormalizationLayerKernel * @@ -85,10 +92,14 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - const ITensorInfo *mean, const ITensorInfo *var, - const ITensorInfo *beta = nullptr, const ITensorInfo *gamma = nullptr, - float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *mean, + const ITensorInfo *var, + const ITensorInfo *beta = nullptr, + const ITensorInfo *gamma = nullptr, + float epsilon = 0.001f, + ActivationLayerInfo act_info = ActivationLayerInfo()); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp index 83fb5f6f51..f299bb94a4 100644 --- a/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp +++ b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp @@ -27,8 +27,9 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -46,7 +47,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -54,7 +55,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf return Status{}; } -Status validate_arguments_static(const ITensorInfo *input, int block_shape_x, int block_shape_y, const ITensorInfo *output, const CropInfo &crop_info) +Status validate_arguments_static(const ITensorInfo *input, + int block_shape_x, + int block_shape_y, + const ITensorInfo *output, + const CropInfo &crop_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); @@ -65,13 +70,14 @@ Status validate_arguments_static(const ITensorInfo *input, int block_shape_x, in const int idx_batch = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES); ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_batch] % (block_shape_x * block_shape_y) != 0); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - const TensorShape expected_output_shape = compute_batch_to_space_shape(input->data_layout(), input->tensor_shape(), block_shape_x, block_shape_y, crop_info); - const TensorInfo 
expected_output = output->clone()->set_tensor_shape(expected_output_shape); + const TensorShape expected_output_shape = compute_batch_to_space_shape( + input->data_layout(), input->tensor_shape(), block_shape_x, block_shape_y, crop_info); + const TensorInfo expected_output = output->clone()->set_tensor_shape(expected_output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &expected_output); } @@ -80,7 +86,13 @@ Status validate_arguments_static(const ITensorInfo *input, int block_shape_x, in } // namespace NEBatchToSpaceLayerKernel::NEBatchToSpaceLayerKernel() - : _input(nullptr), _block_shape(nullptr), _output(nullptr), _data_layout(DataLayout::UNKNOWN), _block_shape_x(), _block_shape_y(), _crop_info() + : _input(nullptr), + _block_shape(nullptr), + _output(nullptr), + _data_layout(DataLayout::UNKNOWN), + _block_shape_x(), + _block_shape_y(), + _crop_info() { } @@ -99,15 +111,18 @@ void NEBatchToSpaceLayerKernel::configure(const ITensor *input, const ITensor *b ICPPKernel::configure(win); } -void NEBatchToSpaceLayerKernel::configure(const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output, const CropInfo &crop_info) +void NEBatchToSpaceLayerKernel::configure( + const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output, const CropInfo &crop_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - const TensorShape output_shape = compute_batch_to_space_shape(input->info()->data_layout(), input->info()->tensor_shape(), block_shape_x, block_shape_y); + const TensorShape output_shape = compute_batch_to_space_shape( + input->info()->data_layout(), input->info()->tensor_shape(), block_shape_x, block_shape_y); // Output auto initialization if not yet initialized auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, output->info(), crop_info)); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments_static(input->info(), block_shape_x, block_shape_y, output->info(), crop_info)); _input = input; _output = output; @@ -121,14 +136,19 @@ void NEBatchToSpaceLayerKernel::configure(const ITensor *input, int32_t block_sh ICPPKernel::configure(win); } -Status NEBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output) +Status +NEBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_shape, output); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, block_shape, output)); return Status{}; } -Status NEBatchToSpaceLayerKernel::validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output, const CropInfo &crop_info) +Status NEBatchToSpaceLayerKernel::validate(const ITensorInfo *input, + int32_t block_shape_x, + int32_t block_shape_y, + const ITensorInfo *output, + const CropInfo &crop_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_static(input, block_shape_x, block_shape_y, output, crop_info)); @@ -141,7 +161,7 @@ void NEBatchToSpaceLayerKernel::run(const Window &window, const ThreadInfo &info ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window); - if(_block_shape != nullptr) + if (_block_shape != nullptr) { // Retrieve the block shapes dynamically 
_block_shape_x = *(reinterpret_cast(_block_shape->ptr_to_element(0))); @@ -155,31 +175,32 @@ void NEBatchToSpaceLayerKernel::run(const Window &window, const ThreadInfo &info int batch_id = 0; // Main loop for NCHW and NHWC - if(_data_layout == DataLayout::NCHW) + if (_data_layout == DataLayout::NCHW) { do { Iterator out(_output, slice_out); - execute_window_loop(slice_out, [&](const Coordinates & id) - { - - const int x = id.x(); - const int y = id.y(); - const int z = id.z(); - // Translate x, y to uncropped version - const int x_c = x + _crop_info.left; - const int y_c = y + _crop_info.top; - - const int in_batch = batch_id + ((x_c % _block_shape_x) + (y_c % _block_shape_y) * _block_shape_x) * batch_size; - const int in_x = x_c / _block_shape_x; - const int in_y = y_c / _block_shape_y; - Coordinates input_coords{ in_x, in_y, z, in_batch }; - memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size); - }, - out); + execute_window_loop( + slice_out, + [&](const Coordinates &id) + { + const int x = id.x(); + const int y = id.y(); + const int z = id.z(); + // Translate x, y to uncropped version + const int x_c = x + _crop_info.left; + const int y_c = y + _crop_info.top; + + const int in_batch = + batch_id + ((x_c % _block_shape_x) + (y_c % _block_shape_y) * _block_shape_x) * batch_size; + const int in_x = x_c / _block_shape_x; + const int in_y = y_c / _block_shape_y; + Coordinates input_coords{in_x, in_y, z, in_batch}; + memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size); + }, + out); ++batch_id; - } - while(window.slide_window_slice_3D(slice_out)); + } while (window.slide_window_slice_3D(slice_out)); } else { @@ -188,26 +209,28 @@ void NEBatchToSpaceLayerKernel::run(const Window &window, const ThreadInfo &info do { Iterator out(_output, slice_out); - execute_window_loop(slice_out, [&](const Coordinates & id) - { - - const int x = id.y(); - const int y = id.z(); - - // Translate x, y to uncropped version - const int x_c = x + _crop_info.left; - const int y_c = y + _crop_info.top; - - const int in_batch = batch_id + ((x_c % _block_shape_x) + (y_c % _block_shape_y) * _block_shape_x) * batch_size; - const int in_x = x_c / _block_shape_x; - const int in_y = y_c / _block_shape_y; - Coordinates input_coords{ 0, in_x, in_y, in_batch }; - memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size * _input->info()->dimension(0)); - }, - out); + execute_window_loop( + slice_out, + [&](const Coordinates &id) + { + const int x = id.y(); + const int y = id.z(); + + // Translate x, y to uncropped version + const int x_c = x + _crop_info.left; + const int y_c = y + _crop_info.top; + + const int in_batch = + batch_id + ((x_c % _block_shape_x) + (y_c % _block_shape_y) * _block_shape_x) * batch_size; + const int in_x = x_c / _block_shape_x; + const int in_y = y_c / _block_shape_y; + Coordinates input_coords{0, in_x, in_y, in_batch}; + memcpy(out.ptr(), _input->ptr_to_element(input_coords), + element_size * _input->info()->dimension(0)); + }, + out); ++batch_id; - } - while(window.slide_window_slice_3D(slice_out)); + } while (window.slide_window_slice_3D(slice_out)); } } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h index 5eceee0904..d98ac621b0 100644 --- a/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h +++ b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_NEBATCHTOSPACELAYERKERNEL_H #include "arm_compute/core/Types.h" + #include 
"src/core/NEON/INEKernel.h" namespace arm_compute @@ -68,7 +69,11 @@ public: * @param[out] output Tensor output. Data types supported: same as @p input * @param[in] crop_info Specifies how the output shape is cropped after batch to space is performed */ - void configure(const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output, const CropInfo &crop_info = CropInfo{}); + void configure(const ITensor *input, + int32_t block_shape_x, + int32_t block_shape_y, + ITensor *output, + const CropInfo &crop_info = CropInfo{}); /** Static function to check if given info will lead to a valid configuration of @ref NEBatchToSpaceLayerKernel * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. @@ -90,7 +95,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output, const CropInfo &crop_info = CropInfo{}); + static Status validate(const ITensorInfo *input, + int32_t block_shape_x, + int32_t block_shape_y, + const ITensorInfo *output, + const CropInfo &crop_info = CropInfo{}); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/src/core/NEON/kernels/NEBitwiseAndKernel.cpp b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp index 677c5cddcc..a59bbd233b 100644 --- a/src/core/NEON/kernels/NEBitwiseAndKernel.cpp +++ b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp @@ -27,9 +27,10 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" #include #include @@ -55,8 +56,7 @@ inline void bitwise_and(const T *__restrict input1, const T *__restrict input2, } } // namespace -NEBitwiseAndKernel::NEBitwiseAndKernel() - : _input1(nullptr), _input2(nullptr), _output(nullptr) +NEBitwiseAndKernel::NEBitwiseAndKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr) { } @@ -86,8 +86,7 @@ void NEBitwiseAndKernel::configure(const ITensor *input1, const ITensor *input2, Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration)); AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - update_window_and_padding(win, - AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration), + update_window_and_padding(win, AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration), AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration), output_access); @@ -103,9 +102,7 @@ void NEBitwiseAndKernel::run(const Window &window, const ThreadInfo &info) Iterator input2(_input2, window); Iterator output(_output, window); - execute_window_loop(window, [&](const Coordinates &) - { - bitwise_and(input1.ptr(), input2.ptr(), output.ptr()); - }, - input1, input2, output); + execute_window_loop( + window, [&](const Coordinates &) { bitwise_and(input1.ptr(), input2.ptr(), output.ptr()); }, input1, + input2, output); } diff --git a/src/core/NEON/kernels/NEBitwiseNotKernel.cpp b/src/core/NEON/kernels/NEBitwiseNotKernel.cpp index 19b1af690a..ecd181a7af 100644 --- a/src/core/NEON/kernels/NEBitwiseNotKernel.cpp +++ b/src/core/NEON/kernels/NEBitwiseNotKernel.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include 
"arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -50,8 +51,7 @@ inline void bitwise_not_U8_U8(const uint8_t *__restrict input, uint8_t *__restri } } // namespace -NEBitwiseNotKernel::NEBitwiseNotKernel() - : _input(nullptr), _output(nullptr) +NEBitwiseNotKernel::NEBitwiseNotKernel() : _input(nullptr), _output(nullptr) { } @@ -77,7 +77,8 @@ void NEBitwiseNotKernel::configure(const ITensor *input, ITensor *output) // Configure kernel window Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration), output_access); + update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration), + output_access); INEKernel::configure(win); } @@ -90,9 +91,6 @@ void NEBitwiseNotKernel::run(const Window &window, const ThreadInfo &info) Iterator input(_input, window); Iterator output(_output, window); - execute_window_loop(window, [&](const Coordinates &) - { - bitwise_not_U8_U8(input.ptr(), output.ptr()); - }, - input, output); + execute_window_loop( + window, [&](const Coordinates &) { bitwise_not_U8_U8(input.ptr(), output.ptr()); }, input, output); } diff --git a/src/core/NEON/kernels/NEBitwiseOrKernel.cpp b/src/core/NEON/kernels/NEBitwiseOrKernel.cpp index 08094fbfcf..4c906134aa 100644 --- a/src/core/NEON/kernels/NEBitwiseOrKernel.cpp +++ b/src/core/NEON/kernels/NEBitwiseOrKernel.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -42,7 +43,8 @@ class Coordinates; namespace { -inline void bitwise_or_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t *__restrict input2, uint8_t *__restrict output) +inline void +bitwise_or_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t *__restrict input2, uint8_t *__restrict output) { const uint8x16_t val1 = vld1q_u8(input1); const uint8x16_t val2 = vld1q_u8(input2); @@ -51,8 +53,7 @@ inline void bitwise_or_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t } } // namespace -NEBitwiseOrKernel::NEBitwiseOrKernel() - : _input1(nullptr), _input2(nullptr), _output(nullptr) +NEBitwiseOrKernel::NEBitwiseOrKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr) { } @@ -82,8 +83,7 @@ void NEBitwiseOrKernel::configure(const ITensor *input1, const ITensor *input2, Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration)); AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - update_window_and_padding(win, - AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration), + update_window_and_padding(win, AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration), AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration), output_access); @@ -99,9 +99,7 @@ void NEBitwiseOrKernel::run(const Window &window, const ThreadInfo &info) Iterator input2(_input2, window); Iterator output(_output, window); - execute_window_loop(window, [&](const Coordinates &) - { - bitwise_or_U8_U8_U8(input1.ptr(), input2.ptr(), output.ptr()); - }, - input1, input2, output); + execute_window_loop( + window, [&](const Coordinates &) { 
bitwise_or_U8_U8_U8(input1.ptr(), input2.ptr(), output.ptr()); }, input1, + input2, output); } diff --git a/src/core/NEON/kernels/NEBitwiseXorKernel.cpp b/src/core/NEON/kernels/NEBitwiseXorKernel.cpp index fc5b38b64f..dbbed2483c 100644 --- a/src/core/NEON/kernels/NEBitwiseXorKernel.cpp +++ b/src/core/NEON/kernels/NEBitwiseXorKernel.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -42,7 +43,8 @@ class Coordinates; namespace { -inline void bitwise_xor_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t *__restrict input2, uint8_t *__restrict output) +inline void +bitwise_xor_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t *__restrict input2, uint8_t *__restrict output) { const uint8x16_t val1 = vld1q_u8(input1); const uint8x16_t val2 = vld1q_u8(input2); @@ -51,8 +53,7 @@ inline void bitwise_xor_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t } } // namespace -NEBitwiseXorKernel::NEBitwiseXorKernel() - : _input1(nullptr), _input2(nullptr), _output(nullptr) +NEBitwiseXorKernel::NEBitwiseXorKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr) { } @@ -82,7 +83,8 @@ void NEBitwiseXorKernel::configure(const ITensor *input1, const ITensor *input2, AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); update_window_and_padding(win, AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration), - AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration), output_access); + AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration), + output_access); INEKernel::configure(win); } @@ -96,9 +98,7 @@ void NEBitwiseXorKernel::run(const Window &window, const ThreadInfo &info) Iterator input2(_input2, window); Iterator output(_output, window); - execute_window_loop(window, [&](const Coordinates &) - { - bitwise_xor_U8_U8_U8(input1.ptr(), input2.ptr(), output.ptr()); - }, - input1, input2, output); + execute_window_loop( + window, [&](const Coordinates &) { bitwise_xor_U8_U8_U8(input1.ptr(), input2.ptr(), output.ptr()); }, input1, + input2, output); } diff --git a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp index 69bfd56ce0..cb869838e2 100644 --- a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp +++ b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp @@ -27,8 +27,9 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Window.h" -#include "src/core/CPP/Validate.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/cpu/kernels/boundingboxtransform/list.h" @@ -45,7 +46,11 @@ struct BoundingBoxTransformSelectorData }; using BoundingBoxTransformSelctorPtr = std::add_pointer::type; -using BoundingBoxTransformUKernelPtr = std::add_pointer::type; +using BoundingBoxTransformUKernelPtr = std::add_pointer::type; struct BoundingBoxTransformKernel { @@ -54,26 +59,19 @@ struct BoundingBoxTransformKernel BoundingBoxTransformUKernelPtr ukernel; }; -static const BoundingBoxTransformKernel available_kernels[] = -{ - { - "fp32_neon_boundingboxtransform", - [](const BoundingBoxTransformSelectorData & data) { return data.dt == DataType::F32; }, - 
REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_boundingboxtransform) - }, +static const BoundingBoxTransformKernel available_kernels[] = { + {"fp32_neon_boundingboxtransform", + [](const BoundingBoxTransformSelectorData &data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_boundingboxtransform)}, #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - { - "fp16_neon_boundingboxtransform", - [](const BoundingBoxTransformSelectorData & data) { return data.dt == DataType::F16; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_boundingboxtransform) - }, + {"fp16_neon_boundingboxtransform", + [](const BoundingBoxTransformSelectorData &data) { return data.dt == DataType::F16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_boundingboxtransform)}, #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC #if defined(ARM_COMPUTE_ENABLE_NEON) - { - "qu16_neon_boundingboxtransform", - [](const BoundingBoxTransformSelectorData & data) { return data.dt == DataType::QASYMM16; }, - REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qu16_boundingboxtransform) - }, + {"qu16_neon_boundingboxtransform", + [](const BoundingBoxTransformSelectorData &data) { return data.dt == DataType::QASYMM16; }, + REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qu16_boundingboxtransform)}, #endif //defined(ARM_COMPUTE_ENABLE_NEON) }; @@ -85,9 +83,9 @@ static const BoundingBoxTransformKernel available_kernels[] = */ const BoundingBoxTransformKernel *get_implementation(const BoundingBoxTransformSelectorData &data) { - for(const auto &uk : available_kernels) + for (const auto &uk : available_kernels) { - if(uk.is_selected(data)) + if (uk.is_selected(data)) { return &uk; } @@ -95,7 +93,10 @@ const BoundingBoxTransformKernel *get_implementation(const BoundingBoxTransformS return nullptr; } -Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info) +Status validate_arguments(const ITensorInfo *boxes, + const ITensorInfo *pred_boxes, + const ITensorInfo *deltas, + const BoundingBoxTransformInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(boxes, pred_boxes, deltas); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(boxes); @@ -108,7 +109,7 @@ Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxe ARM_COMPUTE_RETURN_ERROR_ON(boxes->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(info.scale() <= 0); - if(boxes->data_type() == DataType::QASYMM16) + if (boxes->data_type() == DataType::QASYMM16) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(deltas, 1, DataType::QASYMM8); const UniformQuantizationInfo deltas_qinfo = deltas->quantization_info().uniform(); @@ -120,12 +121,12 @@ Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxe ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(boxes, deltas); } - if(pred_boxes->total_size() > 0) + if (pred_boxes->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(pred_boxes->tensor_shape(), deltas->tensor_shape()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(pred_boxes, deltas); ARM_COMPUTE_RETURN_ERROR_ON(pred_boxes->num_dimensions() > 2); - if(pred_boxes->data_type() == DataType::QASYMM16) + if (pred_boxes->data_type() == DataType::QASYMM16) { const UniformQuantizationInfo pred_qinfo = pred_boxes->quantization_info().uniform(); ARM_COMPUTE_RETURN_ERROR_ON(pred_qinfo.scale != 0.125f); @@ -142,13 +143,19 @@ NEBoundingBoxTransformKernel::NEBoundingBoxTransformKernel() { } -void 
NEBoundingBoxTransformKernel::configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info) +void NEBoundingBoxTransformKernel::configure(const ITensor *boxes, + ITensor *pred_boxes, + const ITensor *deltas, + const BoundingBoxTransformInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(boxes, pred_boxes, deltas); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(boxes->info(), pred_boxes->info(), deltas->info(), info)); // Configure kernel window - auto_init_if_empty(*pred_boxes->info(), deltas->info()->clone()->set_data_type(boxes->info()->data_type()).set_quantization_info(boxes->info()->quantization_info())); + auto_init_if_empty(*pred_boxes->info(), deltas->info() + ->clone() + ->set_data_type(boxes->info()->data_type()) + .set_quantization_info(boxes->info()->quantization_info())); // Set instance variables _boxes = boxes; @@ -164,7 +171,10 @@ void NEBoundingBoxTransformKernel::configure(const ITensor *boxes, ITensor *pred INEKernel::configure(win); } -Status NEBoundingBoxTransformKernel::validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info) +Status NEBoundingBoxTransformKernel::validate(const ITensorInfo *boxes, + const ITensorInfo *pred_boxes, + const ITensorInfo *deltas, + const BoundingBoxTransformInfo &info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(boxes, pred_boxes, deltas, info)); return Status{}; @@ -176,7 +186,7 @@ void NEBoundingBoxTransformKernel::run(const Window &window, const ThreadInfo &i ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - const auto *uk = get_implementation(BoundingBoxTransformSelectorData{ _boxes->info()->data_type() }); + const auto *uk = get_implementation(BoundingBoxTransformSelectorData{_boxes->info()->data_type()}); ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); uk->ukernel(_boxes, _pred_boxes, _deltas, _bbinfo, window); diff --git a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h index def827836c..3915994feb 100644 --- a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h +++ b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h @@ -63,7 +63,8 @@ public: * @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct. 
* */ - void configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info); + void + configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info); /** Static function to check if given info will lead to a valid configuration of @ref CLBoundingBoxTransform * @@ -77,7 +78,10 @@ public: * * @return a Status */ - static Status validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info); + static Status validate(const ITensorInfo *boxes, + const ITensorInfo *pred_boxes, + const ITensorInfo *deltas, + const BoundingBoxTransformInfo &info); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp b/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp index 64da1f2262..3b53b7055f 100644 --- a/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp +++ b/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp @@ -30,6 +30,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -44,15 +45,19 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NCHW, DataLayout::NHWC); - const unsigned int channels = input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)); + const unsigned int channels = + input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)); ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups < 2, "Channel shuffling with less than 2 groups would be inefficient"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups == channels, "Channel shuffling with same number of groups as number of channels would be inefficient"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + num_groups == channels, + "Channel shuffling with same number of groups as number of channels would be inefficient"); ARM_COMPUTE_RETURN_ERROR_ON(num_groups > channels); // There cannot be more groups than channels - ARM_COMPUTE_RETURN_ERROR_ON_MSG((channels % num_groups) != 0, "The number of channels must be a multiple of the number of groups"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((channels % num_groups) != 0, + "The number of channels must be a multiple of the number of groups"); // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -72,20 +77,22 @@ void channel_shuffle_nhwc(const ITensor *input, ITensor *output, unsigned int nu Iterator in(input, window); - execute_window_loop(window, [&](const Coordinates & id) - { - // Shuffle channel - const unsigned int curr_channel = id.x(); - const unsigned int group_id = curr_channel * rK; - const unsigned int r = group_id * K; - const unsigned int channel_id = curr_channel - r; - - // Calculate output coordinates - Coordinates out_coords = id; - out_coords.set(Window::DimX, channel_id * num_groups + group_id); - std::copy_n(in.ptr(), element_size, output->ptr_to_element(out_coords)); - }, - in); + 
execute_window_loop( + window, + [&](const Coordinates &id) + { + // Shuffle channel + const unsigned int curr_channel = id.x(); + const unsigned int group_id = curr_channel * rK; + const unsigned int r = group_id * K; + const unsigned int channel_id = curr_channel - r; + + // Calculate output coordinates + Coordinates out_coords = id; + out_coords.set(Window::DimX, channel_id * num_groups + group_id); + std::copy_n(in.ptr(), element_size, output->ptr_to_element(out_coords)); + }, + in); } void channel_shuffle_nchw(const ITensor *input, ITensor *output, unsigned int num_groups, const Window &window) { @@ -107,34 +114,35 @@ void channel_shuffle_nchw(const ITensor *input, ITensor *output, unsigned int nu Iterator in(input, win); - execute_window_loop(win, [&](const Coordinates & id) - { - // Shuffle channel - const unsigned int curr_channel = id.z(); - const unsigned int group_id = curr_channel * rK; - const unsigned int r = group_id * K; - const unsigned int channel_id = curr_channel - r; - - // Calculate output coordinates - Coordinates out_coords = id; - out_coords.set(Window::DimZ, channel_id * num_groups + group_id); - const uint8_t *input_ptr = in.ptr(); - uint8_t *output_ptr = output->ptr_to_element(out_coords); - - // Copy plane - for(unsigned int y = 0; y < height; ++y) + execute_window_loop( + win, + [&](const Coordinates &id) { - std::copy_n(input_ptr, row_size, output_ptr); - input_ptr += input_stride_y; - output_ptr += output_stride_y; - } - }, - in); + // Shuffle channel + const unsigned int curr_channel = id.z(); + const unsigned int group_id = curr_channel * rK; + const unsigned int r = group_id * K; + const unsigned int channel_id = curr_channel - r; + + // Calculate output coordinates + Coordinates out_coords = id; + out_coords.set(Window::DimZ, channel_id * num_groups + group_id); + const uint8_t *input_ptr = in.ptr(); + uint8_t *output_ptr = output->ptr_to_element(out_coords); + + // Copy plane + for (unsigned int y = 0; y < height; ++y) + { + std::copy_n(input_ptr, row_size, output_ptr); + input_ptr += input_stride_y; + output_ptr += output_stride_y; + } + }, + in); } } // namespace -NEChannelShuffleLayerKernel::NEChannelShuffleLayerKernel() - : _input(nullptr), _output(nullptr), _num_groups() +NEChannelShuffleLayerKernel::NEChannelShuffleLayerKernel() : _input(nullptr), _output(nullptr), _num_groups() { } @@ -158,7 +166,8 @@ void NEChannelShuffleLayerKernel::configure(const ITensor *input, ITensor *outpu INEKernel::configure(win); } -Status NEChannelShuffleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups) +Status +NEChannelShuffleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, num_groups)); return Status{}; @@ -170,7 +179,7 @@ void NEChannelShuffleLayerKernel::run(const Window &window, const ThreadInfo &in ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - switch(_input->info()->data_layout()) + switch (_input->info()->data_layout()) { case DataLayout::NHWC: channel_shuffle_nhwc(_input, _output, _num_groups, window); diff --git a/src/core/NEON/kernels/NECol2ImKernel.h b/src/core/NEON/kernels/NECol2ImKernel.h index 1976302036..bc6652fd30 100644 --- a/src/core/NEON/kernels/NECol2ImKernel.h +++ b/src/core/NEON/kernels/NECol2ImKernel.h @@ -24,10 +24,10 @@ #ifndef ARM_COMPUTE_NECOL2IMKERNEL_H #define ARM_COMPUTE_NECOL2IMKERNEL_H -#include 
"src/core/NEON/INEKernel.h" - #include "arm_compute/core/Size2D.h" +#include "src/core/NEON/INEKernel.h" + namespace arm_compute { class ITensor; diff --git a/src/core/NEON/kernels/NECropKernel.cpp b/src/core/NEON/kernels/NECropKernel.cpp index 94c455305c..60271fbc74 100644 --- a/src/core/NEON/kernels/NECropKernel.cpp +++ b/src/core/NEON/kernels/NECropKernel.cpp @@ -26,14 +26,15 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Window.h" #include "arm_compute/core/utils/helpers/tensor_transform.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/Window.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" #include "src/core/utils/helpers/bit_ops.h" #include "src/cpu/kernels/crop/list.h" @@ -47,7 +48,8 @@ struct CropSelectorData }; using CropSelectorPtr = std::add_pointer::type; -using CropUKernelPtr = std::add_pointer::type; +using CropUKernelPtr = std::add_pointer::type; struct CropUKernel { @@ -56,48 +58,23 @@ struct CropUKernel CropUKernelPtr ukernel; }; -static const CropUKernel available_kernels[] = -{ - { - "fp16_neon_crop", - [](const CropSelectorData & data) { return data.dt == DataType::F16; }, - REGISTER_FP16_NEON(arm_compute::cpu::fp16_in_bounds_crop_window) - }, - { - "f32_neon_crop", - [](const CropSelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::fp32_in_bounds_crop_window) - }, - { - "u8_neon_crop", - [](const CropSelectorData & data) { return data.dt == DataType::U8; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::u8_in_bounds_crop_window) - }, - { - "u16_neon_crop", - [](const CropSelectorData & data) { return data.dt == DataType::U16; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::u16_in_bounds_crop_window) - }, - { - "u32_neon_crop", - [](const CropSelectorData & data) { return data.dt == DataType::U32; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::u32_in_bounds_crop_window) - }, - { - "s8_neon_crop", - [](const CropSelectorData & data) { return data.dt == DataType::S8; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::s8_in_bounds_crop_window) - }, - { - "s16_neon_crop", - [](const CropSelectorData & data) { return data.dt == DataType::S16; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::s16_in_bounds_crop_window) - }, - { - "s32_neon_crop", - [](const CropSelectorData & data) { return data.dt == DataType::S32; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::s32_in_bounds_crop_window) - }, +static const CropUKernel available_kernels[] = { + {"fp16_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::F16; }, + REGISTER_FP16_NEON(arm_compute::cpu::fp16_in_bounds_crop_window)}, + {"f32_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::fp32_in_bounds_crop_window)}, + {"u8_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::U8; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::u8_in_bounds_crop_window)}, + {"u16_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::U16; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::u16_in_bounds_crop_window)}, + {"u32_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::U32; }, + 
REGISTER_INTEGER_NEON(arm_compute::cpu::u32_in_bounds_crop_window)}, + {"s8_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::S8; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::s8_in_bounds_crop_window)}, + {"s16_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::S16; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::s16_in_bounds_crop_window)}, + {"s32_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::S32; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::s32_in_bounds_crop_window)}, }; /** Micro-kernel selector @@ -108,9 +85,9 @@ static const CropUKernel available_kernels[] = */ const CropUKernel *get_implementation(const CropSelectorData &data) { - for(const auto &uk : available_kernels) + for (const auto &uk : available_kernels) { - if(uk.is_selected(data)) + if (uk.is_selected(data)) { return &uk; } @@ -119,26 +96,40 @@ const CropUKernel *get_implementation(const CropSelectorData &data) return nullptr; } -inline void out_of_bounds_crop_window(const ITensor *output, float *output_ptr, float extrapolation_value, - int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit) +inline void out_of_bounds_crop_window(const ITensor *output, + float *output_ptr, + float extrapolation_value, + int32_t window_step_x, + int32_t output_width_start, + int32_t output_width_limit) { - auto in = wrapper::vdup_n(extrapolation_value, wrapper::traits::vector_128_tag()); - int32_t x = 0; - int32_t limit = (output_width_limit - output_width_start) * static_cast(output->info()->dimension(0)); - float *output_start_ptr = output_ptr + output_width_start * output->info()->dimension(0); - for(; x <= limit - window_step_x; x += window_step_x) + auto in = wrapper::vdup_n(extrapolation_value, wrapper::traits::vector_128_tag()); + int32_t x = 0; + int32_t limit = (output_width_limit - output_width_start) * static_cast(output->info()->dimension(0)); + float *output_start_ptr = output_ptr + output_width_start * output->info()->dimension(0); + for (; x <= limit - window_step_x; x += window_step_x) { wrapper::vstore(output_start_ptr + x, in); } - for(; x < limit; ++x) + for (; x < limit; ++x) { *(output_start_ptr + x) = extrapolation_value; } } -inline void execute_window(const ITensor *input, const ITensor *output, Coordinates input_offset, float extrapolation_value, - const std::array &rows_out_of_bounds, const std::array &cols_out_of_bounds, NECropKernel::InBoundsCropFunction *in_bounds_crop_function, - bool is_height_flipped, bool has_cols_in_bounds, bool has_cols_out_of_bounds_before, bool has_cols_out_of_bounds_after, bool input_has_single_channel, bool is_width_flipped) +inline void execute_window(const ITensor *input, + const ITensor *output, + Coordinates input_offset, + float extrapolation_value, + const std::array &rows_out_of_bounds, + const std::array &cols_out_of_bounds, + NECropKernel::InBoundsCropFunction *in_bounds_crop_function, + bool is_height_flipped, + bool has_cols_in_bounds, + bool has_cols_out_of_bounds_before, + bool has_cols_out_of_bounds_after, + bool input_has_single_channel, + bool is_width_flipped) { // Output is always float. const int window_step_x = 16 / sizeof(float); @@ -159,45 +150,66 @@ inline void execute_window(const ITensor *input, const ITensor *output, Coordina // |------------------------------| // Fill all output rows that have no elements that are within the input bounds with the extrapolation value. // First for the rows before the in bounds rows. 
- out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0, rows_out_of_bounds[0] * output->info()->dimension(1)); + out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0, + rows_out_of_bounds[0] * output->info()->dimension(1)); output_ptr += rows_out_of_bounds[0] * output->info()->dimension(1) * output->info()->dimension(0); // Iterate through each row that has any elements within the input bounds. - for(uint32_t row = rows_out_of_bounds[0]; static_cast(row) < static_cast(output->info()->dimension(2) - rows_out_of_bounds[1]); - ++row, is_height_flipped ? --input_offset[2] : ++input_offset[2]) + for (uint32_t row = rows_out_of_bounds[0]; + static_cast(row) < static_cast(output->info()->dimension(2) - rows_out_of_bounds[1]); + ++row, is_height_flipped ? --input_offset[2] : ++input_offset[2]) { // Fill all elements in the row that are out of bounds with the extrapolation value. // First for the elements before the in bounds elements. - if(has_cols_out_of_bounds_before) + if (has_cols_out_of_bounds_before) { out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0, cols_out_of_bounds[0]); } // Copy all elements within the input bounds from the input tensor. - if(has_cols_in_bounds) + if (has_cols_in_bounds) { (*in_bounds_crop_function)(input, output, output_ptr, input_offset, window_step_x, cols_out_of_bounds[0], - output->info()->dimension(1) - cols_out_of_bounds[1], input_has_single_channel, is_width_flipped); + output->info()->dimension(1) - cols_out_of_bounds[1], input_has_single_channel, + is_width_flipped); } // Fill all elements after the in bounds elements with the extrapolation value. - if(has_cols_out_of_bounds_after) + if (has_cols_out_of_bounds_after) { - out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, output->info()->dimension(1) - cols_out_of_bounds[1], output->info()->dimension(1)); + out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, + output->info()->dimension(1) - cols_out_of_bounds[1], + output->info()->dimension(1)); } output_ptr += output->info()->dimension(1) * output->info()->dimension(0); } // Fill all rows after the in bounds elements with the extrapolation value. 
- out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0, rows_out_of_bounds[1] * output->info()->dimension(1)); + out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0, + rows_out_of_bounds[1] * output->info()->dimension(1)); } } // namespace NECropKernel::NECropKernel() - : _input(nullptr), _crop_boxes(nullptr), _box_ind(nullptr), _output(nullptr), _start(), _end(), _crop_box_ind(0), _extrapolation_value(0), _rows_out_of_bounds(), _cols_out_of_bounds() + : _input(nullptr), + _crop_boxes(nullptr), + _box_ind(nullptr), + _output(nullptr), + _start(), + _end(), + _crop_box_ind(0), + _extrapolation_value(0), + _rows_out_of_bounds(), + _cols_out_of_bounds() { } -void NECropKernel::configure(const ITensor *input, const ITensor *crop_boxes, const ITensor *box_ind, ITensor *output, uint32_t crop_box_ind, float extrapolation_value) +void NECropKernel::configure(const ITensor *input, + const ITensor *crop_boxes, + const ITensor *box_ind, + ITensor *output, + uint32_t crop_box_ind, + float extrapolation_value) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), crop_boxes->info(), box_ind->info(), output->info(), crop_box_ind, extrapolation_value)); + ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), crop_boxes->info(), box_ind->info(), output->info(), + crop_box_ind, extrapolation_value)); _input = input; _crop_boxes = crop_boxes; @@ -207,21 +219,27 @@ void NECropKernel::configure(const ITensor *input, const ITensor *crop_boxes, co _extrapolation_value = extrapolation_value; } -Status NECropKernel::validate(const ITensorInfo *input, const ITensorInfo *crop_boxes, const ITensorInfo *box_ind, const ITensorInfo *output, uint32_t crop_box_ind, float extrapolation_value) +Status NECropKernel::validate(const ITensorInfo *input, + const ITensorInfo *crop_boxes, + const ITensorInfo *box_ind, + const ITensorInfo *output, + uint32_t crop_box_ind, + float extrapolation_value) { ARM_COMPUTE_UNUSED(extrapolation_value); - const auto *uk = get_implementation(CropSelectorData{ input->data_type() }); + const auto *uk = get_implementation(CropSelectorData{input->data_type()}); ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::U16, DataType::S16, DataType::F16, DataType::U32, DataType::S32, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::U16, DataType::S16, + DataType::F16, DataType::U32, DataType::S32, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC); ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().num_dimensions() > 4); ARM_COMPUTE_RETURN_ERROR_ON(crop_boxes->tensor_shape()[0] != 4); ARM_COMPUTE_RETURN_ERROR_ON(crop_boxes->tensor_shape()[1] != box_ind->tensor_shape()[0]); ARM_COMPUTE_RETURN_ERROR_ON(crop_boxes->tensor_shape()[1] <= crop_box_ind); ARM_COMPUTE_RETURN_ERROR_ON(box_ind->tensor_shape()[0] <= crop_box_ind); - if(output->total_size() > 0) + if (output->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); @@ -242,48 +260,53 @@ void NECropKernel::configure_output_shape() // The normalized coordiantes are scaled to retrieve the floating point image coordinates which are rounded to integers. 
_start = Coordinates(std::floor(x0 * (_input->info()->tensor_shape()[1] - 1) + 0.5f), std::floor(y0 * (_input->info()->tensor_shape()[2] - 1) + 0.5f)); - _end = Coordinates(std::floor(x1 * (_input->info()->tensor_shape()[1] - 1) + 0.5f), - std::floor(y1 * (_input->info()->tensor_shape()[2] - 1) + 0.5f)); - const TensorShape out_shape(_input->info()->tensor_shape()[0], abs(_end[0] - _start[0]) + 1, abs(_end[1] - _start[1]) + 1); + _end = Coordinates(std::floor(x1 * (_input->info()->tensor_shape()[1] - 1) + 0.5f), + std::floor(y1 * (_input->info()->tensor_shape()[2] - 1) + 0.5f)); + const TensorShape out_shape(_input->info()->tensor_shape()[0], abs(_end[0] - _start[0]) + 1, + abs(_end[1] - _start[1]) + 1); _output->info()->set_tensor_shape(out_shape); bool is_width_flipped = _end[0] < _start[0]; bool is_height_flipped = _end[1] < _start[1]; - if(is_height_flipped) + if (is_height_flipped) { - _rows_out_of_bounds[0] = _start[1] >= static_cast(_input->info()->dimension(2)) ? std::min(static_cast(_start[1] - _input->info()->dimension(2) + 1), - static_cast(_output->info()->dimension(2))) : - 0; + _rows_out_of_bounds[0] = _start[1] >= static_cast(_input->info()->dimension(2)) + ? std::min(static_cast(_start[1] - _input->info()->dimension(2) + 1), + static_cast(_output->info()->dimension(2))) + : 0; _rows_out_of_bounds[1] = _end[1] < 0 ? std::min(static_cast(-_end[1]), - static_cast(_output->info()->dimension(2))) : - 0; + static_cast(_output->info()->dimension(2))) + : 0; } else { _rows_out_of_bounds[0] = _start[1] < 0 ? std::min(static_cast(-_start[1]), - static_cast(_output->info()->dimension(2))) : - 0; - _rows_out_of_bounds[1] = _end[1] >= static_cast(_input->info()->dimension(2)) ? std::min(static_cast(_end[1] - _input->info()->dimension(2) + 1), - static_cast(_output->info()->dimension(2))) : - 0; + static_cast(_output->info()->dimension(2))) + : 0; + _rows_out_of_bounds[1] = _end[1] >= static_cast(_input->info()->dimension(2)) + ? std::min(static_cast(_end[1] - _input->info()->dimension(2) + 1), + static_cast(_output->info()->dimension(2))) + : 0; } - if(is_width_flipped) + if (is_width_flipped) { - _cols_out_of_bounds[0] = _start[0] >= static_cast(_input->info()->dimension(1)) ? std::min(static_cast(_start[0] - _input->info()->dimension(1) + 1), - static_cast(_output->info()->dimension(1))) : - 0; + _cols_out_of_bounds[0] = _start[0] >= static_cast(_input->info()->dimension(1)) + ? std::min(static_cast(_start[0] - _input->info()->dimension(1) + 1), + static_cast(_output->info()->dimension(1))) + : 0; _cols_out_of_bounds[1] = _end[0] < 0 ? std::min(static_cast(-_end[0]), - static_cast(_output->info()->dimension(1))) : - 0; + static_cast(_output->info()->dimension(1))) + : 0; } else { _cols_out_of_bounds[0] = _start[0] < 0 ? std::min(static_cast(-_start[0]), - static_cast(_output->info()->dimension(1))) : - 0; - _cols_out_of_bounds[1] = _end[0] >= static_cast(_input->info()->dimension(1)) ? std::min(static_cast(_end[0] - _input->info()->dimension(1) + 1), - static_cast(_output->info()->dimension(1))) : - 0; + static_cast(_output->info()->dimension(1))) + : 0; + _cols_out_of_bounds[1] = _end[0] >= static_cast(_input->info()->dimension(1)) + ? 
std::min(static_cast(_end[0] - _input->info()->dimension(1) + 1), + static_cast(_output->info()->dimension(1))) + : 0; } INEKernel::configure(calculate_max_window(*_output->info())); @@ -298,13 +321,18 @@ void NECropKernel::run(const Window &window, const ThreadInfo &info) ARM_COMPUTE_ERROR_ON(_input->info()->has_padding()); ARM_COMPUTE_ERROR_ON(_output->info()->has_padding()); - const auto *uk = get_implementation(CropSelectorData{ _input->info()->data_type() }); + const auto *uk = get_implementation(CropSelectorData{_input->info()->data_type()}); uint32_t batch_index = *(reinterpret_cast(_box_ind->ptr_to_element(Coordinates(_crop_box_ind)))); - Coordinates input_offset(0, _end[0] < _start[0] ? _start[0] - _cols_out_of_bounds[0] : _start[0] + _cols_out_of_bounds[0], - _end[1] < _start[1] ? _start[1] - _rows_out_of_bounds[0] : _start[1] + _rows_out_of_bounds[0], batch_index); - execute_window(_input, _output, input_offset, _extrapolation_value, _rows_out_of_bounds, _cols_out_of_bounds, uk->ukernel, _end[1] < _start[1], - _cols_out_of_bounds[0] + _cols_out_of_bounds[1] < _output->info()->dimension(1), _cols_out_of_bounds[0] > 0, _cols_out_of_bounds[1] > 0, + Coordinates input_offset( + 0, _end[0] < _start[0] ? _start[0] - _cols_out_of_bounds[0] : _start[0] + _cols_out_of_bounds[0], + _end[1] < _start[1] ? _start[1] - _rows_out_of_bounds[0] : _start[1] + _rows_out_of_bounds[0], batch_index); + execute_window(_input, _output, input_offset, _extrapolation_value, _rows_out_of_bounds, _cols_out_of_bounds, + uk->ukernel, + _end[1]<_start[1], + _cols_out_of_bounds[0] + + _cols_out_of_bounds[1]<_output->info()->dimension(1), _cols_out_of_bounds[0]> 0, + _cols_out_of_bounds[1]> 0, _start[0] <= _end[0], _end[0] < _start[0]); } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NECropKernel.h b/src/core/NEON/kernels/NECropKernel.h index 6c989c1d2c..da4a1b26e5 100644 --- a/src/core/NEON/kernels/NECropKernel.h +++ b/src/core/NEON/kernels/NECropKernel.h @@ -25,7 +25,7 @@ #define ARM_COMPUTE_NEON_CROP_KERNEL_H #include "arm_compute/core/Types.h" -#include "arm_compute/core/Types.h" + #include "src/core/NEON/INEKernel.h" namespace arm_compute @@ -67,7 +67,12 @@ public: * @param[in] crop_box_ind Index of the crop box to be used from @p crop_boxes. Default is 0. * @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0. */ - void configure(const ITensor *input, const ITensor *crop_boxes, const ITensor *box_ind, ITensor *output, uint32_t crop_box_ind = 0, float extrapolation_value = 0); + void configure(const ITensor *input, + const ITensor *crop_boxes, + const ITensor *box_ind, + ITensor *output, + uint32_t crop_box_ind = 0, + float extrapolation_value = 0); /** Static function to check if given info will lead to a valid configuration of @ref CLStridedSliceKernel * @@ -82,7 +87,12 @@ public: * @param[in] crop_box_ind Index of the crop box to be used from @p crop_boxes. Default is 0. * @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0. */ - static Status validate(const ITensorInfo *input, const ITensorInfo *crop_boxes, const ITensorInfo *box_ind, const ITensorInfo *output, uint32_t crop_box_ind = 0, float extrapolation_value = 0); + static Status validate(const ITensorInfo *input, + const ITensorInfo *crop_boxes, + const ITensorInfo *box_ind, + const ITensorInfo *output, + uint32_t crop_box_ind = 0, + float extrapolation_value = 0); /** Configure output tensor's shape as this can only be determined at runtime. 
*/ void configure_output_shape(); @@ -91,7 +101,8 @@ public: void run(const Window &window, const ThreadInfo &info) override; /** Function to use for in bounds crop for the particular tensor types passed to configure() */ - using InBoundsCropFunction = void(const ITensor *, const ITensor *, float *, Coordinates, int32_t, int32_t, int32_t, bool, bool); + using InBoundsCropFunction = + void(const ITensor *, const ITensor *, float *, Coordinates, int32_t, int32_t, int32_t, bool, bool); private: const ITensor *_input; diff --git a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp index 6dcc85ec2e..de0079ee60 100644 --- a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp +++ b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp @@ -26,11 +26,12 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" #include #include @@ -52,12 +53,14 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channel] % (block_shape * block_shape) != 0); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] != (block_shape * input->tensor_shape()[idx_width])); - ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] != (block_shape * input->tensor_shape()[idx_height])); + ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] != + (block_shape * input->tensor_shape()[idx_width])); + ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] != + (block_shape * input->tensor_shape()[idx_height])); ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); } @@ -74,7 +77,8 @@ NEDepthToSpaceLayerKernel::NEDepthToSpaceLayerKernel() void NEDepthToSpaceLayerKernel::configure(const ITensor *input, ITensor *output, int32_t block_shape) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - TensorShape output_shape = compute_depth_to_space_shape(input->info()->tensor_shape(), input->info()->data_layout(), block_shape); + TensorShape output_shape = + compute_depth_to_space_shape(input->info()->tensor_shape(), input->info()->data_layout(), block_shape); // Output auto inizialitation if not yet initialized auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); @@ -117,26 +121,27 @@ void NEDepthToSpaceLayerKernel::run(const Window &window, const ThreadInfo &info slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); // Main loop for NCHW and NHWC - if(_data_layout == DataLayout::NCHW) + if (_data_layout == DataLayout::NCHW) { Window slice_in = window.first_slice_window_2D(); do { Iterator in(_input, slice_in); - execute_window_loop(slice_in, [&](const Coordinates & id) - { - const int x = id.x(); - const 
int y = id.y(); - - const int z = id.z() % r; - const int out_x = x * _block_shape + (id.z() / r) % _block_shape; - const int out_y = y * _block_shape + (id.z() / r) / _block_shape; - Coordinates output_coords{ out_x, out_y, z, id[3] }; - memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size); - }, - in); - } - while(window.slide_window_slice_2D(slice_in)); + execute_window_loop( + slice_in, + [&](const Coordinates &id) + { + const int x = id.x(); + const int y = id.y(); + + const int z = id.z() % r; + const int out_x = x * _block_shape + (id.z() / r) % _block_shape; + const int out_y = y * _block_shape + (id.z() / r) / _block_shape; + Coordinates output_coords{out_x, out_y, z, id[3]}; + memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size); + }, + in); + } while (window.slide_window_slice_2D(slice_in)); } else { @@ -144,20 +149,21 @@ void NEDepthToSpaceLayerKernel::run(const Window &window, const ThreadInfo &info do { Iterator in(_input, slice_in); - execute_window_loop(slice_in, [&](const Coordinates & id) - { - const int x = id.y(); - const int y = id.z(); - - const int z = id.x() % r; - const int out_x = x * _block_shape + (id.x() / r) % _block_shape; - const int out_y = y * _block_shape + (id.x() / r) / _block_shape; - Coordinates output_coords{ z, out_x, out_y, id[3] }; - memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size); - }, - in); - } - while(window.slide_window_slice_3D(slice_in)); + execute_window_loop( + slice_in, + [&](const Coordinates &id) + { + const int x = id.y(); + const int y = id.z(); + + const int z = id.x() % r; + const int out_x = x * _block_shape + (id.x() / r) % _block_shape; + const int out_y = y * _block_shape + (id.x() / r) / _block_shape; + Coordinates output_coords{z, out_x, out_y, id[3]}; + memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size); + }, + in); + } while (window.slide_window_slice_3D(slice_in)); } } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp b/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp index 261437f07d..a5969cd497 100644 --- a/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp +++ b/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -37,16 +38,19 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *idx, + const FFTDigitReverseKernelInfo &config) { ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() > 2); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(idx, 1, DataType::U32); - ARM_COMPUTE_RETURN_ERROR_ON(std::set({ 0, 1 }).count(config.axis) == 0); + ARM_COMPUTE_RETURN_ERROR_ON(std::set({0, 1}).count(config.axis) == 0); ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[config.axis] != idx->tensor_shape().x()); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 2); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); @@ -56,7 +60,10 
@@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c return Status{}; } -std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *idx, const FFTDigitReverseKernelInfo &config) +std::pair validate_and_configure_window(ITensorInfo *input, + ITensorInfo *output, + ITensorInfo *idx, + const FFTDigitReverseKernelInfo &config) { ARM_COMPUTE_UNUSED(idx, config); @@ -68,12 +75,14 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen } } // namespace -NEFFTDigitReverseKernel::NEFFTDigitReverseKernel() - : _func(nullptr), _input(nullptr), _output(nullptr), _idx(nullptr) +NEFFTDigitReverseKernel::NEFFTDigitReverseKernel() : _func(nullptr), _input(nullptr), _output(nullptr), _idx(nullptr) { } -void NEFFTDigitReverseKernel::configure(const ITensor *input, ITensor *output, const ITensor *idx, const FFTDigitReverseKernelInfo &config) +void NEFFTDigitReverseKernel::configure(const ITensor *input, + ITensor *output, + const ITensor *idx, + const FFTDigitReverseKernelInfo &config) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, idx); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), idx->info(), config)); @@ -91,11 +100,11 @@ void NEFFTDigitReverseKernel::configure(const ITensor *input, ITensor *output, c ARM_COMPUTE_ERROR_THROW_ON(win_config.first); INEKernel::configure(win_config.second); - if(axis == 0) + if (axis == 0) { - if(is_input_complex) + if (is_input_complex) { - if(is_conj) + if (is_conj) { _func = &NEFFTDigitReverseKernel::digit_reverse_kernel_axis_0; } @@ -109,11 +118,11 @@ void NEFFTDigitReverseKernel::configure(const ITensor *input, ITensor *output, c _func = &NEFFTDigitReverseKernel::digit_reverse_kernel_axis_0; } } - else if(axis == 1) + else if (axis == 1) { - if(is_input_complex) + if (is_input_complex) { - if(is_conj) + if (is_conj) { _func = &NEFFTDigitReverseKernel::digit_reverse_kernel_axis_1; } @@ -133,10 +142,14 @@ void NEFFTDigitReverseKernel::configure(const ITensor *input, ITensor *output, c } } -Status NEFFTDigitReverseKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config) +Status NEFFTDigitReverseKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *idx, + const FFTDigitReverseKernelInfo &config) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, idx, config)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), idx->clone().get(), config).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(input->clone().get(), output->clone().get(), idx->clone().get(), config).first); return Status{}; } @@ -159,38 +172,40 @@ void NEFFTDigitReverseKernel::digit_reverse_kernel_axis_0(const Window &window) std::vector buffer_row_out(2 * N); std::vector buffer_row_in(2 * N); - execute_window_loop(slice, [&](const Coordinates &) - { - if(is_input_complex) + execute_window_loop( + slice, + [&](const Coordinates &) { - // Load - memcpy(buffer_row_in.data(), reinterpret_cast(in.ptr()), 2 * N * sizeof(float)); - - // Shuffle - for(size_t x = 0; x < 2 * N; x += 2) + if (is_input_complex) { - size_t idx = buffer_idx[x / 2]; - buffer_row_out[x] = buffer_row_in[2 * idx]; - buffer_row_out[x + 1] = (is_conj ? 
-buffer_row_in[2 * idx + 1] : buffer_row_in[2 * idx + 1]); - } - } - else - { - // Load - memcpy(buffer_row_in.data(), reinterpret_cast(in.ptr()), N * sizeof(float)); + // Load + memcpy(buffer_row_in.data(), reinterpret_cast(in.ptr()), 2 * N * sizeof(float)); - // Shuffle - for(size_t x = 0; x < N; ++x) + // Shuffle + for (size_t x = 0; x < 2 * N; x += 2) + { + size_t idx = buffer_idx[x / 2]; + buffer_row_out[x] = buffer_row_in[2 * idx]; + buffer_row_out[x + 1] = (is_conj ? -buffer_row_in[2 * idx + 1] : buffer_row_in[2 * idx + 1]); + } + } + else { - size_t idx = buffer_idx[x]; - buffer_row_out[2 * x] = buffer_row_in[idx]; + // Load + memcpy(buffer_row_in.data(), reinterpret_cast(in.ptr()), N * sizeof(float)); + + // Shuffle + for (size_t x = 0; x < N; ++x) + { + size_t idx = buffer_idx[x]; + buffer_row_out[2 * x] = buffer_row_in[idx]; + } } - } - // Copy back - memcpy(reinterpret_cast(out.ptr()), buffer_row_out.data(), 2 * N * sizeof(float)); - }, - in, out); + // Copy back + memcpy(reinterpret_cast(out.ptr()), buffer_row_out.data(), 2 * N * sizeof(float)); + }, + in, out); } template @@ -215,39 +230,41 @@ void NEFFTDigitReverseKernel::digit_reverse_kernel_axis_1(const Window &window) const size_t stride_z = _input->info()->strides_in_bytes()[2]; const size_t stride_w = _input->info()->strides_in_bytes()[3]; - execute_window_loop(slice, [&](const Coordinates & id) - { - auto *out_ptr = reinterpret_cast(out.ptr()); - auto *in_ptr = reinterpret_cast(_input->buffer() + id.z() * stride_z + id[3] * stride_w); - const size_t y_shuffled = buffer_idx[id.y()]; - - if(is_input_complex) + execute_window_loop( + slice, + [&](const Coordinates &id) { - // Shuffle the entire row into the output - memcpy(out_ptr, in_ptr + 2 * Nx * y_shuffled, 2 * Nx * sizeof(float)); + auto *out_ptr = reinterpret_cast(out.ptr()); + auto *in_ptr = reinterpret_cast(_input->buffer() + id.z() * stride_z + id[3] * stride_w); + const size_t y_shuffled = buffer_idx[id.y()]; - // Conjugate if necessary - if(is_conj) + if (is_input_complex) { - for(size_t x = 0; x < 2 * Nx; x += 2) + // Shuffle the entire row into the output + memcpy(out_ptr, in_ptr + 2 * Nx * y_shuffled, 2 * Nx * sizeof(float)); + + // Conjugate if necessary + if (is_conj) { - out_ptr[x + 1] = -out_ptr[x + 1]; + for (size_t x = 0; x < 2 * Nx; x += 2) + { + out_ptr[x + 1] = -out_ptr[x + 1]; + } } } - } - else - { - // Shuffle the entire row into the buffer - memcpy(buffer_row.data(), in_ptr + Nx * y_shuffled, Nx * sizeof(float)); - - // Copy the buffer to the output, with a zero imaginary part - for(size_t x = 0; x < 2 * Nx; x += 2) + else { - out_ptr[x] = buffer_row[x / 2]; + // Shuffle the entire row into the buffer + memcpy(buffer_row.data(), in_ptr + Nx * y_shuffled, Nx * sizeof(float)); + + // Copy the buffer to the output, with a zero imaginary part + for (size_t x = 0; x < 2 * Nx; x += 2) + { + out_ptr[x] = buffer_row[x / 2]; + } } - } - }, - out); + }, + out); } void NEFFTDigitReverseKernel::run(const Window &window, const ThreadInfo &info) diff --git a/src/core/NEON/kernels/NEFFTDigitReverseKernel.h b/src/core/NEON/kernels/NEFFTDigitReverseKernel.h index f436c364b2..ecf85ebc98 100644 --- a/src/core/NEON/kernels/NEFFTDigitReverseKernel.h +++ b/src/core/NEON/kernels/NEFFTDigitReverseKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_NEFFTDIGITREVERSEKERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/NEON/INEKernel.h" namespace arm_compute @@ -70,7 +71,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo 
*input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *idx, + const FFTDigitReverseKernelInfo &config); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp b/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp index 44c841f626..4b58a7b9ac 100644 --- a/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp +++ b/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp @@ -28,10 +28,11 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Window.h" -#include "src/core/NEON/wrapper/traits.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/traits.h" +#include "src/core/NEON/wrapper/wrapper.h" #include "support/ToolchainSupport.h" #include @@ -70,7 +71,7 @@ float32x2_t c_mul_neon(float32x2_t a, float32x2_t b) { using ExactTagType = typename wrapper::traits::neon_vector::tag_type; - const float32x2_t mask = { -1.0, 1.0 }; + const float32x2_t mask = {-1.0, 1.0}; const float32x2_t tmp0 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{}); const float32x2_t tmp1 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{}); @@ -88,7 +89,7 @@ float32x2_t c_mul_neon_img(float32x2_t a, float img_constant) const float a_r = wrapper::vgetlane(a, 0); const float a_i = wrapper::vgetlane(a, 1); - const auto out = wrapper::vmul(float32x2_t{ -a_i, a_r }, float32x2_t{ img_constant, img_constant }); + const auto out = wrapper::vmul(float32x2_t{-a_i, a_r}, float32x2_t{img_constant, img_constant}); return out; } @@ -100,7 +101,8 @@ float32x2_t reduce_sum_5(float32x2_t a, float32x2_t b, float32x2_t c, float32x2_ return wrapper::vadd(t2, e); } -float32x2_t reduce_sum_7(float32x2_t x1, float32x2_t x2, float32x2_t x3, float32x2_t x4, float32x2_t x5, float32x2_t x6, float32x2_t x7) +float32x2_t reduce_sum_7( + float32x2_t x1, float32x2_t x2, float32x2_t x3, float32x2_t x4, float32x2_t x5, float32x2_t x6, float32x2_t x7) { const auto t0 = wrapper::vadd(x1, x2); const auto t1 = wrapper::vadd(x3, x4); @@ -111,7 +113,14 @@ float32x2_t reduce_sum_7(float32x2_t x1, float32x2_t x2, float32x2_t x3, float32 return wrapper::vadd(t00, t01); } -float32x2_t reduce_sum_8(float32x2_t x1, float32x2_t x2, float32x2_t x3, float32x2_t x4, float32x2_t x5, float32x2_t x6, float32x2_t x7, float32x2_t x8) +float32x2_t reduce_sum_8(float32x2_t x1, + float32x2_t x2, + float32x2_t x3, + float32x2_t x4, + float32x2_t x5, + float32x2_t x6, + float32x2_t x7, + float32x2_t x8) { const auto t0 = wrapper::vadd(x1, x2); const auto t1 = wrapper::vadd(x3, x4); @@ -141,15 +150,21 @@ void fft_3(float32x2_t &x, float32x2_t &y, float32x2_t &z, const float32x2_t &w, x = wrapper::vadd(a, b); x = wrapper::vadd(x, c); - const auto v1 = wrapper::vmul(float32x2_t{ 0.5f, 0.5 }, wrapper::vadd(b, c)); - const auto v2 = c_mul_neon(float32x2_t{ 0.f, -kSqrt3Div2 }, wrapper::vsub(b, c)); + const auto v1 = wrapper::vmul(float32x2_t{0.5f, 0.5}, wrapper::vadd(b, c)); + const auto v2 = c_mul_neon(float32x2_t{0.f, -kSqrt3Div2}, wrapper::vsub(b, c)); y = z = wrapper::vsub(a, v1); y = wrapper::vadd(y, v2); z = wrapper::vsub(z, v2); } -void fft_4(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, const float32x2_t &w, const float32x2_t &w2, const float32x2_t 
&w3) +void fft_4(float32x2_t &x1, + float32x2_t &x2, + float32x2_t &x3, + float32x2_t &x4, + const float32x2_t &w, + const float32x2_t &w2, + const float32x2_t &w3) { float32x2_t a = x1; float32x2_t b = c_mul_neon(w, x2); @@ -173,7 +188,15 @@ void fft_4(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, c x4 = wrapper::vadd(x41, x42); } -void fft_5(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, float32x2_t &x5, const float32x2_t &w, const float32x2_t &w2, const float32x2_t &w3, const float32x2_t &w4) +void fft_5(float32x2_t &x1, + float32x2_t &x2, + float32x2_t &x3, + float32x2_t &x4, + float32x2_t &x5, + const float32x2_t &w, + const float32x2_t &w2, + const float32x2_t &w3, + const float32x2_t &w4) { const auto a = x1; const auto b = c_mul_neon(w, x2); @@ -181,25 +204,25 @@ void fft_5(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f const auto d = c_mul_neon(w3, x4); const auto e = c_mul_neon(w4, x5); - const auto b0 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, b); - const auto b1 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, b); - const auto b2 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, b); - const auto b3 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, b); + const auto b0 = c_mul_neon(float32x2_t{kW5_0, -kW5_1}, b); + const auto b1 = c_mul_neon(float32x2_t{-kW5_2, -kW5_3}, b); + const auto b2 = c_mul_neon(float32x2_t{-kW5_2, kW5_3}, b); + const auto b3 = c_mul_neon(float32x2_t{kW5_0, kW5_1}, b); - const auto c0 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, c); - const auto c1 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, c); - const auto c2 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, c); - const auto c3 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, c); + const auto c0 = c_mul_neon(float32x2_t{-kW5_2, -kW5_3}, c); + const auto c1 = c_mul_neon(float32x2_t{kW5_0, kW5_1}, c); + const auto c2 = c_mul_neon(float32x2_t{kW5_0, -kW5_1}, c); + const auto c3 = c_mul_neon(float32x2_t{-kW5_2, kW5_3}, c); - const auto d0 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, d); - const auto d1 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, d); - const auto d2 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, d); - const auto d3 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, d); + const auto d0 = c_mul_neon(float32x2_t{-kW5_2, kW5_3}, d); + const auto d1 = c_mul_neon(float32x2_t{kW5_0, -kW5_1}, d); + const auto d2 = c_mul_neon(float32x2_t{kW5_0, kW5_1}, d); + const auto d3 = c_mul_neon(float32x2_t{-kW5_2, -kW5_3}, d); - const auto e0 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, e); - const auto e1 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, e); - const auto e2 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, e); - const auto e3 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, e); + const auto e0 = c_mul_neon(float32x2_t{kW5_0, kW5_1}, e); + const auto e1 = c_mul_neon(float32x2_t{-kW5_2, kW5_3}, e); + const auto e2 = c_mul_neon(float32x2_t{-kW5_2, -kW5_3}, e); + const auto e3 = c_mul_neon(float32x2_t{kW5_0, -kW5_1}, e); x1 = reduce_sum_5(a, b, c, d, e); x2 = reduce_sum_5(a, b0, c0, d0, e0); @@ -208,9 +231,19 @@ void fft_5(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f x5 = reduce_sum_5(a, b3, c3, d3, e3); } -void fft_7(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, float32x2_t &x5, float32x2_t &x6, float32x2_t &x7, const float32x2_t &w, const float32x2_t &w2, const float32x2_t &w3, +void fft_7(float32x2_t &x1, + float32x2_t &x2, + float32x2_t &x3, + float32x2_t &x4, + float32x2_t &x5, + float32x2_t &x6, + float32x2_t &x7, + const float32x2_t &w, + const 
float32x2_t &w2, + const float32x2_t &w3, const float32x2_t &w4, - const float32x2_t &w5, const float32x2_t &w6) + const float32x2_t &w5, + const float32x2_t &w6) { const auto a = x1; const auto b = c_mul_neon(w, x2); @@ -220,47 +253,47 @@ void fft_7(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f const auto f = c_mul_neon(w5, x6); const auto g = c_mul_neon(w6, x7); - const auto b0 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, b); - const auto b1 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, b); - const auto b2 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, b); - const auto b3 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, b); - const auto b4 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, b); - const auto b5 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, b); - - const auto c0 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, c); - const auto c1 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, c); - const auto c2 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, c); - const auto c3 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, c); - const auto c4 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, c); - const auto c5 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, c); - - const auto d0 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, d); - const auto d1 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, d); - const auto d2 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, d); - const auto d3 = c_mul_neon(float32x2_t{ -kW7_2, +kW7_3 }, d); - const auto d4 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, d); - const auto d5 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, d); - - const auto e0 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, e); - const auto e1 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, e); - const auto e2 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, e); - const auto e3 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, e); - const auto e4 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, e); - const auto e5 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, e); - - const auto f0 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, f); - const auto f1 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, f); - const auto f2 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, f); - const auto f3 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, f); - const auto f4 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, f); - const auto f5 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, f); - - const auto g0 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, g); - const auto g1 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, g); - const auto g2 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, g); - const auto g3 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, g); - const auto g4 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, g); - const auto g5 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, g); + const auto b0 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, b); + const auto b1 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, b); + const auto b2 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, b); + const auto b3 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, b); + const auto b4 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, b); + const auto b5 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, b); + + const auto c0 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, c); + const auto c1 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, c); + const auto c2 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, c); + const auto c3 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, c); + const auto c4 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, c); + const auto c5 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, c); + + const auto d0 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, d); + const auto d1 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, d); + const auto d2 = 
c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, d); + const auto d3 = c_mul_neon(float32x2_t{-kW7_2, +kW7_3}, d); + const auto d4 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, d); + const auto d5 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, d); + + const auto e0 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, e); + const auto e1 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, e); + const auto e2 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, e); + const auto e3 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, e); + const auto e4 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, e); + const auto e5 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, e); + + const auto f0 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, f); + const auto f1 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, f); + const auto f2 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, f); + const auto f3 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, f); + const auto f4 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, f); + const auto f5 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, f); + + const auto g0 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, g); + const auto g1 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, g); + const auto g2 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, g); + const auto g3 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, g); + const auto g4 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, g); + const auto g5 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, g); x1 = reduce_sum_7(a, b, c, d, e, f, g); x2 = reduce_sum_7(a, b0, c0, d0, e0, f0, g0); @@ -271,9 +304,20 @@ void fft_7(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f x7 = reduce_sum_7(a, b5, c5, d5, e5, f5, g5); } -void fft_8(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, float32x2_t &x5, float32x2_t &x6, float32x2_t &x7, float32x2_t &x8, const float32x2_t &w, const float32x2_t &w2, +void fft_8(float32x2_t &x1, + float32x2_t &x2, + float32x2_t &x3, + float32x2_t &x4, + float32x2_t &x5, + float32x2_t &x6, + float32x2_t &x7, + float32x2_t &x8, + const float32x2_t &w, + const float32x2_t &w2, const float32x2_t &w3, - const float32x2_t &w4, const float32x2_t &w5, const float32x2_t &w6, + const float32x2_t &w4, + const float32x2_t &w5, + const float32x2_t &w6, const float32x2_t &w7) { const auto a = x1; @@ -285,61 +329,61 @@ void fft_8(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f const auto g = c_mul_neon(w6, x7); const auto h = c_mul_neon(w7, x8); - const auto b0 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, b); - const auto b1 = c_mul_neon(float32x2_t{ 0, -1 }, b); - const auto b2 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, b); - const auto b3 = c_mul_neon(float32x2_t{ -1, 0 }, b); - const auto b4 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, b); - const auto b5 = c_mul_neon(float32x2_t{ 0, 1 }, b); - const auto b6 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, b); - - const auto c0 = c_mul_neon(float32x2_t{ 0, -1 }, c); - const auto c1 = c_mul_neon(float32x2_t{ -1, 0 }, c); - const auto c2 = c_mul_neon(float32x2_t{ 0, 1 }, c); - const auto c3 = c_mul_neon(float32x2_t{ 1, 0 }, c); - const auto c4 = c_mul_neon(float32x2_t{ 0, -1 }, c); - const auto c5 = c_mul_neon(float32x2_t{ -1, 0 }, c); - const auto c6 = c_mul_neon(float32x2_t{ 0, 1 }, c); - - const auto d0 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, d); - const auto d1 = c_mul_neon(float32x2_t{ 0, 1 }, d); - const auto d2 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, d); - const auto d3 = c_mul_neon(float32x2_t{ -1, 0 }, d); - const auto d4 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, d); - const auto d5 = 
c_mul_neon(float32x2_t{ 0, -1 }, d); - const auto d6 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, d); - - const auto e0 = c_mul_neon(float32x2_t{ -1, 0 }, e); - const auto e1 = c_mul_neon(float32x2_t{ 1, 0 }, e); - const auto e2 = c_mul_neon(float32x2_t{ -1, 0 }, e); - const auto e3 = c_mul_neon(float32x2_t{ 1, 0 }, e); - const auto e4 = c_mul_neon(float32x2_t{ -1, 0 }, e); - const auto e5 = c_mul_neon(float32x2_t{ 1, 0 }, e); - const auto e6 = c_mul_neon(float32x2_t{ -1, 0 }, e); - - const auto f0 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, f); - const auto f1 = c_mul_neon(float32x2_t{ 0, -1 }, f); - const auto f2 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, f); - const auto f3 = c_mul_neon(float32x2_t{ -1, 0 }, f); - const auto f4 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, f); - const auto f5 = c_mul_neon(float32x2_t{ 0, 1 }, f); - const auto f6 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, f); - - const auto g0 = c_mul_neon(float32x2_t{ 0, 1 }, g); - const auto g1 = c_mul_neon(float32x2_t{ -1, 0 }, g); - const auto g2 = c_mul_neon(float32x2_t{ 0, -1 }, g); - const auto g3 = c_mul_neon(float32x2_t{ 1, 0 }, g); - const auto g4 = c_mul_neon(float32x2_t{ 0, 1 }, g); - const auto g5 = c_mul_neon(float32x2_t{ -1, 0 }, g); - const auto g6 = c_mul_neon(float32x2_t{ 0, -1 }, g); - - const auto h0 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, h); - const auto h1 = c_mul_neon(float32x2_t{ 0, 1 }, h); - const auto h2 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, h); - const auto h3 = c_mul_neon(float32x2_t{ -1, 0 }, h); - const auto h4 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, h); - const auto h5 = c_mul_neon(float32x2_t{ 0, -1 }, h); - const auto h6 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, h); + const auto b0 = c_mul_neon(float32x2_t{kSqrt2Div2, -kSqrt2Div2}, b); + const auto b1 = c_mul_neon(float32x2_t{0, -1}, b); + const auto b2 = c_mul_neon(float32x2_t{-kSqrt2Div2, -kSqrt2Div2}, b); + const auto b3 = c_mul_neon(float32x2_t{-1, 0}, b); + const auto b4 = c_mul_neon(float32x2_t{-kSqrt2Div2, kSqrt2Div2}, b); + const auto b5 = c_mul_neon(float32x2_t{0, 1}, b); + const auto b6 = c_mul_neon(float32x2_t{kSqrt2Div2, kSqrt2Div2}, b); + + const auto c0 = c_mul_neon(float32x2_t{0, -1}, c); + const auto c1 = c_mul_neon(float32x2_t{-1, 0}, c); + const auto c2 = c_mul_neon(float32x2_t{0, 1}, c); + const auto c3 = c_mul_neon(float32x2_t{1, 0}, c); + const auto c4 = c_mul_neon(float32x2_t{0, -1}, c); + const auto c5 = c_mul_neon(float32x2_t{-1, 0}, c); + const auto c6 = c_mul_neon(float32x2_t{0, 1}, c); + + const auto d0 = c_mul_neon(float32x2_t{-kSqrt2Div2, -kSqrt2Div2}, d); + const auto d1 = c_mul_neon(float32x2_t{0, 1}, d); + const auto d2 = c_mul_neon(float32x2_t{kSqrt2Div2, -kSqrt2Div2}, d); + const auto d3 = c_mul_neon(float32x2_t{-1, 0}, d); + const auto d4 = c_mul_neon(float32x2_t{kSqrt2Div2, kSqrt2Div2}, d); + const auto d5 = c_mul_neon(float32x2_t{0, -1}, d); + const auto d6 = c_mul_neon(float32x2_t{-kSqrt2Div2, kSqrt2Div2}, d); + + const auto e0 = c_mul_neon(float32x2_t{-1, 0}, e); + const auto e1 = c_mul_neon(float32x2_t{1, 0}, e); + const auto e2 = c_mul_neon(float32x2_t{-1, 0}, e); + const auto e3 = c_mul_neon(float32x2_t{1, 0}, e); + const auto e4 = c_mul_neon(float32x2_t{-1, 0}, e); + const auto e5 = c_mul_neon(float32x2_t{1, 0}, e); + const auto e6 = c_mul_neon(float32x2_t{-1, 0}, e); + + const auto f0 = c_mul_neon(float32x2_t{-kSqrt2Div2, kSqrt2Div2}, f); + const auto f1 = c_mul_neon(float32x2_t{0, -1}, f); + const 
auto f2 = c_mul_neon(float32x2_t{kSqrt2Div2, kSqrt2Div2}, f); + const auto f3 = c_mul_neon(float32x2_t{-1, 0}, f); + const auto f4 = c_mul_neon(float32x2_t{kSqrt2Div2, -kSqrt2Div2}, f); + const auto f5 = c_mul_neon(float32x2_t{0, 1}, f); + const auto f6 = c_mul_neon(float32x2_t{-kSqrt2Div2, -kSqrt2Div2}, f); + + const auto g0 = c_mul_neon(float32x2_t{0, 1}, g); + const auto g1 = c_mul_neon(float32x2_t{-1, 0}, g); + const auto g2 = c_mul_neon(float32x2_t{0, -1}, g); + const auto g3 = c_mul_neon(float32x2_t{1, 0}, g); + const auto g4 = c_mul_neon(float32x2_t{0, 1}, g); + const auto g5 = c_mul_neon(float32x2_t{-1, 0}, g); + const auto g6 = c_mul_neon(float32x2_t{0, -1}, g); + + const auto h0 = c_mul_neon(float32x2_t{kSqrt2Div2, kSqrt2Div2}, h); + const auto h1 = c_mul_neon(float32x2_t{0, 1}, h); + const auto h2 = c_mul_neon(float32x2_t{-kSqrt2Div2, kSqrt2Div2}, h); + const auto h3 = c_mul_neon(float32x2_t{-1, 0}, h); + const auto h4 = c_mul_neon(float32x2_t{-kSqrt2Div2, -kSqrt2Div2}, h); + const auto h5 = c_mul_neon(float32x2_t{0, -1}, h); + const auto h6 = c_mul_neon(float32x2_t{kSqrt2Div2, -kSqrt2Div2}, h); x1 = reduce_sum_8(a, b, c, d, e, f, g, h); x2 = reduce_sum_8(a, b0, c0, d0, e0, f0, g0, h0); @@ -352,18 +396,19 @@ void fft_8(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f } template -void fft_radix_2_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) +void fft_radix_2_axes_0( + float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { - for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) { - auto a = float32x2_t{ 0, 0 }; - auto b = float32x2_t{ 0, 0 }; + auto a = float32x2_t{0, 0}; + auto b = float32x2_t{0, 0}; // Load inputs - if(first_stage) + if (first_stage) { const auto ab = wrapper::vloadq(in + k); a = wrapper::vgetlow(ab); @@ -379,7 +424,7 @@ void fft_radix_2_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR fft_2(a, b, w); // Write outputs - if(first_stage) + if (first_stage) { wrapper::vstore(out + k, wrapper::vcombine(a, b)); } @@ -394,12 +439,20 @@ void fft_radix_2_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR } } -void fft_radix_2_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x) +void fft_radix_2_axes_1(float *out, + float *in, + unsigned int Nx, + unsigned int NxRadix, + const float32x2_t &w_m, + unsigned int N, + unsigned int M, + unsigned int in_pad_x, + unsigned int out_pad_x) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { - for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) { // Load inputs float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k); @@ -418,20 +471,21 @@ void fft_radix_2_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR } template -void fft_radix_3_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) +void fft_radix_3_axes_0( + float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) { - float32x2_t w{ 
1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { const auto w2 = c_mul_neon(w, w); - for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) { // Load inputs - float32x2_t a = { 0, 0 }; - float32x2_t b = { 0, 0 }; - float32x2_t c = { 0, 0 }; - if(first_stage) + float32x2_t a = {0, 0}; + float32x2_t b = {0, 0}; + float32x2_t c = {0, 0}; + if (first_stage) { const auto ab = wrapper::vloadq(in + k); a = wrapper::vgetlow(ab); @@ -447,7 +501,7 @@ void fft_radix_3_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR // Base-case prime transform fft_3(a, b, c, w, w2); - if(first_stage) + if (first_stage) { wrapper::vstore(out + k, wrapper::vcombine(a, b)); } @@ -462,14 +516,22 @@ void fft_radix_3_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR } } -void fft_radix_3_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x) +void fft_radix_3_axes_1(float *out, + float *in, + unsigned int Nx, + unsigned int NxRadix, + const float32x2_t &w_m, + unsigned int N, + unsigned int M, + unsigned int in_pad_x, + unsigned int out_pad_x) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { const auto w2 = c_mul_neon(w, w); - for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) { // Load inputs float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k); @@ -489,21 +551,22 @@ void fft_radix_3_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR } template -void fft_radix_4_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) +void fft_radix_4_axes_0( + float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { const auto w2 = c_mul_neon(w, w); const auto w3 = c_mul_neon(w2, w); - for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) { - float32x2_t a = { 0, 0 }; - float32x2_t b = { 0, 0 }; - float32x2_t c = { 0, 0 }; - float32x2_t d = { 0, 0 }; - if(first_stage) + float32x2_t a = {0, 0}; + float32x2_t b = {0, 0}; + float32x2_t c = {0, 0}; + float32x2_t d = {0, 0}; + if (first_stage) { const auto ab = wrapper::vloadq(in + k); const auto cd = wrapper::vloadq(in + k + 4 * Nx); @@ -524,7 +587,7 @@ void fft_radix_4_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR // Base-case prime transform fft_4(a, b, c, d, w, w2, w3); - if(first_stage) + if (first_stage) { wrapper::vstore(out + k, wrapper::vcombine(a, b)); wrapper::vstore(out + k + 4 * Nx, wrapper::vcombine(c, d)); @@ -542,15 +605,23 @@ void fft_radix_4_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR } } -void fft_radix_4_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x) +void fft_radix_4_axes_1(float *out, + float *in, + unsigned int Nx, + unsigned int NxRadix, + const float32x2_t &w_m, + unsigned int N, + unsigned int M, + unsigned int in_pad_x, + unsigned int out_pad_x) { - 
float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { const auto w2 = c_mul_neon(w, w); const auto w3 = c_mul_neon(w2, w); - for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) { // Load inputs float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k); @@ -572,25 +643,26 @@ void fft_radix_4_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR } template -void fft_radix_5_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) +void fft_radix_5_axes_0( + float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { const float32x2_t w2 = c_mul_neon(w, w); const float32x2_t w3 = c_mul_neon(w2, w); const float32x2_t w4 = c_mul_neon(w3, w); - for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) { - float32x2_t a = { 0, 0 }; - float32x2_t b = { 0, 0 }; - float32x2_t c = { 0, 0 }; - float32x2_t d = { 0, 0 }; - float32x2_t e = { 0, 0 }; + float32x2_t a = {0, 0}; + float32x2_t b = {0, 0}; + float32x2_t c = {0, 0}; + float32x2_t d = {0, 0}; + float32x2_t e = {0, 0}; // Load inputs - if(first_stage) + if (first_stage) { const auto ab = wrapper::vloadq(in + k); const auto cd = wrapper::vloadq(in + k + 4 * Nx); @@ -613,7 +685,7 @@ void fft_radix_5_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR fft_5(a, b, c, d, e, w, w2, w3, w4); // Store outputs - if(first_stage) + if (first_stage) { wrapper::vstore(out + k, wrapper::vcombine(a, b)); wrapper::vstore(out + k + 4 * Nx, wrapper::vcombine(c, d)); @@ -632,16 +704,24 @@ void fft_radix_5_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR } } -void fft_radix_5_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x) +void fft_radix_5_axes_1(float *out, + float *in, + unsigned int Nx, + unsigned int NxRadix, + const float32x2_t &w_m, + unsigned int N, + unsigned int M, + unsigned int in_pad_x, + unsigned int out_pad_x) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { const float32x2_t w2 = c_mul_neon(w, w); const float32x2_t w3 = c_mul_neon(w2, w); const float32x2_t w4 = c_mul_neon(w3, w); - for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) { // Load inputs float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k); @@ -666,10 +746,11 @@ void fft_radix_5_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR } template -void fft_radix_7_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) +void fft_radix_7_axes_0( + float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { const float32x2_t w2 = c_mul_neon(w, w); const float32x2_t w3 = c_mul_neon(w2, w); @@ -677,18 +758,18 @@ void fft_radix_7_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR const 
float32x2_t w5 = c_mul_neon(w4, w); const float32x2_t w6 = c_mul_neon(w5, w); - for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) { - float32x2_t a = { 0, 0 }; - float32x2_t b = { 0, 0 }; - float32x2_t c = { 0, 0 }; - float32x2_t d = { 0, 0 }; - float32x2_t e = { 0, 0 }; - float32x2_t f = { 0, 0 }; - float32x2_t g = { 0, 0 }; + float32x2_t a = {0, 0}; + float32x2_t b = {0, 0}; + float32x2_t c = {0, 0}; + float32x2_t d = {0, 0}; + float32x2_t e = {0, 0}; + float32x2_t f = {0, 0}; + float32x2_t g = {0, 0}; // Load inputs - if(first_stage) + if (first_stage) { const auto ab = wrapper::vloadq(in + k); const auto cd = wrapper::vloadq(in + k + 4 * Nx); @@ -715,7 +796,7 @@ void fft_radix_7_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR // Base-case prime transform fft_7(a, b, c, d, e, f, g, w, w2, w3, w4, w5, w6); - if(first_stage) + if (first_stage) { wrapper::vstore(out + k, wrapper::vcombine(a, b)); wrapper::vstore(out + k + 4 * Nx, wrapper::vcombine(c, d)); @@ -737,10 +818,18 @@ void fft_radix_7_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR } } -void fft_radix_7_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x) +void fft_radix_7_axes_1(float *out, + float *in, + unsigned int Nx, + unsigned int NxRadix, + const float32x2_t &w_m, + unsigned int N, + unsigned int M, + unsigned int in_pad_x, + unsigned int out_pad_x) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { const float32x2_t w2 = c_mul_neon(w, w); const float32x2_t w3 = c_mul_neon(w2, w); @@ -748,7 +837,7 @@ void fft_radix_7_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR const float32x2_t w5 = c_mul_neon(w4, w); const float32x2_t w6 = c_mul_neon(w5, w); - for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) { // Load inputs float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k); @@ -777,10 +866,11 @@ void fft_radix_7_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR } template -void fft_radix_8_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) +void fft_radix_8_axes_0( + float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { const float32x2_t w2 = c_mul_neon(w, w); const float32x2_t w3 = c_mul_neon(w2, w); @@ -789,20 +879,20 @@ void fft_radix_8_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR const float32x2_t w6 = c_mul_neon(w5, w); const float32x2_t w7 = c_mul_neon(w6, w); - for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix) { // Load inputs - float32x2_t a = { 0, 0 }; - float32x2_t b = { 0, 0 }; - float32x2_t c = { 0, 0 }; - float32x2_t d = { 0, 0 }; - float32x2_t e = { 0, 0 }; - float32x2_t f = { 0, 0 }; - float32x2_t g = { 0, 0 }; - float32x2_t h = { 0, 0 }; + float32x2_t a = {0, 0}; + float32x2_t b = {0, 0}; + float32x2_t c = {0, 0}; + float32x2_t d = {0, 0}; + float32x2_t e = {0, 0}; + float32x2_t f = {0, 0}; + float32x2_t g = {0, 0}; + float32x2_t h = {0, 0}; // Base-case prime transform - 
if(first_stage) + if (first_stage) { const auto ab = wrapper::vloadq(in + k); const auto cd = wrapper::vloadq(in + k + 4 * Nx); @@ -834,7 +924,7 @@ void fft_radix_8_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR fft_8(a, b, c, d, e, f, g, h, w, w2, w3, w4, w5, w6, w7); // Store outputs - if(first_stage) + if (first_stage) { wrapper::vstore(out + k, wrapper::vcombine(a, b)); wrapper::vstore(out + k + 4 * Nx, wrapper::vcombine(c, d)); @@ -858,10 +948,18 @@ void fft_radix_8_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR } } -void fft_radix_8_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x) +void fft_radix_8_axes_1(float *out, + float *in, + unsigned int Nx, + unsigned int NxRadix, + const float32x2_t &w_m, + unsigned int N, + unsigned int M, + unsigned int in_pad_x, + unsigned int out_pad_x) { - float32x2_t w{ 1.0f, 0.0f }; - for(unsigned int j = 0; j < Nx; j++) + float32x2_t w{1.0f, 0.0f}; + for (unsigned int j = 0; j < Nx; j++) { const float32x2_t w2 = c_mul_neon(w, w); const float32x2_t w3 = c_mul_neon(w2, w); @@ -870,7 +968,7 @@ void fft_radix_8_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR const float32x2_t w6 = c_mul_neon(w5, w); const float32x2_t w7 = c_mul_neon(w6, w); - for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) + for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix) { // Load inputs float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k); @@ -908,7 +1006,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c ARM_COMPUTE_UNUSED(config); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -917,11 +1015,12 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c return Status{}; } -std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const FFTRadixStageKernelInfo &config) +std::pair +validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const FFTRadixStageKernelInfo &config) { ARM_COMPUTE_UNUSED(config); - if(output != nullptr) + if (output != nullptr) { auto_init_if_empty(*output, *input); } @@ -942,7 +1041,7 @@ void NEFFTRadixStageKernel::set_radix_stage_axis0(const FFTRadixStageKernelInfo // FFT table axis 0: [radix, first_stage] static std::map> fft_table_axis0; - if(fft_table_axis0.empty()) + if (fft_table_axis0.empty()) { fft_table_axis0[2][false] = &fft_radix_2_axes_0; fft_table_axis0[3][false] = &fft_radix_3_axes_0; @@ -967,7 +1066,7 @@ void NEFFTRadixStageKernel::set_radix_stage_axis1(const FFTRadixStageKernelInfo // FFT table axis 1: [radix, first_stage] static std::map fft_table_axis1; - if(fft_table_axis1.empty()) + if (fft_table_axis1.empty()) { fft_table_axis1[2] = &fft_radix_2_axes_1; fft_table_axis1[3] = &fft_radix_3_axes_1; @@ -985,12 +1084,13 @@ void NEFFTRadixStageKernel::configure(ITensor *input, ITensor *output, const FFT ARM_COMPUTE_ERROR_ON_NULLPTR(input); // Output auto inizialitation if not yet initialized - if(output != nullptr) + if (output != nullptr) { auto_init_if_empty(*output->info(), *input->info()->clone()); } - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? 
output->info() : nullptr, config)); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, config)); _input = input; _output = (output == nullptr) ? input : output; @@ -998,7 +1098,7 @@ void NEFFTRadixStageKernel::configure(ITensor *input, ITensor *output, const FFT _axis = config.axis; _radix = config.radix; - switch(config.axis) + switch (config.axis) { case 0: set_radix_stage_axis0(config); @@ -1012,26 +1112,28 @@ void NEFFTRadixStageKernel::configure(ITensor *input, ITensor *output, const FFT } // Configure kernel window - auto win_config = validate_and_configure_window(input->info(), (output != nullptr) ? output->info() : nullptr, config); + auto win_config = + validate_and_configure_window(input->info(), (output != nullptr) ? output->info() : nullptr, config); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); INEKernel::configure(win_config.second); } -Status NEFFTRadixStageKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const FFTRadixStageKernelInfo &config) +Status NEFFTRadixStageKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const FFTRadixStageKernelInfo &config) { const bool run_in_place = (output == nullptr) || (output == input); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, config)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), - (run_in_place) ? nullptr : output->clone().get(), - config) - .first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(input->clone().get(), (run_in_place) ? nullptr : output->clone().get(), config) + .first); return Status{}; } std::set NEFFTRadixStageKernel::supported_radix() { - return std::set { 2, 3, 4, 5, 7, 8 }; + return std::set{2, 3, 4, 5, 7, 8}; } void NEFFTRadixStageKernel::run(const Window &window, const ThreadInfo &info) @@ -1049,28 +1151,32 @@ void NEFFTRadixStageKernel::run(const Window &window, const ThreadInfo &info) // Precompute FFT constants const unsigned int NxRadix = _radix * _Nx; const float alpha = 2.0f * kPi / float(NxRadix); - const float32x2_t w_m{ cosf(alpha), -sinf(alpha) }; + const float32x2_t w_m{cosf(alpha), -sinf(alpha)}; - if(_axis == 0) + if (_axis == 0) { const unsigned int N = _input->info()->dimension(0); - execute_window_loop(input_window, [&](const Coordinates &) - { - _func_0(reinterpret_cast(out.ptr()), reinterpret_cast(in.ptr()), _Nx, NxRadix, w_m, N); - }, - in, out); + execute_window_loop( + input_window, + [&](const Coordinates &) { + _func_0(reinterpret_cast(out.ptr()), reinterpret_cast(in.ptr()), _Nx, NxRadix, w_m, + N); + }, + in, out); } else { const unsigned int N = _input->info()->dimension(0); const unsigned int M = _input->info()->dimension(1); - execute_window_loop(input_window, [&](const Coordinates &) - { - _func_1(reinterpret_cast(out.ptr()), reinterpret_cast(in.ptr()), _Nx, NxRadix, w_m, N, M, - _input->info()->padding().right + _input->info()->padding().left, - _output->info()->padding().right + _output->info()->padding().left); - }, - in, out); + execute_window_loop( + input_window, + [&](const Coordinates &) + { + _func_1(reinterpret_cast(out.ptr()), reinterpret_cast(in.ptr()), _Nx, NxRadix, w_m, N, + M, _input->info()->padding().right + _input->info()->padding().left, + _output->info()->padding().right + _output->info()->padding().left); + }, + in, out); } ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); diff --git a/src/core/NEON/kernels/NEFFTRadixStageKernel.h b/src/core/NEON/kernels/NEFFTRadixStageKernel.h index 
2291a1068c..54f32efa23 100644 --- a/src/core/NEON/kernels/NEFFTRadixStageKernel.h +++ b/src/core/NEON/kernels/NEFFTRadixStageKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_NEFFTRADIXSTAGEKERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/NEON/INEKernel.h" #include @@ -92,8 +93,17 @@ private: void set_radix_stage_axis0(const FFTRadixStageKernelInfo &config); void set_radix_stage_axis1(const FFTRadixStageKernelInfo &config); - using FFTFunctionPointerAxis0 = std::function; - using FFTFunctionPointerAxis1 = std::function; + using FFTFunctionPointerAxis0 = + std::function; + using FFTFunctionPointerAxis1 = std::function; FFTFunctionPointerAxis0 _func_0; FFTFunctionPointerAxis1 _func_1; diff --git a/src/core/NEON/kernels/NEFFTScaleKernel.cpp b/src/core/NEON/kernels/NEFFTScaleKernel.cpp index 5ec330bebc..9fe561fc59 100644 --- a/src/core/NEON/kernels/NEFFTScaleKernel.cpp +++ b/src/core/NEON/kernels/NEFFTScaleKernel.cpp @@ -28,9 +28,10 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" #include @@ -41,8 +42,8 @@ namespace void scale_complex(float *c_in, float *c_out, bool is_conjugate, float scale) { const auto a = wrapper::vload(c_in); - auto b = wrapper::vdiv(a, float32x2_t{ scale, scale }); - if(is_conjugate) + auto b = wrapper::vdiv(a, float32x2_t{scale, scale}); + if (is_conjugate) { const float img_part = wrapper::vgetlane(b, 1); b = wrapper::vsetlane(-img_part, b, 1); @@ -56,7 +57,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F32); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 1 && output->num_channels() != 2); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); @@ -71,7 +72,7 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen // Configure kernel window Window win = calculate_max_window(*input, Steps()); - if(output != nullptr) + if (output != nullptr) { // Output auto inizialitation if not yet initialized auto_init_if_empty(*output, *input->clone()); @@ -126,10 +127,10 @@ void NEFFTScaleKernel::run(const Window &window, const ThreadInfo &info) Iterator in(_input, input_window); Iterator out(_run_in_place ? 
_input : _output, input_window); - execute_window_loop(window, [&](const Coordinates &) - { - scale_complex(reinterpret_cast(in.ptr()), reinterpret_cast(out.ptr()), _is_conj, _scale); - }, - in, out); + execute_window_loop( + window, + [&](const Coordinates &) + { scale_complex(reinterpret_cast(in.ptr()), reinterpret_cast(out.ptr()), _is_conj, _scale); }, + in, out); } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NEFFTScaleKernel.h b/src/core/NEON/kernels/NEFFTScaleKernel.h index 24a19f98ba..608cf5ea34 100644 --- a/src/core/NEON/kernels/NEFFTScaleKernel.h +++ b/src/core/NEON/kernels/NEFFTScaleKernel.h @@ -24,10 +24,10 @@ #ifndef ARM_COMPUTE_NEFFTSCALEKERNEL_H #define ARM_COMPUTE_NEFFTSCALEKERNEL_H -#include "src/core/NEON/INEKernel.h" - #include "arm_compute/core/KernelDescriptors.h" +#include "src/core/NEON/INEKernel.h" + namespace arm_compute { // Forward declarations diff --git a/src/core/NEON/kernels/NEFillBorderKernel.cpp b/src/core/NEON/kernels/NEFillBorderKernel.cpp index 1c7c1f9763..00b0c0ae8d 100644 --- a/src/core/NEON/kernels/NEFillBorderKernel.cpp +++ b/src/core/NEON/kernels/NEFillBorderKernel.cpp @@ -30,14 +30,19 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "src/core/NEON/kernels/NEFillBorderKernel.h" + #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/kernels/NEFillBorderKernel.h" namespace arm_compute { namespace { -inline void fill_constant_value_single_channel_special(ITensor *tensor, const Window &window, unsigned int right, unsigned int bottom, const PixelValue &constant_border_value) +inline void fill_constant_value_single_channel_special(ITensor *tensor, + const Window &window, + unsigned int right, + unsigned int bottom, + const PixelValue &constant_border_value) { float border_value; constant_border_value.get(border_value); @@ -52,39 +57,43 @@ inline void fill_constant_value_single_channel_special(ITensor *tensor, const Wi Iterator vertical_it(tensor, vertical); - execute_window_loop(vertical, [&](const Coordinates &) - { - const auto row_start = reinterpret_cast(start_valid_region + vertical_it.offset()); + execute_window_loop( + vertical, + [&](const Coordinates &) + { + const auto row_start = reinterpret_cast(start_valid_region + vertical_it.offset()); - // Fill left and right borders - *(row_start - 1) = border_value; - std::fill_n(row_start + width, right, border_value); - }, - vertical_it); + // Fill left and right borders + *(row_start - 1) = border_value; + std::fill_n(row_start + width, right, border_value); + }, + vertical_it); // Top and bottom border Iterator plane_it(tensor, window); // Iterate over all XY planes - execute_window_loop(window, [&](const Coordinates &) - { - uint8_t *base_addr = start_valid_region + plane_it.offset(); - // Top border - const auto row_start = reinterpret_cast(base_addr - stridey); - // Fill top rows including left/right borders - std::fill_n(row_start - 1, 1 + width + right, border_value); - - // Bottom border - const unsigned low_border_size = height + bottom; - for(unsigned int i = height; i < low_border_size; ++i) + execute_window_loop( + window, + [&](const Coordinates &) { - const auto row_start = reinterpret_cast(base_addr + i * stridey); - - // Fill bottom rows including left/right borders + uint8_t *base_addr = start_valid_region + plane_it.offset(); + // Top border + const auto row_start = reinterpret_cast(base_addr - stridey); + // Fill top rows including left/right borders std::fill_n(row_start - 1, 
1 + width + right, border_value); - } - }, - plane_it); + + // Bottom border + const unsigned low_border_size = height + bottom; + for (unsigned int i = height; i < low_border_size; ++i) + { + const auto row_start = reinterpret_cast(base_addr + i * stridey); + + // Fill bottom rows including left/right borders + std::fill_n(row_start - 1, 1 + width + right, border_value); + } + }, + plane_it); } } // namespace @@ -93,14 +102,20 @@ NEFillBorderKernel::NEFillBorderKernel() { } -void NEFillBorderKernel::configure(ITensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value) +void NEFillBorderKernel::configure(ITensor *tensor, + BorderSize border_size, + BorderMode border_mode, + const PixelValue &constant_border_value) { ARM_COMPUTE_ERROR_ON_NULLPTR(tensor); _tensor = tensor; configure(tensor->info(), border_size, border_mode, constant_border_value); } -void NEFillBorderKernel::configure(ITensorInfo *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value) +void NEFillBorderKernel::configure(ITensorInfo *tensor, + BorderSize border_size, + BorderMode border_mode, + const PixelValue &constant_border_value) { ARM_COMPUTE_ERROR_ON_NULLPTR(tensor); //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions. @@ -124,7 +139,7 @@ void NEFillBorderKernel::run(const Window &window, const ThreadInfo &info) ARM_COMPUTE_UNUSED(info); // If there is no border: early exit - if(_border_size.empty()) + if (_border_size.empty()) { return; } @@ -132,13 +147,14 @@ void NEFillBorderKernel::run(const Window &window, const ThreadInfo &info) ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - switch(_mode) + switch (_mode) { case BorderMode::CONSTANT: { - if(_border_size.left == 1 && _border_size.top == 1 && _tensor->info()->data_type() == DataType::F32) + if (_border_size.left == 1 && _border_size.top == 1 && _tensor->info()->data_type() == DataType::F32) { - fill_constant_value_single_channel_special(_tensor, window, _border_size.right, _border_size.bottom, _constant_border_value); + fill_constant_value_single_channel_special(_tensor, window, _border_size.right, _border_size.bottom, + _constant_border_value); } else { @@ -176,46 +192,56 @@ void NEFillBorderKernel::fill_replicate_single_channel(const Window &window) Iterator vertical_it(_tensor, vertical); - execute_window_loop(vertical, [&](const Coordinates &) - { - uint8_t *base_addr = start_valid_region + vertical_it.offset(); - // Fill left and right borders - for(unsigned int i = 0; i < _border_size.left; ++i) + execute_window_loop( + vertical, + [&](const Coordinates &) { - std::memcpy(base_addr + static_cast(i - _border_size.left) * element_size, vertical_it.ptr(), element_size); - } + uint8_t *base_addr = start_valid_region + vertical_it.offset(); + // Fill left and right borders + for (unsigned int i = 0; i < _border_size.left; ++i) + { + std::memcpy(base_addr + static_cast(i - _border_size.left) * element_size, vertical_it.ptr(), + element_size); + } - for(unsigned int i = 0; i < _border_size.right; ++i) - { - std::memcpy(base_addr + (width + i) * element_size, vertical_it.ptr() + (width - 1) * element_size, element_size); - } - }, - vertical_it); + for (unsigned int i = 0; i < _border_size.right; ++i) + { + std::memcpy(base_addr + (width + i) * element_size, vertical_it.ptr() + (width - 1) * element_size, + element_size); + } + }, + 
vertical_it); // Top and bottom border Iterator plane_it(_tensor, window); // Iterate over all XY planes - execute_window_loop(window, [&](const Coordinates &) - { - uint8_t *base_addr = start_valid_region + plane_it.offset(); - // Top border - for(int i = -_border_size.top; i < 0; ++i) + execute_window_loop( + window, + [&](const Coordinates &) { - // Copy top rows including left/right borders - std::memcpy(base_addr + i * static_cast(_tensor->info()->strides_in_bytes()[1]) - _border_size.left * element_size, - base_addr - _border_size.left * element_size, (_border_size.left + width + _border_size.right) * element_size); - } + uint8_t *base_addr = start_valid_region + plane_it.offset(); + // Top border + for (int i = -_border_size.top; i < 0; ++i) + { + // Copy top rows including left/right borders + std::memcpy(base_addr + i * static_cast(_tensor->info()->strides_in_bytes()[1]) - + _border_size.left * element_size, + base_addr - _border_size.left * element_size, + (_border_size.left + width + _border_size.right) * element_size); + } - // Bottom border - for(unsigned int i = height; i < height + _border_size.bottom; ++i) - { - // Copy bottom rows including left/right borders - std::memcpy(base_addr + i * _tensor->info()->strides_in_bytes()[1] - _border_size.left * element_size, - base_addr + (height - 1) * _tensor->info()->strides_in_bytes()[1] - _border_size.left * element_size, (_border_size.left + width + _border_size.right) * element_size); - } - }, - plane_it); + // Bottom border + for (unsigned int i = height; i < height + _border_size.bottom; ++i) + { + // Copy bottom rows including left/right borders + std::memcpy(base_addr + i * _tensor->info()->strides_in_bytes()[1] - _border_size.left * element_size, + base_addr + (height - 1) * _tensor->info()->strides_in_bytes()[1] - + _border_size.left * element_size, + (_border_size.left + width + _border_size.right) * element_size); + } + }, + plane_it); } void NEFillBorderKernel::fill_constant_value_single_channel(const Window &window) @@ -232,50 +258,57 @@ void NEFillBorderKernel::fill_constant_value_single_channel(const Window &window Iterator vertical_it(_tensor, vertical); - execute_window_loop(vertical, [&](const Coordinates &) - { - uint8_t *base_addr = start_valid_region + vertical_it.offset(); - // Fill left and right borders - for(unsigned int i = 0; i < _border_size.left; ++i) + execute_window_loop( + vertical, + [&](const Coordinates &) { - std::memcpy(base_addr + static_cast(i - _border_size.left) * element_size, &_constant_border_value, element_size); - } + uint8_t *base_addr = start_valid_region + vertical_it.offset(); + // Fill left and right borders + for (unsigned int i = 0; i < _border_size.left; ++i) + { + std::memcpy(base_addr + static_cast(i - _border_size.left) * element_size, &_constant_border_value, + element_size); + } - for(unsigned int i = 0; i < _border_size.right; ++i) - { - std::memcpy(base_addr + (width + i) * element_size, &_constant_border_value, element_size); - } - }, - vertical_it); + for (unsigned int i = 0; i < _border_size.right; ++i) + { + std::memcpy(base_addr + (width + i) * element_size, &_constant_border_value, element_size); + } + }, + vertical_it); // Top and bottom border Iterator plane_it(_tensor, window); // Iterate over all XY planes - execute_window_loop(window, [&](const Coordinates &) - { - uint8_t *base_addr = start_valid_region + plane_it.offset(); - // Top border - for(int i = -_border_size.top; i < 0; ++i) + execute_window_loop( + window, + [&](const Coordinates &) { - // Fill top 
rows including left/right borders - for(unsigned int j = 0; j < (_border_size.left + width + _border_size.right); ++j) + uint8_t *base_addr = start_valid_region + plane_it.offset(); + // Top border + for (int i = -_border_size.top; i < 0; ++i) { - std::memcpy(base_addr + i * stridey + static_cast(j - _border_size.left) * element_size, &_constant_border_value, element_size); + // Fill top rows including left/right borders + for (unsigned int j = 0; j < (_border_size.left + width + _border_size.right); ++j) + { + std::memcpy(base_addr + i * stridey + static_cast(j - _border_size.left) * element_size, + &_constant_border_value, element_size); + } } - } - // Bottom border - const unsigned low_border_size = height + _border_size.bottom; - for(unsigned int i = height; i < low_border_size; ++i) - { - // Fill bottom rows including left/right borders - for(unsigned int j = 0; j < (_border_size.left + width + _border_size.right); ++j) + // Bottom border + const unsigned low_border_size = height + _border_size.bottom; + for (unsigned int i = height; i < low_border_size; ++i) { - std::memcpy(base_addr + i * stridey + static_cast(j - _border_size.left) * element_size, &_constant_border_value, element_size); + // Fill bottom rows including left/right borders + for (unsigned int j = 0; j < (_border_size.left + width + _border_size.right); ++j) + { + std::memcpy(base_addr + i * stridey + static_cast(j - _border_size.left) * element_size, + &_constant_border_value, element_size); + } } - } - }, - plane_it); + }, + plane_it); } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NEFillBorderKernel.h b/src/core/NEON/kernels/NEFillBorderKernel.h index 2c851583ed..aaad108bfa 100644 --- a/src/core/NEON/kernels/NEFillBorderKernel.h +++ b/src/core/NEON/kernels/NEFillBorderKernel.h @@ -26,6 +26,7 @@ #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Types.h" + #include "src/core/NEON/INEKernel.h" namespace arm_compute @@ -64,7 +65,10 @@ public: * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. * */ - void configure(ITensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue()); + void configure(ITensor *tensor, + BorderSize border_size, + BorderMode border_mode, + const PixelValue &constant_border_value = PixelValue()); /** Initialise the function. * * @note This kernel fills the borders within the XY-planes. @@ -75,7 +79,10 @@ public: * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. * */ - void configure(ITensorInfo *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue()); + void configure(ITensorInfo *tensor, + BorderSize border_size, + BorderMode border_mode, + const PixelValue &constant_border_value = PixelValue()); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp index 51a69046a9..cbe5136fb1 100644 --- a/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp +++ b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp @@ -22,7 +22,6 @@ * SOFTWARE. 
*/ #include "src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h" -#include "src/cpu/kernels/fuse_batch_normalization/list.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" @@ -30,12 +29,14 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + #include "src/common/cpuinfo/CpuIsaInfo.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/wrapper/wrapper.h" #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/fuse_batch_normalization/list.h" #include @@ -52,8 +53,16 @@ struct FuseBatchNormalizeSelectorData }; using FBNSelectorPtr = std::add_pointer::type; -using FBNUKernelPtr = std::add_pointer::type; +using FBNUKernelPtr = std::add_pointer::type; struct FBNUKernel { @@ -62,73 +71,63 @@ struct FBNUKernel FBNUKernelPtr ukernel; }; -static const FBNUKernel available_kernels[] = -{ - { - "fused_batch_normalization_conv_NHWC_F16", - [](const FuseBatchNormalizeSelectorData & data) - { - return data.dt == DataType::F16 && data.dl == DataLayout::NHWC && data.isa.fp16 && data.fbn_type == FuseBatchNormalizationType::CONVOLUTION; - }, - REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_conv_f16) - }, - { - "fused_batch_normalization_conv_NCHW_F16", - [](const FuseBatchNormalizeSelectorData & data) - { - return data.dt == DataType::F16 && data.dl == DataLayout::NCHW && data.isa.fp16 && data.fbn_type == FuseBatchNormalizationType::CONVOLUTION; - }, - REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_conv_f16) - }, - { - "fused_batch_normalization_dwc_NHWC_F16", - [](const FuseBatchNormalizeSelectorData & data) - { - return data.dt == DataType::F16 && data.dl == DataLayout::NHWC && data.isa.fp16 && data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION; - }, - REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nhwc_f16) - }, - { - "fused_batch_normalization_dwc_NCHW_F16", - [](const FuseBatchNormalizeSelectorData & data) - { - return data.dt == DataType::F16 && data.dl == DataLayout::NCHW && data.isa.fp16 && data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION; - }, - REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nchw_f16) - }, - { - "fused_batch_normalization_conv_NHWC_F32", - [](const FuseBatchNormalizeSelectorData & data) - { - return data.dt == DataType::F32 && data.dl == DataLayout::NHWC && data.fbn_type == FuseBatchNormalizationType::CONVOLUTION; - }, - REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_conv_f32) - }, - { - "fused_batch_normalization_conv_NCHW_F32", - [](const FuseBatchNormalizeSelectorData & data) - { - return data.dt == DataType::F32 && data.dl == DataLayout::NCHW && data.fbn_type == FuseBatchNormalizationType::CONVOLUTION; - }, - REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_conv_f32) - }, - { - "fused_batch_normalization_dwc_NHWC_F32", - [](const FuseBatchNormalizeSelectorData & data) - { - return data.dt == DataType::F32 && data.dl == DataLayout::NHWC && data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION; - }, - REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nhwc_f32) - }, - { - "fused_batch_normalization_dwc_NCHW_F32", - [](const FuseBatchNormalizeSelectorData & data) - { - return data.dt == DataType::F32 && data.dl == DataLayout::NCHW && 
data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION; - }, - REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nchw_f32) - } -}; +static const FBNUKernel available_kernels[] = { + {"fused_batch_normalization_conv_NHWC_F16", + [](const FuseBatchNormalizeSelectorData &data) + { + return data.dt == DataType::F16 && data.dl == DataLayout::NHWC && data.isa.fp16 && + data.fbn_type == FuseBatchNormalizationType::CONVOLUTION; + }, + REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_conv_f16)}, + {"fused_batch_normalization_conv_NCHW_F16", + [](const FuseBatchNormalizeSelectorData &data) + { + return data.dt == DataType::F16 && data.dl == DataLayout::NCHW && data.isa.fp16 && + data.fbn_type == FuseBatchNormalizationType::CONVOLUTION; + }, + REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_conv_f16)}, + {"fused_batch_normalization_dwc_NHWC_F16", + [](const FuseBatchNormalizeSelectorData &data) + { + return data.dt == DataType::F16 && data.dl == DataLayout::NHWC && data.isa.fp16 && + data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION; + }, + REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nhwc_f16)}, + {"fused_batch_normalization_dwc_NCHW_F16", + [](const FuseBatchNormalizeSelectorData &data) + { + return data.dt == DataType::F16 && data.dl == DataLayout::NCHW && data.isa.fp16 && + data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION; + }, + REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nchw_f16)}, + {"fused_batch_normalization_conv_NHWC_F32", + [](const FuseBatchNormalizeSelectorData &data) + { + return data.dt == DataType::F32 && data.dl == DataLayout::NHWC && + data.fbn_type == FuseBatchNormalizationType::CONVOLUTION; + }, + REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_conv_f32)}, + {"fused_batch_normalization_conv_NCHW_F32", + [](const FuseBatchNormalizeSelectorData &data) + { + return data.dt == DataType::F32 && data.dl == DataLayout::NCHW && + data.fbn_type == FuseBatchNormalizationType::CONVOLUTION; + }, + REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_conv_f32)}, + {"fused_batch_normalization_dwc_NHWC_F32", + [](const FuseBatchNormalizeSelectorData &data) + { + return data.dt == DataType::F32 && data.dl == DataLayout::NHWC && + data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION; + }, + REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nhwc_f32)}, + {"fused_batch_normalization_dwc_NCHW_F32", + [](const FuseBatchNormalizeSelectorData &data) + { + return data.dt == DataType::F32 && data.dl == DataLayout::NCHW && + data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION; + }, + REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nchw_f32)}}; /** Micro-kernel selector * @@ -140,9 +139,9 @@ static const FBNUKernel available_kernels[] = */ const FBNUKernel *get_implementation(const FuseBatchNormalizeSelectorData &data) { - for(const auto &uk : available_kernels) + for (const auto &uk : available_kernels) { - if(uk.is_selected(data)) + if (uk.is_selected(data)) { return &uk; } @@ -150,10 +149,16 @@ const FBNUKernel *get_implementation(const FuseBatchNormalizeSelectorData &data) return nullptr; } -Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var, - const ITensorInfo *fused_weights, const ITensorInfo *fused_bias, - const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma, - float epsilon, 
FuseBatchNormalizationType fbn_type) +Status validate_arguments(const ITensorInfo *input_weights, + const ITensorInfo *bn_mean, + const ITensorInfo *bn_var, + const ITensorInfo *fused_weights, + const ITensorInfo *fused_bias, + const ITensorInfo *input_bias, + const ITensorInfo *bn_beta, + const ITensorInfo *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { ARM_COMPUTE_UNUSED(epsilon); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input_weights, bn_mean, bn_var); @@ -164,43 +169,44 @@ Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *b ARM_COMPUTE_RETURN_ERROR_ON(input_bias == nullptr && fused_bias == nullptr); ARM_COMPUTE_RETURN_ERROR_ON(bn_mean->num_dimensions() > 1); - if(fbn_type == FuseBatchNormalizationType::CONVOLUTION) + if (fbn_type == FuseBatchNormalizationType::CONVOLUTION) { ARM_COMPUTE_RETURN_ERROR_ON(input_weights->dimension(3) != bn_mean->dimension(0)); } else { - const size_t channel_idx = get_data_layout_dimension_index(input_weights->data_layout(), DataLayoutDimension::CHANNEL); + const size_t channel_idx = + get_data_layout_dimension_index(input_weights->data_layout(), DataLayoutDimension::CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON(input_weights->dimension(channel_idx) != bn_mean->dimension(0)); } // Validate bias - if(input_bias != nullptr) + if (input_bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, input_bias); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, input_bias); } // Validate beta - if(bn_beta != nullptr) + if (bn_beta != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, bn_beta); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, bn_beta); } // Validate gamma - if(bn_gamma != nullptr) + if (bn_gamma != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, bn_gamma); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, bn_gamma); } // Validate output weights - if(fused_weights != nullptr && fused_weights->total_size() != 0) + if (fused_weights != nullptr && fused_weights->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_weights, fused_weights); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input_weights, fused_weights); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, fused_weights); } // Validate output bias - if(fused_bias != nullptr && fused_bias->total_size() != 0) + if (fused_bias != nullptr && fused_bias->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, fused_bias); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, fused_bias); @@ -212,15 +218,31 @@ Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *b } // namespace NEFuseBatchNormalizationKernel::NEFuseBatchNormalizationKernel() - : _input_weights(nullptr), _input_bias(nullptr), _bn_mean(nullptr), _bn_var(nullptr), _bn_gamma(nullptr), _bn_beta(nullptr), _fused_weights(nullptr), _fused_bias(nullptr), _epsilon(), - _run_in_place_weights(false), _run_in_place_bias(false), _func(nullptr) + : _input_weights(nullptr), + _input_bias(nullptr), + _bn_mean(nullptr), + _bn_var(nullptr), + _bn_gamma(nullptr), + _bn_beta(nullptr), + _fused_weights(nullptr), + _fused_bias(nullptr), + _epsilon(), + _run_in_place_weights(false), + _run_in_place_bias(false), + _func(nullptr) { } -void NEFuseBatchNormalizationKernel::configure(const ITensor *input_weights, const ITensor *bn_mean, const ITensor *bn_var, - ITensor *fused_weights, ITensor *fused_bias, - const ITensor 
*input_bias, const ITensor *bn_beta, const ITensor *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +void NEFuseBatchNormalizationKernel::configure(const ITensor *input_weights, + const ITensor *bn_mean, + const ITensor *bn_var, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *input_bias, + const ITensor *bn_beta, + const ITensor *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { ARM_COMPUTE_ERROR_ON_NULLPTR(input_weights, bn_mean, bn_var); @@ -238,27 +260,27 @@ void NEFuseBatchNormalizationKernel::configure(const ITensor *input_weights, con _run_in_place_bias = (fused_bias == nullptr) || (input_bias != nullptr && fused_bias == input_bias); // Auto initialize outputs - if(_fused_weights != nullptr) + if (_fused_weights != nullptr) { // Output tensor auto initialization if not yet initialized auto_init_if_empty(*_fused_weights->info(), *_input_weights->info()->clone()); } - if(_fused_bias != nullptr) + if (_fused_bias != nullptr) { // Output tensor auto initialization if not yet initialized auto_init_if_empty(*_fused_bias->info(), *_bn_mean->info()->clone()); } // Validate arguments - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_weights->info(), bn_mean->info(), bn_var->info(), - (fused_weights != nullptr) ? fused_weights->info() : nullptr, - (fused_bias != nullptr) ? fused_bias->info() : nullptr, - (input_bias != nullptr) ? input_bias->info() : nullptr, - (bn_beta != nullptr) ? bn_beta->info() : nullptr, - (bn_gamma != nullptr) ? bn_gamma->info() : nullptr, - epsilon, fbn_type)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments( + input_weights->info(), bn_mean->info(), bn_var->info(), + (fused_weights != nullptr) ? fused_weights->info() : nullptr, + (fused_bias != nullptr) ? fused_bias->info() : nullptr, (input_bias != nullptr) ? input_bias->info() : nullptr, + (bn_beta != nullptr) ? bn_beta->info() : nullptr, (bn_gamma != nullptr) ? 
bn_gamma->info() : nullptr, epsilon, + fbn_type)); - const auto *uk = get_implementation(FuseBatchNormalizeSelectorData{ input_weights->info()->data_type(), input_weights->info()->data_layout(), fbn_type, CPUInfo::get().get_isa() }); + const auto *uk = get_implementation(FuseBatchNormalizeSelectorData{ + input_weights->info()->data_type(), input_weights->info()->data_layout(), fbn_type, CPUInfo::get().get_isa()}); ARM_COMPUTE_ERROR_ON_NULLPTR(uk); ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr); _func = uk->ukernel; @@ -268,12 +290,19 @@ void NEFuseBatchNormalizationKernel::configure(const ITensor *input_weights, con INEKernel::configure(win); } -Status NEFuseBatchNormalizationKernel::validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var, - const ITensorInfo *fused_weights, const ITensorInfo *fused_bias, - const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +Status NEFuseBatchNormalizationKernel::validate(const ITensorInfo *input_weights, + const ITensorInfo *bn_mean, + const ITensorInfo *bn_var, + const ITensorInfo *fused_weights, + const ITensorInfo *fused_bias, + const ITensorInfo *input_bias, + const ITensorInfo *bn_beta, + const ITensorInfo *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_weights, bn_mean, bn_var, fused_weights, fused_bias, + input_bias, bn_beta, bn_gamma, epsilon, fbn_type)); return Status{}; } @@ -284,6 +313,7 @@ void NEFuseBatchNormalizationKernel::run(const Window &window, const ThreadInfo ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); ARM_COMPUTE_ERROR_ON(_func == nullptr); - (*_func)(_input_weights, _input_bias, _fused_weights, _fused_bias, _bn_mean, _bn_var, _bn_beta, _bn_gamma, _epsilon, window); + (*_func)(_input_weights, _input_bias, _fused_weights, _fused_bias, _bn_mean, _bn_var, _bn_beta, _bn_gamma, _epsilon, + window); } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h index ee767b01c8..f23280d55a 100644 --- a/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h +++ b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h @@ -66,9 +66,16 @@ public: * @param[in] epsilon (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f. * @param[in] fbn_type (Optional) Fused batch normalization type. Defaults to CONVOLUTION. 
*/ - void configure(const ITensor *input_weights, const ITensor *bn_mean, const ITensor *bn_var, ITensor *fused_weights, ITensor *fused_bias, - const ITensor *input_bias = nullptr, const ITensor *bn_beta = nullptr, const ITensor *bn_gamma = nullptr, - float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); + void configure(const ITensor *input_weights, + const ITensor *bn_mean, + const ITensor *bn_var, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *input_bias = nullptr, + const ITensor *bn_beta = nullptr, + const ITensor *bn_gamma = nullptr, + float epsilon = 0.001f, + FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); /** Static function to check if given info will lead to a valid configuration of @ref NEFuseBatchNormalizationKernel * * @param[in] input_weights Input weights tensor info for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC @@ -86,10 +93,16 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var, - const ITensorInfo *fused_weights, const ITensorInfo *fused_bias, - const ITensorInfo *input_bias = nullptr, const ITensorInfo *bn_beta = nullptr, const ITensorInfo *bn_gamma = nullptr, - float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); + static Status validate(const ITensorInfo *input_weights, + const ITensorInfo *bn_mean, + const ITensorInfo *bn_var, + const ITensorInfo *fused_weights, + const ITensorInfo *fused_bias, + const ITensorInfo *input_bias = nullptr, + const ITensorInfo *bn_beta = nullptr, + const ITensorInfo *bn_gamma = nullptr, + float epsilon = 0.001f, + FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; @@ -107,8 +120,16 @@ private: bool _run_in_place_weights; bool _run_in_place_bias; - using FuseBatchNormFunction = void(const ITensor *input_weights, const ITensor *input_bias, ITensor *fused_weights, ITensor *fused_bias, - const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window); + using FuseBatchNormFunction = void(const ITensor *input_weights, + const ITensor *input_bias, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *bn_mean, + const ITensor *bn_var, + const ITensor *bn_beta, + const ITensor *bn_gamma, + float epsilon, + const Window &window); FuseBatchNormFunction *_func; }; diff --git a/src/core/NEON/kernels/NEGatherKernel.cpp b/src/core/NEON/kernels/NEGatherKernel.cpp index 11332ffac8..f1d457d399 100644 --- a/src/core/NEON/kernels/NEGatherKernel.cpp +++ b/src/core/NEON/kernels/NEGatherKernel.cpp @@ -27,9 +27,10 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -42,20 +43,22 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *indices, ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, indices, output); ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); - if(axis < 0) + if 
(axis < 0) { axis += input->num_dimensions(); } ARM_COMPUTE_RETURN_ERROR_ON(0 > axis || axis >= static_cast(input->num_dimensions())); - ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() + indices->num_dimensions() - 1 > Coordinates::num_max_dimensions); + ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() + indices->num_dimensions() - 1 > + Coordinates::num_max_dimensions); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); - TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(input->tensor_shape(), indices->tensor_shape(), axis); + TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape( + input->tensor_shape(), indices->tensor_shape(), axis); ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size()); } @@ -81,23 +84,23 @@ void NEGatherKernel::gather_common(const Window &window, const ThreadInfo &info) const auto idx_info = _indices->info(); const auto dst_info = _output->info(); - const auto num_dims = dst_info->num_dimensions(); + const auto num_dims = dst_info->num_dimensions(); const auto chunk_stride = src_info->strides_in_bytes()[_axis]; const auto window_start_x = window.x().start(); - const auto window_end_x = window.x().end(); - auto window_size_x = src_info->element_size(); + const auto window_end_x = window.x().end(); + auto window_size_x = src_info->element_size(); const auto idx_limit = static_cast(src_info->tensor_shape()[_axis]); - if(_axis != 0) + if (_axis != 0) { dst_win.set(0, Window::Dimension(window_start_x, window_start_x + 1, 1)); window_size_x *= window_end_x - window_start_x; } // Compute source and index tensors window based on the output window. - auto src_win = dst_win; + auto src_win = dst_win; Window idx_win; for (size_t i = 0; i < idx_info->num_dimensions(); ++i) @@ -109,22 +112,27 @@ void NEGatherKernel::gather_common(const Window &window, const ThreadInfo &info) // Use the custom strides to access all three tensors using the same loop. 
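// The stride tables prepared in configure() give the source iterator real strides only outside the index
// dimensions and the index iterator strides only inside them, so a single pass over dst_win advances the
// source, index and destination iterators in lock-step; the gathered element is then selected by offsetting
// the source pointer by idx * chunk_stride (the source stride along the gather axis).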
Iterator src_it(num_dims, _src_it_strides, _input->buffer(), src_info->offset_first_element_in_bytes(), src_win); Iterator idx_it(num_dims, _idx_it_strides, _indices->buffer(), idx_info->offset_first_element_in_bytes(), idx_win); - Iterator dst_it(num_dims, dst_info->strides_in_bytes(), _output->buffer(), dst_info->offset_first_element_in_bytes(), dst_win); - - execute_window_loop(dst_win, [&](const Coordinates &) { - const auto idx = *reinterpret_cast(idx_it.ptr()); - - if(idx >= 0 && idx < idx_limit) - { - const auto src_ptr = src_it.ptr() + idx * chunk_stride; + Iterator dst_it(num_dims, dst_info->strides_in_bytes(), _output->buffer(), + dst_info->offset_first_element_in_bytes(), dst_win); - std::copy_n(src_ptr, window_size_x, dst_it.ptr()); - } - else + execute_window_loop( + dst_win, + [&](const Coordinates &) { - std::fill_n(dst_it.ptr(), window_size_x, 0); - } - }, src_it, idx_it, dst_it); + const auto idx = *reinterpret_cast(idx_it.ptr()); + + if (idx >= 0 && idx < idx_limit) + { + const auto src_ptr = src_it.ptr() + idx * chunk_stride; + + std::copy_n(src_ptr, window_size_x, dst_it.ptr()); + } + else + { + std::fill_n(dst_it.ptr(), window_size_x, 0); + } + }, + src_it, idx_it, dst_it); } void NEGatherKernel::configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis) @@ -137,13 +145,13 @@ void NEGatherKernel::configure(const ITensor *input, const ITensor *indices, ITe _output = output; _axis = axis; - if(_axis < 0) + if (_axis < 0) { _axis += input->info()->num_dimensions(); } ARM_COMPUTE_ERROR_ON(0 > _axis || _axis >= static_cast(input->info()->num_dimensions())); - switch(_indices->info()->data_type()) + switch (_indices->info()->data_type()) { case DataType::U32: _func = &NEGatherKernel::gather_common; @@ -157,7 +165,8 @@ void NEGatherKernel::configure(const ITensor *input, const ITensor *indices, ITe } // Output auto initialization if not yet initialized - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(input->info()->tensor_shape(), indices->info()->tensor_shape(), _axis); + const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape( + input->info()->tensor_shape(), indices->info()->tensor_shape(), _axis); auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); // Create window @@ -169,30 +178,31 @@ void NEGatherKernel::configure(const ITensor *input, const ITensor *indices, ITe // These will be used to iterate lock-step through all tensors (input, indices and output). 
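// Dimensions below the gather axis take the input strides, the next indices->num_dimensions() dimensions
// take the index strides, and the remaining dimensions map back to the input strides shifted past the
// gathered axis, matching the gather output shape (the input shape with the axis dimension replaced by the
// index dimensions).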
size_t dim_no = 0; - const auto input_info = input->info(); + const auto input_info = input->info(); const auto &input_strides = input_info->strides_in_bytes(); - const auto indices_info = indices->info(); - const auto &indices_strides = indices_info->strides_in_bytes(); - const auto indices_num_dims = indices_info->num_dimensions(); + const auto indices_info = indices->info(); + const auto &indices_strides = indices_info->strides_in_bytes(); + const auto indices_num_dims = indices_info->num_dimensions(); - for(; dim_no < static_cast(_axis); ++dim_no) + for (; dim_no < static_cast(_axis); ++dim_no) { _src_it_strides[dim_no] = input_strides[dim_no]; } - for(; dim_no < static_cast(_axis) + indices_num_dims; ++dim_no) + for (; dim_no < static_cast(_axis) + indices_num_dims; ++dim_no) { _idx_it_strides[dim_no] = indices_strides[dim_no - _axis]; } - for(; dim_no < Coordinates::num_max_dimensions; ++dim_no) + for (; dim_no < Coordinates::num_max_dimensions; ++dim_no) { _src_it_strides[dim_no] = input_strides[dim_no - indices_num_dims + 1]; } } -Status NEGatherKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis) +Status +NEGatherKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, indices, output, axis)); return Status{}; diff --git a/src/core/NEON/kernels/NEGatherKernel.h b/src/core/NEON/kernels/NEGatherKernel.h index ce69daeda7..b8c069f99e 100644 --- a/src/core/NEON/kernels/NEGatherKernel.h +++ b/src/core/NEON/kernels/NEGatherKernel.h @@ -26,6 +26,7 @@ #define ARM_COMPUTE_NEGATHERKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/NEON/INEKernel.h" namespace arm_compute @@ -92,8 +93,8 @@ private: ITensor *_output; kernel_ptr _func; - Strides _src_it_strides; - Strides _idx_it_strides; + Strides _src_it_strides; + Strides _idx_it_strides; }; } // namespace arm_compute #endif /* ARM_COMPUTE_NEGATHERKERNEL_H */ diff --git a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp index 7bba136e84..549319e49f 100644 --- a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp +++ b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp @@ -27,11 +27,13 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Window.h" -#include "src/core/CPP/Validate.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/cpu/kernels/genproposals/list.h" + #include namespace arm_compute @@ -44,7 +46,8 @@ struct ComputeAllAnchorsData }; using ComputeAllAnchorsSelectorPtr = std::add_pointer::type; -using ComputeAllAnchorsUKernelPtr = std::add_pointer::type; +using ComputeAllAnchorsUKernelPtr = std::add_pointer::type; struct ComputeAllAnchorsKernel { @@ -53,27 +56,17 @@ struct ComputeAllAnchorsKernel ComputeAllAnchorsUKernelPtr ukernel; }; -static const ComputeAllAnchorsKernel available_kernels[] = -{ +static const ComputeAllAnchorsKernel available_kernels[] = { #if defined(ARM_COMPUTE_ENABLE_NEON) - { - "neon_qu16_computeallanchors", - [](const ComputeAllAnchorsData & data) { return data.dt == DataType::QSYMM16; }, - REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qu16_computeallanchors) - }, + {"neon_qu16_computeallanchors", [](const ComputeAllAnchorsData &data) { return data.dt == DataType::QSYMM16; }, 
+ REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qu16_computeallanchors)}, #endif //defined(ARM_COMPUTE_ENABLE_NEON) #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - { - "neon_fp16_computeallanchors", - [](const ComputeAllAnchorsData & data) { return data.dt == DataType::F16; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_computeallanchors) - }, + {"neon_fp16_computeallanchors", [](const ComputeAllAnchorsData &data) { return data.dt == DataType::F16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_computeallanchors)}, #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - { - "neon_fp32_computeallanchors", - [](const ComputeAllAnchorsData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_computeallanchors) - }, + {"neon_fp32_computeallanchors", [](const ComputeAllAnchorsData &data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_computeallanchors)}, }; /** Micro-kernel selector @@ -84,9 +77,9 @@ static const ComputeAllAnchorsKernel available_kernels[] = */ const ComputeAllAnchorsKernel *get_implementation(const ComputeAllAnchorsData &data) { - for(const auto &uk : available_kernels) + for (const auto &uk : available_kernels) { - if(uk.is_selected(data)) + if (uk.is_selected(data)) { return &uk; } @@ -101,7 +94,7 @@ Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anc ARM_COMPUTE_RETURN_ERROR_ON(anchors->dimension(0) != info.values_per_roi()); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(anchors, DataType::QSYMM16, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(anchors->num_dimensions() > 2); - if(all_anchors->total_size() > 0) + if (all_anchors->total_size() > 0) { const size_t feature_height = info.feat_height(); const size_t feature_width = info.feat_width(); @@ -111,7 +104,7 @@ Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anc ARM_COMPUTE_RETURN_ERROR_ON(all_anchors->dimension(0) != info.values_per_roi()); ARM_COMPUTE_RETURN_ERROR_ON(all_anchors->dimension(1) != feature_height * feature_width * num_anchors); - if(is_data_type_quantized(anchors->data_type())) + if (is_data_type_quantized(anchors->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(anchors, all_anchors); } @@ -139,7 +132,8 @@ void NEComputeAllAnchorsKernel::configure(const ITensor *anchors, ITensor *all_a // Initialize the output if empty const TensorShape output_shape(info.values_per_roi(), width * height * num_anchors); - auto_init_if_empty(*all_anchors->info(), TensorInfo(output_shape, 1, data_type, anchors->info()->quantization_info())); + auto_init_if_empty(*all_anchors->info(), + TensorInfo(output_shape, 1, data_type, anchors->info()->quantization_info())); // Set instance variables _anchors = anchors; @@ -151,7 +145,9 @@ void NEComputeAllAnchorsKernel::configure(const ITensor *anchors, ITensor *all_a INEKernel::configure(win); } -Status NEComputeAllAnchorsKernel::validate(const ITensorInfo *anchors, const ITensorInfo *all_anchors, const ComputeAnchorsInfo &info) +Status NEComputeAllAnchorsKernel::validate(const ITensorInfo *anchors, + const ITensorInfo *all_anchors, + const ComputeAnchorsInfo &info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(anchors, all_anchors, info)); return Status{}; @@ -163,7 +159,7 @@ void NEComputeAllAnchorsKernel::run(const Window &window, const ThreadInfo &info ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - const auto *uk = 
get_implementation(ComputeAllAnchorsData{ _anchors->info()->data_type() }); + const auto *uk = get_implementation(ComputeAllAnchorsData{_anchors->info()->data_type()}); ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); uk->ukernel(_anchors, _all_anchors, _anchors_info, window); diff --git a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h index 297d6d4abe..30699eee01 100644 --- a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h +++ b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h @@ -78,5 +78,5 @@ private: ITensor *_all_anchors; ComputeAnchorsInfo _anchors_info; }; -} // arm_compute +} // namespace arm_compute #endif // ARM_COMPUTE_NEGENERATEPROPOSALSLAYERKERNEL_H diff --git a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp index 71641404bf..0a1780f6ee 100644 --- a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp +++ b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp @@ -31,12 +31,13 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" #include "src/cpu/kernels/instancenorm/list.h" #include @@ -51,7 +52,13 @@ struct InstanceNormSelectorData }; using InstanceNormSelctorPtr = std::add_pointer::type; -using InstanceNormUKernelPtr = std::add_pointer::type; +using InstanceNormUKernelPtr = std::add_pointer::type; struct InstanceNormKernel { @@ -60,19 +67,12 @@ struct InstanceNormKernel InstanceNormUKernelPtr ukernel; }; -static const InstanceNormKernel available_kernels[] = -{ - { - "fp32_neon_instancenorm", - [](const InstanceNormSelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_instancenorm) - }, +static const InstanceNormKernel available_kernels[] = { + {"fp32_neon_instancenorm", [](const InstanceNormSelectorData &data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_instancenorm)}, #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - { - "fp16_neon_instancenorm", - [](const InstanceNormSelectorData & data) { return data.dt == DataType::F16; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_instancenorm) - }, + {"fp16_neon_instancenorm", [](const InstanceNormSelectorData &data) { return data.dt == DataType::F16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_instancenorm)}, #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC }; @@ -84,9 +84,9 @@ static const InstanceNormKernel available_kernels[] = */ const InstanceNormKernel *get_implementation(const InstanceNormSelectorData &data) { - for(const auto &uk : available_kernels) + for (const auto &uk : available_kernels) { - if(uk.is_selected(data)) + if (uk.is_selected(data)) { return &uk; } @@ -102,14 +102,16 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, f ARM_COMPUTE_RETURN_ERROR_ON_MSG(epsilon == 0.f, "Epsilon must be different than 0"); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() == DataLayout::NHWC, "NHWC data layout 
is not supported by the kernel directly"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() == DataLayout::NHWC, + "NHWC data layout is not supported by the kernel directly"); - if(output != nullptr && output->total_size() != 0) + if (output != nullptr && output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(), "Input and output have different number of channels"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(), + "Input and output have different number of channels"); } return Status{}; } @@ -132,7 +134,9 @@ NEInstanceNormalizationLayerKernel::NEInstanceNormalizationLayerKernel() { } -void NEInstanceNormalizationLayerKernel::configure(ITensor *input, ITensor *output, const InstanceNormalizationLayerKernelInfo &info) +void NEInstanceNormalizationLayerKernel::configure(ITensor *input, + ITensor *output, + const InstanceNormalizationLayerKernelInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); @@ -152,10 +156,13 @@ void NEInstanceNormalizationLayerKernel::configure(ITensor *input, ITensor *outp INEKernel::configure(std::get<1>(win_config)); } -Status NEInstanceNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info) +Status NEInstanceNormalizationLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const InstanceNormalizationLayerKernelInfo &info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, info.gamma, info.beta, info.epsilon)); - ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get())))); + ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window( + input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get())))); return Status{}; } @@ -165,7 +172,7 @@ void NEInstanceNormalizationLayerKernel::run(const Window &window, const ThreadI ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - const auto *uk = get_implementation(InstanceNormSelectorData{ _input->info()->data_type() }); + const auto *uk = get_implementation(InstanceNormSelectorData{_input->info()->data_type()}); ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); uk->ukernel(_input, _output, _gamma, _beta, _epsilon, _use_mixed_precision, window); diff --git a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h index f166ce2058..024ccd9ef2 100644 --- a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h +++ b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h @@ -68,7 +68,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info); + static Status + validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; @@ -82,14 +83,15 @@ private: * @param[in] beta The offset scalar value applied to the normalized tensor. 
Defaults to 0.0 * @param[in] epsilon Lower bound value for the normalization. Defaults to 1e-12 */ - using NormalizationFunction = void(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window); + using NormalizationFunction = + void(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window); ITensor *_input; ITensor *_output; float _gamma; float _beta; float _epsilon; - bool _use_mixed_precision{ true }; + bool _use_mixed_precision{true}; }; } // namespace arm_compute #endif /*ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNEL_H */ diff --git a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp index 8ab0288ab1..eea57a17d3 100644 --- a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp +++ b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp @@ -30,11 +30,12 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + #include "src/common/cpuinfo/CpuIsaInfo.h" -#include "src/core/NEON/NEMath.h" #include "src/core/common/Registrars.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEMath.h" #include "src/cpu/kernels/l2normlayer/list.h" #include @@ -55,7 +56,8 @@ struct L2NormalizeLayerSelectorData using L2NormalizeLayerKernelSelctorPtr = std::add_pointer::type; -using L2NormalizeLayerPtr = std::add_pointer::type; +using L2NormalizeLayerPtr = std::add_pointer::type; struct L2NormalizeLayerKernel { @@ -64,26 +66,25 @@ struct L2NormalizeLayerKernel L2NormalizeLayerPtr ukernel; }; -static const L2NormalizeLayerKernel available_kernels[] = -{ - { - "fp32_neon_l2normalize_x", - [](const L2NormalizeLayerSelectorData & data) { return data.dt == DataType::F32 && data.actual_axis == Window::DimX; }, - REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_l2_normalize_x) - }, - { - "fp32_neon_l2normalize_yz", - [](const L2NormalizeLayerSelectorData & data) { return data.dt == DataType::F32 && data.actual_axis != Window::DimX; }, - REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_l2_normalize_yz) - }, +static const L2NormalizeLayerKernel available_kernels[] = { + {"fp32_neon_l2normalize_x", + [](const L2NormalizeLayerSelectorData &data) + { return data.dt == DataType::F32 && data.actual_axis == Window::DimX; }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_l2_normalize_x)}, + {"fp32_neon_l2normalize_yz", + [](const L2NormalizeLayerSelectorData &data) + { return data.dt == DataType::F32 && data.actual_axis != Window::DimX; }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_l2_normalize_yz)}, { "fp16_neon_l2normalize_x", - [](const L2NormalizeLayerSelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16 && data.actual_axis == Window::DimX; }, + [](const L2NormalizeLayerSelectorData &data) + { return data.dt == DataType::F16 && data.isa.fp16 && data.actual_axis == Window::DimX; }, REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_l2_normalize_x), }, { "fp16_neon_l2normalize_yz", - [](const L2NormalizeLayerSelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16 && data.actual_axis != Window::DimX; }, + [](const L2NormalizeLayerSelectorData &data) + { return data.dt == DataType::F16 && data.isa.fp16 && data.actual_axis != Window::DimX; }, REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_l2_normalize_yz), }, }; @@ -96,9 +97,9 @@ static const L2NormalizeLayerKernel available_kernels[] = */ const L2NormalizeLayerKernel *get_implementation(const 
L2NormalizeLayerSelectorData &data) { - for(const auto &uk : available_kernels) + for (const auto &uk : available_kernels) { - if(uk.is_selected(data)) + if (uk.is_selected(data)) { return &uk; } @@ -106,7 +107,8 @@ const L2NormalizeLayerKernel *get_implementation(const L2NormalizeLayerSelectorD return nullptr; } -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon) +Status +validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon) { ARM_COMPUTE_UNUSED(epsilon); @@ -115,14 +117,15 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, cons ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, sum); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis > 2, "Actual axis greater than 2 is not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis >= TensorShape::num_max_dimensions, "Actual normalization axis greater than max number of dimensions"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis >= TensorShape::num_max_dimensions, + "Actual normalization axis greater than max number of dimensions"); // Reduce shape on axis TensorShape sum_shape = input->tensor_shape(); sum_shape.set(actual_axis, 1); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(sum->tensor_shape(), sum_shape); - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -151,7 +154,8 @@ NEL2NormalizeLayerKernel::NEL2NormalizeLayerKernel() { } -void NEL2NormalizeLayerKernel::configure(const ITensor *input, const ITensor *sum, ITensor *output, int axis, float epsilon) +void NEL2NormalizeLayerKernel::configure( + const ITensor *input, const ITensor *sum, ITensor *output, int axis, float epsilon) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, sum, output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), sum->info(), output->info(), axis, epsilon)); @@ -169,10 +173,12 @@ void NEL2NormalizeLayerKernel::configure(const ITensor *input, const ITensor *su INEKernel::configure(std::get<1>(win_config)); } -Status NEL2NormalizeLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon) +Status NEL2NormalizeLayerKernel::validate( + const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, sum, output, axis, epsilon)); - ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get()))); + ARM_COMPUTE_RETURN_ON_ERROR( + std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get()))); return Status{}; } @@ -183,12 +189,13 @@ void NEL2NormalizeLayerKernel::run(const Window &window, const ThreadInfo &info) ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - if(_actual_axis > 2) + if (_actual_axis > 2) { ARM_COMPUTE_ERROR("Unsupported normalization axis"); } - const auto *uk = get_implementation(L2NormalizeLayerSelectorData{ _output->info()->data_type(), _actual_axis, CPUInfo::get().get_isa() }); + const auto *uk = get_implementation( + L2NormalizeLayerSelectorData{_output->info()->data_type(), _actual_axis, CPUInfo::get().get_isa()}); 
ARM_COMPUTE_ERROR_ON(uk == nullptr); ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr); diff --git a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.h b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.h index af3ad3403e..3524e66a21 100644 --- a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.h +++ b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.h @@ -74,7 +74,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon); + static Status + validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/src/core/NEON/kernels/NELogicalKernel.cpp b/src/core/NEON/kernels/NELogicalKernel.cpp index 6939e08ef0..6be6284528 100644 --- a/src/core/NEON/kernels/NELogicalKernel.cpp +++ b/src/core/NEON/kernels/NELogicalKernel.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Validate.h" + #include "src/common/utils/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -50,7 +51,7 @@ void neon_logical_and(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, ui ARM_COMPUTE_ASSERT_NOT_NULLPTR(src1); ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst); - for(; len >= step; len -= step) + for (; len >= step; len -= step) { vst1q_u8(dst, vandq_u8(vminq_u8(vld1q_u8(src0), c1_x16), vminq_u8(vld1q_u8(src1), c1_x16))); src0 += step; @@ -58,7 +59,7 @@ void neon_logical_and(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, ui dst += step; } - for(; len >= half_step; len -= half_step) + for (; len >= half_step; len -= half_step) { vst1_u8(dst, vand_u8(vmin_u8(vld1_u8(src0), c1_x8), vmin_u8(vld1_u8(src1), c1_x8))); src0 += half_step; @@ -66,7 +67,7 @@ void neon_logical_and(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, ui dst += half_step; } - for(; len > 0; --len) + for (; len > 0; --len) { *dst = (*src0) && (*src1); ++src0; @@ -84,21 +85,21 @@ void neon_logical_and_broadcast(const uint8_t *src, uint8_t broadcast_val, uint8 const auto broadcast_val_clamped_x16 = vdupq_n_u8(broadcast_val_clamped_s); const auto broadcast_val_clamped_x8 = vdup_n_u8(broadcast_val_clamped_s); - for(; len >= step; len -= step) + for (; len >= step; len -= step) { vst1q_u8(dst, vandq_u8(vminq_u8(vld1q_u8(src), c1_x16), broadcast_val_clamped_x16)); src += step; dst += step; } - for(; len >= half_step; len -= half_step) + for (; len >= half_step; len -= half_step) { vst1_u8(dst, vand_u8(vmin_u8(vld1_u8(src), c1_x8), broadcast_val_clamped_x8)); src += half_step; dst += half_step; } - for(; len > 0; --len) + for (; len > 0; --len) { *dst = (*src) && broadcast_val_clamped_s; ++src; @@ -112,7 +113,7 @@ void neon_logical_or(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, uin ARM_COMPUTE_ASSERT_NOT_NULLPTR(src1); ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst); - for(; len >= step; len -= step) + for (; len >= step; len -= step) { vst1q_u8(dst, vorrq_u8(vminq_u8(vld1q_u8(src0), c1_x16), vminq_u8(vld1q_u8(src1), c1_x16))); src0 += step; @@ -120,7 +121,7 @@ void neon_logical_or(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, uin dst += step; } - for(; len >= half_step; len -= half_step) + for (; len >= half_step; len -= half_step) { vst1_u8(dst, vorr_u8(vmin_u8(vld1_u8(src0), c1_x8), vmin_u8(vld1_u8(src1), c1_x8))); src0 += half_step; @@ -128,7 +129,7 @@ void neon_logical_or(const uint8_t *src0, const 
uint8_t *src1, uint8_t *dst, uin dst += half_step; } - for(; len > 0; --len) + for (; len > 0; --len) { *dst = (*src0) || (*src1); ++src0; @@ -146,21 +147,21 @@ void neon_logical_or_broadcast(const uint8_t *src, uint8_t broadcast_val, uint8_ const auto broadcast_val_clamped_x16 = vdupq_n_u8(broadcast_val_clamped_s); const auto broadcast_val_clamped_x8 = vdup_n_u8(broadcast_val_clamped_s); - for(; len >= step; len -= step) + for (; len >= step; len -= step) { vst1q_u8(dst, vorrq_u8(vminq_u8(vld1q_u8(src), c1_x16), broadcast_val_clamped_x16)); src += step; dst += step; } - for(; len >= half_step; len -= half_step) + for (; len >= half_step; len -= half_step) { vst1_u8(dst, vorr_u8(vmin_u8(vld1_u8(src), c1_x8), broadcast_val_clamped_x8)); src += half_step; dst += half_step; } - for(; len > 0; --len) + for (; len > 0; --len) { *dst = (*src) || broadcast_val_clamped_s; ++src; @@ -173,21 +174,21 @@ void neon_logical_not(const uint8_t *src, uint8_t *dst, uint32_t len) ARM_COMPUTE_ASSERT_NOT_NULLPTR(src); ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst); - for(; len >= step; len -= step) + for (; len >= step; len -= step) { vst1q_u8(dst, vbslq_u8(vceqq_u8(vld1q_u8(src), c0_x16), c1_x16, c0_x16)); src += step; dst += step; } - for(; len >= half_step; len -= half_step) + for (; len >= half_step; len -= half_step) { vst1_u8(dst, vbsl_u8(vceq_u8(vld1_u8(src), c0_x8), c1_x8, c0_x8)); src += half_step; dst += half_step; } - for(; len > 0; --len) + for (; len > 0; --len) { *dst = !(*src); ++src; @@ -197,18 +198,15 @@ void neon_logical_not(const uint8_t *src, uint8_t *dst, uint32_t len) void run_unary(const Window &window, const ITensor *src, ITensor *dst) { - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); const auto len = window.x().end() - window.x().start(); Iterator in(src, win); Iterator out(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - neon_logical_not(in.ptr(), out.ptr(), len); - }, - in, out); + execute_window_loop( + win, [&](const Coordinates &) { neon_logical_not(in.ptr(), out.ptr(), len); }, in, out); } void run_binary(const Window &window, const ITensor *src0, const ITensor *src1, ITensor *dst, LogicalOperation op) @@ -216,16 +214,17 @@ void run_binary(const Window &window, const ITensor *src0, const ITensor *src1, Window src0_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); Window src1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - Window win{ window }; + Window win{window}; win.set(Window::DimX, Window::Dimension(0, 1, 1)); const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); const auto len = window.x().end() - window.x().start(); - if(is_broadcast_across_x) + if (is_broadcast_across_x) { - using LogicalBroadcastUKernelPtr = std::add_pointer::type; - LogicalBroadcastUKernelPtr logical_func = op == LogicalOperation::Or ? &neon_logical_or_broadcast : &neon_logical_and_broadcast; + using LogicalBroadcastUKernelPtr = std::add_pointer::type; + LogicalBroadcastUKernelPtr logical_func = + op == LogicalOperation::Or ? &neon_logical_or_broadcast : &neon_logical_and_broadcast; const bool is_broadcast_input_1 = src1_win.x().step() == 0; Window broadcast_win = is_broadcast_input_1 ? 
src1_win : src0_win; @@ -238,17 +237,18 @@ void run_binary(const Window &window, const ITensor *src0, const ITensor *src1, Iterator non_broadcast_in(non_broadcast_tensor, non_broadcast_win); Iterator out(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - const uint8_t broadcast_value = *broadcast_in.ptr(); - logical_func(non_broadcast_in.ptr(), broadcast_value, out.ptr(), len); - - }, - broadcast_in, non_broadcast_in, out); + execute_window_loop( + win, + [&](const Coordinates &) + { + const uint8_t broadcast_value = *broadcast_in.ptr(); + logical_func(non_broadcast_in.ptr(), broadcast_value, out.ptr(), len); + }, + broadcast_in, non_broadcast_in, out); } else { - using LogicalUKernelPtr = std::add_pointer::type; + using LogicalUKernelPtr = std::add_pointer::type; LogicalUKernelPtr logical_func = op == LogicalOperation::Or ? &neon_logical_or : &neon_logical_and; src0_win.set(Window::DimX, Window::Dimension(0, 1, 1)); @@ -257,11 +257,8 @@ void run_binary(const Window &window, const ITensor *src0, const ITensor *src1, Iterator in0(src0, src0_win); Iterator in1(src1, src1_win); Iterator out(dst, win); - execute_window_loop(win, [&](const Coordinates &) - { - logical_func(in0.ptr(), in1.ptr(), out.ptr(), len); - }, - in0, in1, out); + execute_window_loop( + win, [&](const Coordinates &) { logical_func(in0.ptr(), in1.ptr(), out.ptr(), len); }, in0, in1, out); } } } // namespace @@ -270,7 +267,10 @@ const char *NELogicalKernel::name() const return "NELogicalKernel"; } -void NELogicalKernel::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, LogicalOperation op) +void NELogicalKernel::configure(const ITensorInfo *input1, + const ITensorInfo *input2, + ITensorInfo *output, + LogicalOperation op) { ARM_COMPUTE_ERROR_ON_NULLPTR(input1, output); ARM_COMPUTE_ERROR_THROW_ON(validate(input1, input2, output, op)); @@ -279,7 +279,7 @@ void NELogicalKernel::configure(const ITensorInfo *input1, const ITensorInfo *in Window win = calculate_max_window(*input1, Steps()); TensorShape out_shape = input1->tensor_shape(); - if(op != LogicalOperation::Not) + if (op != LogicalOperation::Not) { ARM_COMPUTE_ERROR_ON_NULLPTR(input2); out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape()); @@ -292,13 +292,16 @@ void NELogicalKernel::configure(const ITensorInfo *input1, const ITensorInfo *in set_data_type_if_unknown(*output, input1->data_type()); } -Status NELogicalKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, LogicalOperation op) +Status NELogicalKernel::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + LogicalOperation op) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8); ARM_COMPUTE_RETURN_ERROR_ON(op == LogicalOperation::Unknown); TensorShape out_shape = input1->tensor_shape(); - if(op != LogicalOperation::Not) + if (op != LogicalOperation::Not) { out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape()); ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); @@ -306,7 +309,7 @@ Status NELogicalKernel::validate(const ITensorInfo *input1, const ITensorInfo *i } // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0)); 
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output); @@ -326,7 +329,7 @@ void NELogicalKernel::run_op(ITensorPack &tensors, const Window &window, const T const ITensor *src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1); ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); - if(_op == LogicalOperation::Not) + if (_op == LogicalOperation::Not) { run_unary(window, src0, dst); } diff --git a/src/core/NEON/kernels/NELogicalKernel.h b/src/core/NEON/kernels/NELogicalKernel.h index caf69cf45d..477a59d826 100644 --- a/src/core/NEON/kernels/NELogicalKernel.h +++ b/src/core/NEON/kernels/NELogicalKernel.h @@ -58,10 +58,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, LogicalOperation op); + static Status + validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, LogicalOperation op); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; + void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; const char *name() const override; private: diff --git a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp index 37e88a8565..451031d696 100644 --- a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp +++ b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp @@ -28,12 +28,13 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Window.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" #include "src/cpu/kernels/meanstddevnorm/list.h" namespace arm_compute @@ -46,7 +47,8 @@ struct MeanStdDevNormSelectorData }; using MeanStdDevNormSelctorPtr = std::add_pointer::type; -using MeanStdDevNormUKernelPtr = std::add_pointer::type; +using MeanStdDevNormUKernelPtr = + std::add_pointer::type; struct MeanStdDevNormKernel { @@ -55,25 +57,15 @@ struct MeanStdDevNormKernel MeanStdDevNormUKernelPtr ukernel; }; -static const std::vector available_kernels = -{ - { - "fp32_neon_meanstddevnorm", - [](const MeanStdDevNormSelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_meanstddevnorm) - }, +static const std::vector available_kernels = { + {"fp32_neon_meanstddevnorm", [](const MeanStdDevNormSelectorData &data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_meanstddevnorm)}, #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - { - "fp16_neon_meanstddevnorm", - [](const MeanStdDevNormSelectorData & data) { return data.dt == DataType::F16; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_meanstddevnorm) - }, + {"fp16_neon_meanstddevnorm", [](const MeanStdDevNormSelectorData &data) { return data.dt == DataType::F16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_meanstddevnorm)}, #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - { - "qasymm8_neon_meanstddevnorm", - [](const MeanStdDevNormSelectorData & data) { return data.dt == DataType::QASYMM8; }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_meanstddevnorm) - }, + 
{"qasymm8_neon_meanstddevnorm", [](const MeanStdDevNormSelectorData &data) { return data.dt == DataType::QASYMM8; }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_meanstddevnorm)}, }; /** Micro-kernel selector @@ -84,9 +76,9 @@ static const std::vector available_kernels = */ const MeanStdDevNormKernel *get_implementation(const MeanStdDevNormSelectorData &data) { - for(const auto &uk : available_kernels) + for (const auto &uk : available_kernels) { - if(uk.is_selected(data)) + if (uk.is_selected(data)) { return &uk; } @@ -103,7 +95,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, f ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, DataType::QASYMM8); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -113,7 +105,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, f std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) { - if(output != nullptr) + if (output != nullptr) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); // Output auto inizialitation if not yet initialized @@ -128,8 +120,7 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen } } // namespace -NEMeanStdDevNormalizationKernel::NEMeanStdDevNormalizationKernel() - : _input(nullptr), _output(nullptr), _epsilon(1e-8f) +NEMeanStdDevNormalizationKernel::NEMeanStdDevNormalizationKernel() : _input(nullptr), _output(nullptr), _epsilon(1e-8f) { } @@ -137,7 +128,8 @@ void NEMeanStdDevNormalizationKernel::configure(ITensor *input, ITensor *output, { ARM_COMPUTE_ERROR_ON_NULLPTR(input); - ARM_COMPUTE_ERROR_THROW_ON(NEMeanStdDevNormalizationKernel::validate(input->info(), (output != nullptr) ? output->info() : nullptr, epsilon)); + ARM_COMPUTE_ERROR_THROW_ON(NEMeanStdDevNormalizationKernel::validate( + input->info(), (output != nullptr) ? output->info() : nullptr, epsilon)); _input = input; _output = (output == nullptr) ? input : output; @@ -152,7 +144,9 @@ void NEMeanStdDevNormalizationKernel::configure(ITensor *input, ITensor *output, Status NEMeanStdDevNormalizationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, float epsilon) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, epsilon)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (output != nullptr) ? output->clone().get() : nullptr).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(input->clone().get(), (output != nullptr) ? 
output->clone().get() : nullptr) + .first); return Status{}; } @@ -162,7 +156,7 @@ void NEMeanStdDevNormalizationKernel::run(const Window &window, const ThreadInfo ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - const auto *uk = get_implementation(MeanStdDevNormSelectorData{ _output->info()->data_type() }); + const auto *uk = get_implementation(MeanStdDevNormSelectorData{_output->info()->data_type()}); ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); uk->ukernel(_input, _output, _epsilon, window); diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp index 49a045382d..2c61bda147 100644 --- a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp +++ b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp @@ -29,19 +29,23 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + #include "src/core/CPP/Validate.h" -#include "src/core/NEON/NEFixedPoint.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/wrapper/wrapper.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/NormalizationHelpers.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *input_squared, const ITensorInfo *output, const NormalizationLayerInfo &norm_info) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *input_squared, + const ITensorInfo *output, + const NormalizationLayerInfo &norm_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_squared, output); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); @@ -52,7 +56,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *input_squ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd"); // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); @@ -69,7 +73,10 @@ NENormalizationLayerKernel::NENormalizationLayerKernel() { } -void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor *input_squared, ITensor *output, NormalizationLayerInfo norm_info) +void NENormalizationLayerKernel::configure(const ITensor *input, + const ITensor *input_squared, + ITensor *output, + NormalizationLayerInfo norm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_squared, output); // Output tensor auto initialization if not yet initialized @@ -85,15 +92,15 @@ void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor * _output = output; _norm_info = norm_info; - switch(_input->info()->data_type()) + switch (_input->info()->data_type()) { case DataType::F32: { - switch(norm_idx) + switch (norm_idx) { case 0: { - if(norm_info.type() == NormType::IN_MAP_2D) + if (norm_info.type() == NormType::IN_MAP_2D) { _func = &NENormalizationLayerKernel::normalize_float; } @@ -104,7 +111,7 @@ void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor * break; } case 1: - if(norm_info.type() == NormType::IN_MAP_2D) + if (norm_info.type() == NormType::IN_MAP_2D) { _func = &NENormalizationLayerKernel::normalize_float; } @@ 
-124,11 +131,11 @@ void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor * #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: { - switch(norm_idx) + switch (norm_idx) { case 0: { - if(norm_info.type() == NormType::IN_MAP_2D) + if (norm_info.type() == NormType::IN_MAP_2D) { _func = &NENormalizationLayerKernel::normalize_float; } @@ -139,7 +146,7 @@ void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor * break; } case 1: - if(norm_info.type() == NormType::IN_MAP_2D) + if (norm_info.type() == NormType::IN_MAP_2D) { _func = &NENormalizationLayerKernel::normalize_float; } @@ -196,8 +203,9 @@ void NENormalizationLayerKernel::normalize_float(const Window &window) const auto beta_vec = wrapper::vdup_n(static_cast(_norm_info.beta()), ExactTagType{}); const auto kappa_vec = wrapper::vdup_n(static_cast(_norm_info.kappa()), ExactTagType{}); - auto sequential_normalization = [&](const int x, const Coordinates & id, const int current_row, const int first_row, const int last_row, const T * input_ptr, const uint8_t *input_squared_start_ptr, - T * output_ptr) + auto sequential_normalization = [&](const int x, const Coordinates &id, const int current_row, const int first_row, + const int last_row, const T *input_ptr, const uint8_t *input_squared_start_ptr, + T *output_ptr) { const int current_slice = dim == 0 ? x : id[dim]; const int first_slice = std::max(current_slice - radius, 0); @@ -206,75 +214,87 @@ void NENormalizationLayerKernel::normalize_float(const Window &window) const uint8_t *const input_squared_x_ptr = input_squared_start_ptr + x * input_squared_stride_x; // Accumulate 2D In-Map values auto accu = static_cast(0.f); - for(int j = first_row; j <= last_row; ++j) + for (int j = first_row; j <= last_row; ++j) { // Compute row displacement const uint8_t *const input_squared_ptr = input_squared_x_ptr + (j - current_row) * input_squared_stride_row; - for(int i = first_slice; i <= last_slice; ++i) + for (int i = first_slice; i <= last_slice; ++i) { - accu += *reinterpret_cast(input_squared_ptr + (i - current_slice) * input_squared_stride_slice); + accu += + *reinterpret_cast(input_squared_ptr + (i - current_slice) * input_squared_stride_slice); } } // Normalize - const auto normalized = std::pow(accu * static_cast(_norm_info.scale_coeff()) + static_cast(_norm_info.kappa()), _norm_info.beta()); + const auto normalized = std::pow( + accu * static_cast(_norm_info.scale_coeff()) + static_cast(_norm_info.kappa()), _norm_info.beta()); const auto normalized_pixel = (*(input_ptr + x)) / normalized; *(output_ptr + x) = normalized_pixel; }; - execute_window_loop(win, [&](const Coordinates & id) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - auto output_ptr = reinterpret_cast(output.ptr()); + execute_window_loop( + win, + [&](const Coordinates &id) + { + const auto input_ptr = reinterpret_cast(input.ptr()); + auto output_ptr = reinterpret_cast(output.ptr()); - // Get range to normalize - const int current_row = do_2D_norm ? id[dim_y] : 0; - const int first_row = do_2D_norm ? std::max(current_row - radius, 0) : 0; - const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0; + // Get range to normalize + const int current_row = do_2D_norm ? id[dim_y] : 0; + const int first_row = do_2D_norm ? std::max(current_row - radius, 0) : 0; + const int last_row = do_2D_norm ? 
std::min(current_row + radius, max_bottom) : 0; - int x = window_start_x; - // Compute serially starting elements for the case x dimension is width - for(; x < radius && x < window_end_x && dim == 0; ++x) - { - sequential_normalization(x, id, current_row, first_row, last_row, input_ptr, input_squared.ptr(), output_ptr); - } + int x = window_start_x; + // Compute serially starting elements for the case x dimension is width + for (; x < radius && x < window_end_x && dim == 0; ++x) + { + sequential_normalization(x, id, current_row, first_row, last_row, input_ptr, input_squared.ptr(), + output_ptr); + } - // Compute vectorized - for(; x <= window_end_x - window_step_x - radius; x += window_step_x) - { - const int current_slice = dim == 0 ? x : id[dim]; - const int first_slice = std::max(current_slice - radius, 0); - const int last_slice = std::min(current_slice + radius, max_right); - - const uint8_t *const input_squared_x_ptr = input_squared.ptr() + x * input_squared_stride_x; - // Accumulate 2D In-Map values - auto accu = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); - for(int j = first_row; j <= last_row; ++j) + // Compute vectorized + for (; x <= window_end_x - window_step_x - radius; x += window_step_x) { - // Compute row displacement - const uint8_t *const input_squared_ptr = input_squared_x_ptr + (j - current_row) * input_squared_stride_row; - for(int i = first_slice; i <= last_slice; ++i) + const int current_slice = dim == 0 ? x : id[dim]; + const int first_slice = std::max(current_slice - radius, 0); + const int last_slice = std::min(current_slice + radius, max_right); + + const uint8_t *const input_squared_x_ptr = input_squared.ptr() + x * input_squared_stride_x; + // Accumulate 2D In-Map values + auto accu = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); + for (int j = first_row; j <= last_row; ++j) { - accu = wrapper::vadd(accu, wrapper::vloadq(reinterpret_cast(input_squared_ptr + (i - current_slice) * input_squared_stride_slice))); + // Compute row displacement + const uint8_t *const input_squared_ptr = + input_squared_x_ptr + (j - current_row) * input_squared_stride_row; + for (int i = first_slice; i <= last_slice; ++i) + { + accu = wrapper::vadd( + accu, wrapper::vloadq(reinterpret_cast( + input_squared_ptr + (i - current_slice) * input_squared_stride_slice))); + } } - } - // Normalize - const auto normalized = wrapper::vpow(wrapper::vmla(kappa_vec, coeff_vec, accu), beta_vec); - const auto normalized_pixel = wrapper::vmul(wrapper::vloadq(input_ptr + x), wrapper::vinv(normalized)); - wrapper::vstore(reinterpret_cast(output_ptr + x), normalized_pixel); - } + // Normalize + const auto normalized = wrapper::vpow(wrapper::vmla(kappa_vec, coeff_vec, accu), beta_vec); + const auto normalized_pixel = wrapper::vmul(wrapper::vloadq(input_ptr + x), wrapper::vinv(normalized)); + wrapper::vstore(reinterpret_cast(output_ptr + x), normalized_pixel); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - sequential_normalization(x, id, current_row, first_row, last_row, input_ptr, input_squared.ptr(), output_ptr); - } - }, - input, input_squared, output); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + sequential_normalization(x, id, current_row, first_row, last_row, input_ptr, input_squared.ptr(), + output_ptr); + } + }, + input, input_squared, output); } -Status NENormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *input_squared, const ITensorInfo *output, const NormalizationLayerInfo norm_info) +Status 
NENormalizationLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *input_squared, + const ITensorInfo *output, + const NormalizationLayerInfo norm_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, input_squared, output, norm_info)); diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.h b/src/core/NEON/kernels/NENormalizationLayerKernel.h index 53a06b9ed9..2d8d9f3d60 100644 --- a/src/core/NEON/kernels/NENormalizationLayerKernel.h +++ b/src/core/NEON/kernels/NENormalizationLayerKernel.h @@ -60,7 +60,8 @@ public: * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type and layout supported: same as @p input. * @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters. */ - void configure(const ITensor *input, const ITensor *input_squared, ITensor *output, NormalizationLayerInfo norm_info); + void + configure(const ITensor *input, const ITensor *input_squared, ITensor *output, NormalizationLayerInfo norm_info); /** Static function to check if given info will lead to a valid configuration of @ref NENormalizationLayerKernel * * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM], @@ -72,7 +73,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *input_squared, const ITensorInfo *output, NormalizationLayerInfo norm_info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *input_squared, + const ITensorInfo *output, + NormalizationLayerInfo norm_info); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/src/core/NEON/kernels/NEPadLayerKernel.cpp b/src/core/NEON/kernels/NEPadLayerKernel.cpp index 734510b637..c9bcbc9127 100644 --- a/src/core/NEON/kernels/NEPadLayerKernel.cpp +++ b/src/core/NEON/kernels/NEPadLayerKernel.cpp @@ -28,26 +28,31 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &paddings, const PaddingMode mode) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *output, + const PaddingList &paddings, + const PaddingMode mode) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON_MSG(mode != PaddingMode::CONSTANT, "Only constant padding mode is supported"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(paddings.size() > 4, "Padding list bigger than 4 dimensions"); - if(output->total_size() != 0) + if (output->total_size() != 0) { - const TensorShape expected_output_shape = arm_compute::misc::shape_calculator::compute_padded_shape(input->tensor_shape(), paddings); - const TensorInfo expected_output_info = input->clone()->set_tensor_shape(expected_output_shape); + const TensorShape expected_output_shape = + arm_compute::misc::shape_calculator::compute_padded_shape(input->tensor_shape(), paddings); + const TensorInfo expected_output_info = 
input->clone()->set_tensor_shape(expected_output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &expected_output_info); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); } @@ -58,30 +63,34 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c template void NEPadLayerKernel::run_pad_constant(const Window &window) { - Window output_window{ window }; + Window output_window{window}; output_window.set(Window::DimX, Window::Dimension(0, 1, 1)); const size_t element_size = _input->info()->element_size(); Iterator output_it(_output, output_window); - execute_window_loop(output_window, [&](const Coordinates & id) - { - Coordinates idin{ id }; - for(size_t dim = _padding.size() - 1; dim > 0; --dim) + execute_window_loop( + output_window, + [&](const Coordinates &id) { - idin[dim] -= _padding[dim].first; - if(idin[dim] < 0 || static_cast(_input->info()->dimension(dim)) - 1 < idin[dim]) + Coordinates idin{id}; + for (size_t dim = _padding.size() - 1; dim > 0; --dim) { - std::fill_n(reinterpret_cast(output_it.ptr()), _output->info()->dimension(0), _constant_value.get()); - return; + idin[dim] -= _padding[dim].first; + if (idin[dim] < 0 || static_cast(_input->info()->dimension(dim)) - 1 < idin[dim]) + { + std::fill_n(reinterpret_cast(output_it.ptr()), _output->info()->dimension(0), + _constant_value.get()); + return; + } } - } - T *input_it_ptr = reinterpret_cast(_input->ptr_to_element(idin)); - T *output_it_ptr = reinterpret_cast(output_it.ptr()); - std::fill_n(output_it_ptr, _padding[0].first, _constant_value.get()); - memcpy(output_it_ptr + _padding[0].first, input_it_ptr, _input->info()->dimension(0) * element_size); - std::fill_n(output_it_ptr + _padding[0].first + _input->info()->dimension(0), _padding[0].second, _constant_value.get()); - }, - output_it); + T *input_it_ptr = reinterpret_cast(_input->ptr_to_element(idin)); + T *output_it_ptr = reinterpret_cast(output_it.ptr()); + std::fill_n(output_it_ptr, _padding[0].first, _constant_value.get()); + memcpy(output_it_ptr + _padding[0].first, input_it_ptr, _input->info()->dimension(0) * element_size); + std::fill_n(output_it_ptr + _padding[0].first + _input->info()->dimension(0), _padding[0].second, + _constant_value.get()); + }, + output_it); } void NEPadLayerKernel::run_pad_constant_uint8_3Dinput_3Dpad(const Window &window) @@ -92,7 +101,7 @@ void NEPadLayerKernel::run_pad_constant_uint8_3Dinput_3Dpad(const Window &window const size_t end_plane = window.z().end(); size_t start_plane_input = start_plane; - if(_padding.size() > 2) + if (_padding.size() > 2) { start_plane_input = (start_plane < _padding[2].first) ? 
0 : start_plane - _padding[2].first; } @@ -105,18 +114,20 @@ void NEPadLayerKernel::run_pad_constant_uint8_3Dinput_3Dpad(const Window &window const size_t jump_to_next_row_input = _input->info()->dimension(0); const size_t jump_to_next_row_output = _padding[0].first + _padding[0].second; - uint8_t *output_row_ptr = _output->buffer() + _output->info()->offset_first_element_in_bytes() + start_plane * output_plane_size; - const uint8_t *input_it_ptr = _input->buffer() + _input->info()->offset_first_element_in_bytes() + start_plane_input * input_plane_size; - const auto pad_value = _constant_value.get(); + uint8_t *output_row_ptr = + _output->buffer() + _output->info()->offset_first_element_in_bytes() + start_plane * output_plane_size; + const uint8_t *input_it_ptr = + _input->buffer() + _input->info()->offset_first_element_in_bytes() + start_plane_input * input_plane_size; + const auto pad_value = _constant_value.get(); - for(size_t z_i = start_plane; z_i < end_plane; ++z_i) + for (size_t z_i = start_plane; z_i < end_plane; ++z_i) { - if(_padding.size() > 2 && z_i < _padding[2].first) + if (_padding.size() > 2 && z_i < _padding[2].first) { memset(output_row_ptr, pad_value, output_plane_size); output_row_ptr += output_plane_size; } - else if(_padding.size() > 2 && z_i > (_input->info()->dimension(2) + _padding[2].first - 1)) + else if (_padding.size() > 2 && z_i > (_input->info()->dimension(2) + _padding[2].first - 1)) { memset(output_row_ptr, pad_value, output_plane_size); output_row_ptr += output_plane_size; @@ -127,7 +138,7 @@ void NEPadLayerKernel::run_pad_constant_uint8_3Dinput_3Dpad(const Window &window output_row_ptr += pad_y_elems_top; size_t y_i = _input->info()->dimension(1); // Basic loop unrolling - for(; y_i > 3; y_i -= 4) + for (; y_i > 3; y_i -= 4) { memset(output_row_ptr, pad_value, _padding[0].first); output_row_ptr += _padding[0].first; @@ -160,7 +171,7 @@ void NEPadLayerKernel::run_pad_constant_uint8_3Dinput_3Dpad(const Window &window memset(output_row_ptr, pad_value, _padding[0].second); output_row_ptr += _padding[0].second; } - for(; y_i > 0; --y_i) + for (; y_i > 0; --y_i) { memset(output_row_ptr, pad_value, _padding[0].first); output_row_ptr += _padding[0].first; @@ -183,12 +194,17 @@ NEPadLayerKernel::NEPadLayerKernel() { } -void NEPadLayerKernel::configure(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value, const PaddingMode mode) +void NEPadLayerKernel::configure(ITensor *input, + ITensor *output, + const PaddingList &padding, + const PixelValue constant_value, + const PaddingMode mode) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); // Auto-init - const TensorShape expected_output_shape = arm_compute::misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), padding); - const TensorInfo expected_output_info = input->info()->clone()->set_tensor_shape(expected_output_shape); + const TensorShape expected_output_shape = + arm_compute::misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), padding); + const TensorInfo expected_output_info = input->info()->clone()->set_tensor_shape(expected_output_shape); auto_init_if_empty(*output->info(), expected_output_info); // Perform validation step @@ -200,14 +216,14 @@ void NEPadLayerKernel::configure(ITensor *input, ITensor *output, const PaddingL _constant_value = constant_value; _mode = mode; - if(_mode == PaddingMode::CONSTANT) + if (_mode == PaddingMode::CONSTANT) { - switch(_input->info()->element_size()) + switch (_input->info()->element_size()) 
{ case 1: - if(_input->info()->num_dimensions() == 3 && // Is 3D - padding.size() <= 3 && // Has 3D padding - !_input->info()->has_padding() && !_output->info()->has_padding()) // Input & Output have no padding + if (_input->info()->num_dimensions() == 3 && // Is 3D + padding.size() <= 3 && // Has 3D padding + !_input->info()->has_padding() && !_output->info()->has_padding()) // Input & Output have no padding { _func = &NEPadLayerKernel::run_pad_constant_uint8_3Dinput_3Dpad; } @@ -240,7 +256,11 @@ void NEPadLayerKernel::configure(ITensor *input, ITensor *output, const PaddingL ICPPKernel::configure(win); } -Status NEPadLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, const PixelValue constant_value, const PaddingMode mode) +Status NEPadLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const PaddingList &padding, + const PixelValue constant_value, + const PaddingMode mode) { ARM_COMPUTE_UNUSED(constant_value); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, padding, mode)); @@ -253,7 +273,7 @@ void NEPadLayerKernel::run(const Window &window, const ThreadInfo &info) ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - if(_func != nullptr) + if (_func != nullptr) { (this->*_func)(window); } @@ -263,7 +283,7 @@ size_t NEPadLayerKernel::get_mws(const CPUInfo &platform, size_t thread_count) c { ARM_COMPUTE_UNUSED(thread_count); ARM_COMPUTE_UNUSED(platform); - + return ICPPKernel::default_mws; } diff --git a/src/core/NEON/kernels/NEPadLayerKernel.h b/src/core/NEON/kernels/NEPadLayerKernel.h index f82af1558a..d432887d2c 100644 --- a/src/core/NEON/kernels/NEPadLayerKernel.h +++ b/src/core/NEON/kernels/NEPadLayerKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_NEPADLAYERKERNEL_H #include "arm_compute/core/PixelValue.h" + #include "src/core/NEON/INEKernel.h" namespace arm_compute @@ -62,7 +63,11 @@ public: * @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT. * Only CONSTANT padding mode is currently supported */ - void configure(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value = PixelValue(), const PaddingMode mode = PaddingMode::CONSTANT); + void configure(ITensor *input, + ITensor *output, + const PaddingList &padding, + const PixelValue constant_value = PixelValue(), + const PaddingMode mode = PaddingMode::CONSTANT); /** Static function to check if given info will lead to a valid configuration of @ref NEPadLayer. * * @param[in] input Source tensor info. Data types supported: All. 
@@ -75,7 +80,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, const PixelValue constant_value = PixelValue(), const PaddingMode mode = PaddingMode::CONSTANT); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const PaddingList &padding, + const PixelValue constant_value = PixelValue(), + const PaddingMode mode = PaddingMode::CONSTANT); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp b/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp index 3d89933377..15e933e66e 100644 --- a/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp +++ b/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -36,7 +37,10 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info) +Status validate_arguments(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const PriorBoxLayerInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F32); @@ -45,10 +49,10 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, // Check variances const int var_size = info.variances().size(); - if(var_size > 1) + if (var_size > 1) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(var_size != 4, "Must provide 4 variance values"); - for(int i = 0; i < var_size; ++i) + for (int i = 0; i < var_size; ++i) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(var_size <= 0, "Must be greater than 0"); } @@ -56,17 +60,19 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.steps()[0] < 0.f, "Step x should be greater or equal to 0"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.steps()[1] < 0.f, "Step y should be greater or equal to 0"); - if(!info.max_sizes().empty()) + if (!info.max_sizes().empty()) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes().size() != info.min_sizes().size(), "Max and min sizes dimensions should match"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes().size() != info.min_sizes().size(), + "Max and min sizes dimensions should match"); } - for(unsigned int i = 0; i < info.max_sizes().size(); ++i) + for (unsigned int i = 0; i < info.max_sizes().size(); ++i) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes()[i] < info.min_sizes()[i], "Max size should be greater than min size"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes()[i] < info.min_sizes()[i], + "Max size should be greater than min size"); } - if(output != nullptr && output->total_size() != 0) + if (output != nullptr && output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != 2); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output); @@ -76,21 +82,26 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, } } // namespace -NEPriorBoxLayerKernel::NEPriorBoxLayerKernel() - : _input1(nullptr), _input2(nullptr), _output(nullptr), _info() +NEPriorBoxLayerKernel::NEPriorBoxLayerKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr), _info() { } -void 
NEPriorBoxLayerKernel::store_coordinates(float *out, const int offset, const float center_x, const float center_y, const float box_width, const float box_height, const int width, - const int height) +void NEPriorBoxLayerKernel::store_coordinates(float *out, + const int offset, + const float center_x, + const float center_y, + const float box_width, + const float box_height, + const int width, + const int height) { float xmin = (center_x - box_width / 2.f) / width; float ymin = (center_y - box_height / 2.f) / height; float xmax = (center_x + box_width / 2.f) / width; float ymax = (center_y + box_height / 2.f) / height; - float32x4_t vec_elements = { xmin, ymin, xmax, ymax }; - if(_info.clip()) + float32x4_t vec_elements = {xmin, ymin, xmax, ymax}; + if (_info.clip()) { static const float32x4_t CONST_0 = vdupq_n_f32(0.f); static const float32x4_t CONST_1 = vdupq_n_f32(1.f); @@ -112,7 +123,7 @@ void NEPriorBoxLayerKernel::calculate_prior_boxes(const Window &window) int img_width = _info.img_size().x; int img_height = _info.img_size().y; - if(img_width == 0 || img_height == 0) + if (img_width == 0 || img_height == 0) { img_width = _input2->info()->dimension(width_idx); img_height = _input2->info()->dimension(height_idx); @@ -120,7 +131,7 @@ void NEPriorBoxLayerKernel::calculate_prior_boxes(const Window &window) float step_x = _info.steps()[0]; float step_y = _info.steps()[1]; - if(step_x == 0.f || step_y == 0.f) + if (step_x == 0.f || step_y == 0.f) { step_x = static_cast(img_width) / layer_width; step_y = static_cast(img_height) / layer_height; @@ -130,74 +141,80 @@ void NEPriorBoxLayerKernel::calculate_prior_boxes(const Window &window) slice.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 2)); Iterator output(_output, slice); - execute_window_loop(slice, [&](const Coordinates & id) - { - float center_x = 0; - float center_y = 0; - int idx = id.x() / (4 * num_priors); - center_x = (static_cast(idx % layer_width) + _info.offset()) * step_x; - center_y = (static_cast(idx / layer_width) + _info.offset()) * step_y; - - float box_width; - float box_height; - int offset = 0; - - auto out = reinterpret_cast(output.ptr()); - for(unsigned int i = 0; i < _info.min_sizes().size(); ++i) + execute_window_loop( + slice, + [&](const Coordinates &id) { - const float min_size = _info.min_sizes().at(i); - box_width = min_size; - box_height = min_size; - store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height); - offset += 4; - - if(!_info.max_sizes().empty()) + float center_x = 0; + float center_y = 0; + int idx = id.x() / (4 * num_priors); + center_x = (static_cast(idx % layer_width) + _info.offset()) * step_x; + center_y = (static_cast(idx / layer_width) + _info.offset()) * step_y; + + float box_width; + float box_height; + int offset = 0; + + auto out = reinterpret_cast(output.ptr()); + for (unsigned int i = 0; i < _info.min_sizes().size(); ++i) { - const float max_size = _info.max_sizes().at(i); - box_width = std::sqrt(min_size * max_size); - box_height = box_width; - + const float min_size = _info.min_sizes().at(i); + box_width = min_size; + box_height = min_size; store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height); offset += 4; - } - // rest of priors - for(auto ar : _info.aspect_ratios()) - { - if(fabs(ar - 1.) 
< 1e-6) + if (!_info.max_sizes().empty()) { - continue; + const float max_size = _info.max_sizes().at(i); + box_width = std::sqrt(min_size * max_size); + box_height = box_width; + + store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height); + offset += 4; } - box_width = min_size * sqrt(ar); - box_height = min_size / sqrt(ar); + // rest of priors + for (auto ar : _info.aspect_ratios()) + { + if (fabs(ar - 1.) < 1e-6) + { + continue; + } - store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height); - offset += 4; + box_width = min_size * sqrt(ar); + box_height = min_size / sqrt(ar); + + store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height); + offset += 4; + } } - } - // set the variance - out = reinterpret_cast(_output->ptr_to_element(Coordinates(id.x(), 1))); - float32x4_t var; - if(_info.variances().size() == 1) - { - var = vdupq_n_f32(_info.variances().at(0)); - } - else - { - const float32x4_t vars = { _info.variances().at(0), _info.variances().at(1), _info.variances().at(2), _info.variances().at(3) }; - var = vars; - } - for(int i = 0; i < num_priors; ++i) - { - vst1q_f32(out + 4 * i, var); - } - }, - output); + // set the variance + out = reinterpret_cast(_output->ptr_to_element(Coordinates(id.x(), 1))); + float32x4_t var; + if (_info.variances().size() == 1) + { + var = vdupq_n_f32(_info.variances().at(0)); + } + else + { + const float32x4_t vars = {_info.variances().at(0), _info.variances().at(1), _info.variances().at(2), + _info.variances().at(3)}; + var = vars; + } + for (int i = 0; i < num_priors; ++i) + { + vst1q_f32(out + 4 * i, var); + } + }, + output); } -void NEPriorBoxLayerKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output, const PriorBoxLayerInfo &info) +void NEPriorBoxLayerKernel::configure(const ITensor *input1, + const ITensor *input2, + ITensor *output, + const PriorBoxLayerInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); @@ -215,7 +232,10 @@ void NEPriorBoxLayerKernel::configure(const ITensor *input1, const ITensor *inpu INEKernel::configure(win); } -Status NEPriorBoxLayerKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info) +Status NEPriorBoxLayerKernel::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const PriorBoxLayerInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, info)); @@ -231,4 +251,4 @@ void NEPriorBoxLayerKernel::run(const Window &window, const ThreadInfo &info) // Run function calculate_prior_boxes(window); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/core/NEON/kernels/NEPriorBoxLayerKernel.h b/src/core/NEON/kernels/NEPriorBoxLayerKernel.h index 430a47f9f8..460f80e085 100644 --- a/src/core/NEON/kernels/NEPriorBoxLayerKernel.h +++ b/src/core/NEON/kernels/NEPriorBoxLayerKernel.h @@ -67,7 +67,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const PriorBoxLayerInfo &info); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; @@ -84,7 +87,14 @@ 
private: * @param[in] width Input width. * @param[in] height Input height. */ - void store_coordinates(float *out, const int offset, const float center_x, const float center_y, const float box_width, const float box_height, const int width, const int height); + void store_coordinates(float *out, + const int offset, + const float center_x, + const float center_y, + const float box_width, + const float box_height, + const int width, + const int height); /** Function to calculate prior boxes. * * @param[in] window Input region on which to execute the kernel. diff --git a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp index 46a0f625ce..8e1ed3a2a5 100644 --- a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp +++ b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp @@ -26,17 +26,17 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" + #include "src/core/CPP/Validate.h" -#include "src/core/NEON/NEFixedPoint.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/NESymm.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/NESymm.h" #include @@ -72,8 +72,8 @@ inline int64x2x2_t mul_add(const int32x4_t &a, const int32x4_t &b, const int32x4 const int64_t b_3 = vgetlane(b_high, 1); int64x2x2_t result; - const int64x2_t result_0{ a_0 * b_0, a_1 * b_1 }; - const int64x2_t result_1{ a_2 * b_2, a_3 * b_3 }; + const int64x2_t result_0{a_0 * b_0, a_1 * b_1}; + const int64x2_t result_1{a_2 * b_2, a_3 * b_3}; result.val[0] = vadd(vmovl(vgetlow(bias)), result_0); result.val[1] = vadd(vmovl(vgethigh(bias)), result_1); @@ -81,15 +81,17 @@ inline int64x2x2_t mul_add(const int32x4_t &a, const int32x4_t &b, const int32x4 } } // namespace -void NEQLSTMLayerNormalizationKernel::configure(const ITensor *input, ITensor *output, const ITensor *weight, const ITensor *bias) +void NEQLSTMLayerNormalizationKernel::configure(const ITensor *input, + ITensor *output, + const ITensor *weight, + const ITensor *bias) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weight, bias, output); ARM_COMPUTE_ERROR_ON(input == output); ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), weight->info(), bias->info())); - static const std::map fn_map = - { - { DataType::QSYMM16, std::mem_fn(&NEQLSTMLayerNormalizationKernel::compute_qsymm16) }, + static const std::map fn_map = { + {DataType::QSYMM16, std::mem_fn(&NEQLSTMLayerNormalizationKernel::compute_qsymm16)}, }; _input = input; @@ -102,10 +104,10 @@ void NEQLSTMLayerNormalizationKernel::configure(const ITensor *input, ITensor *o _output->info()->set_quantization_info(compute_output_qinfo()); const UniformQuantizationInfo wq_info = _weight->info()->quantization_info().uniform(); - const Status s = quantization::calculate_quantized_multiplier(wq_info.scale, &_output_multiplier, &_output_shift); + const Status s = quantization::calculate_quantized_multiplier(wq_info.scale, &_output_multiplier, &_output_shift); _output_shift *= -1; - if(!bool(s)) + if (!bool(s)) { _output_multiplier = 0; _output_shift = 0; @@ -134,7 
+136,10 @@ Window NEQLSTMLayerNormalizationKernel::configure_window(ITensor *target) return window; } -Status NEQLSTMLayerNormalizationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias) +Status NEQLSTMLayerNormalizationKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *weight, + const ITensorInfo *bias) { ARM_COMPUTE_UNUSED(output, bias, weight, input); @@ -151,7 +156,7 @@ Status NEQLSTMLayerNormalizationKernel::validate(const ITensorInfo *input, const ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().x() != weight->tensor_shape().x()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(weight, bias); - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); @@ -182,11 +187,11 @@ inline std::pair NEQLSTMLayerNormalizationKernel::sum_qsymm16( using AccType = int64_t; using InputDataType = int16_t; - AccType sum{ 0 }; - AccType sum_sq{ 0 }; + AccType sum{0}; + AccType sum_sq{0}; int32_t x = _window_start_x; - for(; x <= _window_end_x && _window_step_x <= (_window_end_x - x); x += _window_step_x) + for (; x <= _window_end_x && _window_step_x <= (_window_end_x - x); x += _window_step_x) { using namespace wrapper; const int16x8_t val = vloadq(input_ptr + x); @@ -216,7 +221,7 @@ inline std::pair NEQLSTMLayerNormalizationKernel::sum_qsymm16( #endif // __aarch64__ } - for(; x < _window_end_x; ++x) + for (; x < _window_end_x; ++x) { const InputDataType val = input_ptr[x]; sum += static_cast(val); @@ -230,7 +235,9 @@ inline void NEQLSTMLayerNormalizationKernel::normalize_qasymm16(const int16_t *i int16_t *output_ptr, const int16_t *weight_ptr, const int32_t *bias_ptr, - int32_t mean, int32_t inv_std_mul, int32_t inv_std_shift) + int32_t mean, + int32_t inv_std_mul, + int32_t inv_std_shift) { using OutputDataType = int16_t; @@ -238,7 +245,7 @@ inline void NEQLSTMLayerNormalizationKernel::normalize_qasymm16(const int16_t *i const int32x4_t mean_vec = vdup_n(mean, wrapper::traits::vector_128_tag{}); int32_t x = _window_start_x; - for(; x <= _window_end_x && _window_step_x <= (_window_end_x - x); x += _window_step_x) + for (; x <= _window_end_x && _window_step_x <= (_window_end_x - x); x += _window_step_x) { const int16x8_t val = vloadq(input_ptr + x); int32x4x2_t shifted; @@ -267,16 +274,18 @@ inline void NEQLSTMLayerNormalizationKernel::normalize_qasymm16(const int16_t *i vstore(output_ptr + x + 4, vqmovn(out_val.val[1])); } - for(; x < _window_end_x; ++x) + for (; x < _window_end_x; ++x) { - const auto val = static_cast(input_ptr[x]); - const int32_t shifted = (val << 10) - mean; - const int32_t rescaled = quantization::multiply_by_quantized_multiplier(shifted, inv_std_mul, inv_std_shift); - const int64_t weighted = rescaled * weight_ptr[x] + bias_ptr[x]; + const auto val = static_cast(input_ptr[x]); + const int32_t shifted = (val << 10) - mean; + const int32_t rescaled = quantization::multiply_by_quantized_multiplier(shifted, inv_std_mul, inv_std_shift); + const int64_t weighted = rescaled * weight_ptr[x] + bias_ptr[x]; const auto reverse_shifted = static_cast((weighted + 512) >> 10); - int32_t out_val = quantization::multiply_by_quantized_multiplier(reverse_shifted, _output_multiplier, _output_shift + 12); - out_val = utility::clamp(out_val, std::numeric_limits::min()); - output_ptr[x] = static_cast(out_val); + int32_t out_val = + 
quantization::multiply_by_quantized_multiplier(reverse_shifted, _output_multiplier, _output_shift + 12); + out_val = + utility::clamp(out_val, std::numeric_limits::min()); + output_ptr[x] = static_cast(out_val); } } @@ -287,35 +296,38 @@ void NEQLSTMLayerNormalizationKernel::compute_qsymm16() using BiasDataType = int32_t; using AccType = int64_t; - Iterator input_iterator{ _input, _inout_window }; - Iterator output_iterator{ _output, _inout_window }; - Iterator weight_iterator{ _weight, _weight_window }; - Iterator bias_iterator{ _bias, _weight_window }; + Iterator input_iterator{_input, _inout_window}; + Iterator output_iterator{_output, _inout_window}; + Iterator weight_iterator{_weight, _weight_window}; + Iterator bias_iterator{_bias, _weight_window}; const auto weight_ptr = reinterpret_cast(weight_iterator.ptr()); const auto bias_ptr = reinterpret_cast(bias_iterator.ptr()); const uint32_t column_size = _input->info()->tensor_shape()[0]; - execute_window_loop(_inout_window, [ &, this](const Coordinates &) - { - const auto in_ptr = reinterpret_cast(input_iterator.ptr()); - auto out_ptr = reinterpret_cast(output_iterator.ptr()); - - AccType sum{ 0 }; - AccType sum_sq{ 0 }; - std::tie(sum, sum_sq) = sum_qsymm16(in_ptr); - - AccType mean{ 0 }; - AccType variance{ 0 }; - std::tie(mean, variance) = compute_mean_variance(sum, sum_sq, column_size); - - int32_t stddev_invsqrt_mul{}; - int32_t stddev_invsqrt_shift{}; - quantization::get_invsqrt_quantized_multiplier_exp(static_cast(variance), -1, stddev_invsqrt_mul, stddev_invsqrt_shift); - - normalize_qasymm16(in_ptr, out_ptr, weight_ptr, bias_ptr, mean, stddev_invsqrt_mul, stddev_invsqrt_shift); - }, - input_iterator, output_iterator); + execute_window_loop( + _inout_window, + [&, this](const Coordinates &) + { + const auto in_ptr = reinterpret_cast(input_iterator.ptr()); + auto out_ptr = reinterpret_cast(output_iterator.ptr()); + + AccType sum{0}; + AccType sum_sq{0}; + std::tie(sum, sum_sq) = sum_qsymm16(in_ptr); + + AccType mean{0}; + AccType variance{0}; + std::tie(mean, variance) = compute_mean_variance(sum, sum_sq, column_size); + + int32_t stddev_invsqrt_mul{}; + int32_t stddev_invsqrt_shift{}; + quantization::get_invsqrt_quantized_multiplier_exp(static_cast(variance), -1, stddev_invsqrt_mul, + stddev_invsqrt_shift); + + normalize_qasymm16(in_ptr, out_ptr, weight_ptr, bias_ptr, mean, stddev_invsqrt_mul, stddev_invsqrt_shift); + }, + input_iterator, output_iterator); } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h index a3ff6e988f..af5b6a0315 100644 --- a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h +++ b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_NEQLSTMLAYERNORMALIZATIONKERNEL_H #include "src/core/NEON/INEKernel.h" + #include namespace arm_compute @@ -69,34 +70,26 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias); + static Status + validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; private: // constants - static constexpr uint32_t max_input_dimension{ 2 }; /**< The maximum input dimension supported */ - static constexpr uint32_t max_weight_dimension{ 1 }; /**< The maximum weight dimension supported 
*/ - static constexpr uint32_t max_bias_dimension{ 1 }; /**< The maximum bias dimension supported */ - static constexpr uint32_t vector_size_byte{ 16 }; /**< Computation vector size in byte */ + static constexpr uint32_t max_input_dimension{2}; /**< The maximum input dimension supported */ + static constexpr uint32_t max_weight_dimension{1}; /**< The maximum weight dimension supported */ + static constexpr uint32_t max_bias_dimension{1}; /**< The maximum bias dimension supported */ + static constexpr uint32_t vector_size_byte{16}; /**< Computation vector size in byte */ using ComputeFuncType = std::function; ComputeFuncType _fn{}; /**< Function pointer to computation function */ - const ITensor *_input - { - nullptr - }; /**< Input tensor */ - const ITensor *_weight - { - nullptr - }; /**< Weight tensor */ - const ITensor *_bias - { - nullptr - }; /**< Bias tensor */ - ITensor *_output{ nullptr }; /**< Output tensor */ + const ITensor *_input{nullptr}; /**< Input tensor */ + const ITensor *_weight{nullptr}; /**< Weight tensor */ + const ITensor *_bias{nullptr}; /**< Bias tensor */ + ITensor *_output{nullptr}; /**< Output tensor */ int32_t _output_multiplier{}; /**< Multiplier for output values */ int32_t _output_shift{}; /**< Shift value for output values */ @@ -138,7 +131,9 @@ private: int16_t *output_ptr, const int16_t *weight_ptr, const int32_t *bias_ptr, - int32_t mean, int32_t inv_std_mul, int32_t inv_std_shift); + int32_t mean, + int32_t inv_std_mul, + int32_t inv_std_shift); /** Function to compute output quantization information */ QuantizationInfo compute_output_qinfo(); }; diff --git a/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp b/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp index 802aebb526..486cd6d331 100644 --- a/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp +++ b/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp @@ -26,11 +26,12 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/misc/Utility.h" -#include "src/core/CPP/Validate.h" +#include "arm_compute/core/Window.h" + #include "src/core/common/Registrars.h" +#include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/cpu/kernels/roialign/list.h" @@ -49,7 +50,12 @@ struct ROIAlignSelectorData }; using ROIAlignSelctorPtr = std::add_pointer::type; -using ROIAlignUKernelPtr = std::add_pointer::type; +using ROIAlignUKernelPtr = std::add_pointer::type; struct ROIAlignKernel { @@ -58,31 +64,18 @@ struct ROIAlignKernel ROIAlignUKernelPtr ukernel; }; -static const ROIAlignKernel available_kernels[] = -{ - { - "fp32_neon_roialign", - [](const ROIAlignSelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_roialign) - }, +static const ROIAlignKernel available_kernels[] = { + {"fp32_neon_roialign", [](const ROIAlignSelectorData &data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_roialign)}, #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - { - "fp16_neon_roialign", - [](const ROIAlignSelectorData & data) { return data.dt == DataType::F16; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_roialign) - }, + {"fp16_neon_roialign", [](const ROIAlignSelectorData &data) { return data.dt == DataType::F16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_roialign)}, #endif 
// __ARM_FEATURE_FP16_VECTOR_ARITHMETIC #if defined(ARM_COMPUTE_ENABLE_NEON) - { - "qu8_neon_roialign", - [](const ROIAlignSelectorData & data) { return data.dt == DataType::QASYMM8; }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qu8_roialign) - }, - { - "qs8_neon_roialign", - [](const ROIAlignSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qs8_roialign) - }, + {"qu8_neon_roialign", [](const ROIAlignSelectorData &data) { return data.dt == DataType::QASYMM8; }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qu8_roialign)}, + {"qs8_neon_roialign", [](const ROIAlignSelectorData &data) { return data.dt == DataType::QASYMM8_SIGNED; }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qs8_roialign)}, #endif //defined(ARM_COMPUTE_ENABLE_NEON) }; @@ -94,9 +87,9 @@ static const ROIAlignKernel available_kernels[] = */ const ROIAlignKernel *get_implementation(const ROIAlignSelectorData &data) { - for(const auto &uk : available_kernels) + for (const auto &uk : available_kernels) { - if(uk.is_selected(data)) + if (uk.is_selected(data)) { return &uk; } @@ -104,24 +97,29 @@ const ROIAlignKernel *get_implementation(const ROIAlignSelectorData &data) return nullptr; } -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *rois, + ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, rois, output); ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(0) != 5); ARM_COMPUTE_RETURN_ERROR_ON(rois->num_dimensions() > 2); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F32, DataType::F16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F32, DataType::F16); ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC, DataLayout::NCHW); ARM_COMPUTE_RETURN_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0)); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(compute_roi_align_shape(*input, *rois, pool_info), output->tensor_shape()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(compute_roi_align_shape(*input, *rois, pool_info), + output->tensor_shape()); } - if(input->data_type() == DataType::QASYMM8 || input->data_type() == DataType::QASYMM8_SIGNED) + if (input->data_type() == DataType::QASYMM8 || input->data_type() == DataType::QASYMM8_SIGNED) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rois, 1, DataType::QASYMM16); @@ -143,13 +141,17 @@ NEROIAlignLayerKernel::NEROIAlignLayerKernel() { } -void NEROIAlignLayerKernel::configure(const ITensor *input, const ITensor *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info) +void NEROIAlignLayerKernel::configure(const ITensor *input, + const ITensor *rois, + ITensor *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, rois); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), rois->info(), output->info(), pool_info)); // Output auto inizialitation if not yet initialized const TensorShape 
output_shape = compute_roi_align_shape(*input->info(), *rois->info(), pool_info); - auto_init_if_empty((*output->info()), output_shape, 1, input->info()->data_type(), input->info()->quantization_info()); + auto_init_if_empty((*output->info()), output_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); output->info()->set_data_layout(input->info()->data_layout()); // Configure kernel window @@ -167,7 +169,10 @@ void NEROIAlignLayerKernel::configure(const ITensor *input, const ITensor *rois, INEKernel::configure(window); } -Status NEROIAlignLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info) +Status NEROIAlignLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *rois, + ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, rois, output, pool_info)); return Status{}; @@ -176,9 +181,9 @@ Status NEROIAlignLayerKernel::validate(const ITensorInfo *input, const ITensorIn void NEROIAlignLayerKernel::run(const Window &window, const ThreadInfo &info) { const DataLayout data_layout = _input->info()->data_layout(); - if(data_layout == DataLayout::NCHW || data_layout == DataLayout::NHWC) + if (data_layout == DataLayout::NCHW || data_layout == DataLayout::NHWC) { - const auto *uk = get_implementation(ROIAlignSelectorData{ _input->info()->data_type() }); + const auto *uk = get_implementation(ROIAlignSelectorData{_input->info()->data_type()}); ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); uk->ukernel(_input, _output, _rois, _pool_info, window, info); diff --git a/src/core/NEON/kernels/NEROIAlignLayerKernel.h b/src/core/NEON/kernels/NEROIAlignLayerKernel.h index 48a3de7285..9cc538b429 100644 --- a/src/core/NEON/kernels/NEROIAlignLayerKernel.h +++ b/src/core/NEON/kernels/NEROIAlignLayerKernel.h @@ -83,7 +83,10 @@ public: * * @return a Status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *rois, + ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp index 400e8291d6..1a3810fb56 100644 --- a/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp +++ b/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp @@ -22,9 +22,11 @@ * SOFTWARE. 
*/ #include "src/core/NEON/kernels/NEROIPoolingLayerKernel.h" + #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" + #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -36,7 +38,10 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *rois, + const ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, rois); @@ -47,10 +52,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, con ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F32, DataType::QASYMM8); ARM_COMPUTE_RETURN_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0)); - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) != pool_info.pooled_width()) || (output->dimension(1) != pool_info.pooled_height())); + ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) != pool_info.pooled_width()) || + (output->dimension(1) != pool_info.pooled_height())); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) != output->dimension(2)); ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(1) != output->dimension(3)); } @@ -73,19 +79,28 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, con * @param[in] roi_indx Index of image of coordinate in output Tensor to store value */ template -void template_eval(const ITensor *input, const ITensor *output, int region_start_x, int region_start_y, - int region_end_x, int region_end_y, int fm, int px, int py, int roi_batch, int roi_indx) +void template_eval(const ITensor *input, + const ITensor *output, + int region_start_x, + int region_start_y, + int region_end_x, + int region_end_y, + int fm, + int px, + int py, + int roi_batch, + int roi_indx) { - if((region_end_x <= region_start_x) || (region_end_y <= region_start_y)) + if ((region_end_x <= region_start_x) || (region_end_y <= region_start_y)) { *reinterpret_cast(output->ptr_to_element(Coordinates(px, py, fm, roi_indx))) = 0; } else { T curr_max = std::numeric_limits::lowest(); // Min value of typename T - for(int j = region_start_y; j < region_end_y; ++j) + for (int j = region_start_y; j < region_end_y; ++j) { - for(int i = region_start_x; i < region_end_x; ++i) + for (int i = region_start_x; i < region_end_x; ++i) { const auto val = *reinterpret_cast(input->ptr_to_element(Coordinates(i, j, fm, roi_batch))); curr_max = std::max(val, curr_max); @@ -93,11 +108,13 @@ void template_eval(const ITensor *input, const ITensor *output, int region_start } // if quantized datatype, requantize then store in output tensor - if(is_data_type_quantized(input->info()->data_type())) + if (is_data_type_quantized(input->info()->data_type())) { // covert qasymm to new output quantization scale and offset - UniformQuantizationInfo uqinfo = compute_requantization_scale_offset(input->info()->quantization_info().uniform(), output->info()->quantization_info().uniform()); - *reinterpret_cast(output->ptr_to_element(Coordinates(px, py, fm, roi_indx))) = quantize_qasymm8(curr_max, uqinfo); + UniformQuantizationInfo uqinfo = compute_requantization_scale_offset( + 
input->info()->quantization_info().uniform(), output->info()->quantization_info().uniform()); + *reinterpret_cast(output->ptr_to_element(Coordinates(px, py, fm, roi_indx))) = + quantize_qasymm8(curr_max, uqinfo); } else { @@ -112,13 +129,19 @@ NEROIPoolingLayerKernel::NEROIPoolingLayerKernel() { } -Status NEROIPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info) +Status NEROIPoolingLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *rois, + const ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, rois, output, pool_info)); return Status{}; } -void NEROIPoolingLayerKernel::configure(const ITensor *input, const ITensor *rois, const ITensor *output, const ROIPoolingLayerInfo &pool_info) +void NEROIPoolingLayerKernel::configure(const ITensor *input, + const ITensor *rois, + const ITensor *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, rois); @@ -126,12 +149,15 @@ void NEROIPoolingLayerKernel::configure(const ITensor *input, const ITensor *roi ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), rois->info(), output->info(), pool_info)); // Output auto initialization if not yet initialized - TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2), rois->info()->dimension(1)); + TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2), + rois->info()->dimension(1)); - auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), output->info()->quantization_info()); + auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), + output->info()->quantization_info()); ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pool_info.pooled_width()) || (output->info()->dimension(1) != pool_info.pooled_height())); + ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pool_info.pooled_width()) || + (output->info()->dimension(1) != pool_info.pooled_height())); // Set instance variables _input = input; @@ -167,7 +193,7 @@ void NEROIPoolingLayerKernel::run(const Window &window, const ThreadInfo &info) const auto *rois_ptr = reinterpret_cast(_rois->buffer()); const auto data_type = _input->info()->data_type(); - for(int roi_indx = roi_list_start; roi_indx < roi_list_end; ++roi_indx) + for (int roi_indx = roi_list_start; roi_indx < roi_list_end; ++roi_indx) { const unsigned int roi_batch = rois_ptr[values_per_roi * roi_indx]; const auto x1 = rois_ptr[values_per_roi * roi_indx + 1]; @@ -182,30 +208,35 @@ void NEROIPoolingLayerKernel::run(const Window &window, const ThreadInfo &info) const int roi_height = std::max(support::cpp11::round((y2 - y1) * spatial_scale), 1.f); // Iterate through all feature maps - for(int fm = 0; fm < fms; ++fm) + for (int fm = 0; fm < fms; ++fm) { // Iterate through all output pixels - for(int py = 0; py < pooled_h; ++py) + for (int py = 0; py < pooled_h; ++py) { - for(int px = 0; px < pooled_w; ++px) + for (int px = 0; px < pooled_w; ++px) { auto region_start_x = static_cast(std::floor((static_cast(px) / pooled_w) * roi_width)); - auto region_end_x = static_cast(std::floor((static_cast(px + 1) / pooled_w) * roi_width)); - auto region_start_y = static_cast(std::floor((static_cast(py) / pooled_h) * roi_height)); - auto region_end_y = 
static_cast(std::floor((static_cast(py + 1) / pooled_h) * roi_height)); + auto region_end_x = + static_cast(std::floor((static_cast(px + 1) / pooled_w) * roi_width)); + auto region_start_y = + static_cast(std::floor((static_cast(py) / pooled_h) * roi_height)); + auto region_end_y = + static_cast(std::floor((static_cast(py + 1) / pooled_h) * roi_height)); region_start_x = std::min(std::max(region_start_x + roi_anchor_x, 0), width); region_end_x = std::min(std::max(region_end_x + roi_anchor_x, 0), width); region_start_y = std::min(std::max(region_start_y + roi_anchor_y, 0), height); region_end_y = std::min(std::max(region_end_y + roi_anchor_y, 0), height); - switch(data_type) + switch (data_type) { case DataType::F32: - template_eval(_input, _output, region_start_x, region_start_y, region_end_x, region_end_y, fm, px, py, roi_batch, roi_indx); + template_eval(_input, _output, region_start_x, region_start_y, region_end_x, + region_end_y, fm, px, py, roi_batch, roi_indx); break; case DataType::QASYMM8: - template_eval(_input, _output, region_start_x, region_start_y, region_end_x, region_end_y, fm, px, py, roi_batch, roi_indx); + template_eval(_input, _output, region_start_x, region_start_y, region_end_x, + region_end_y, fm, px, py, roi_batch, roi_indx); break; default: ARM_COMPUTE_ERROR("DataType not Supported"); @@ -216,4 +247,4 @@ void NEROIPoolingLayerKernel::run(const Window &window, const ThreadInfo &info) } } } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/core/NEON/kernels/NEROIPoolingLayerKernel.h b/src/core/NEON/kernels/NEROIPoolingLayerKernel.h index e7a7e90eef..81f6006ea2 100644 --- a/src/core/NEON/kernels/NEROIPoolingLayerKernel.h +++ b/src/core/NEON/kernels/NEROIPoolingLayerKernel.h @@ -63,7 +63,8 @@ public: * @note The z dimensions of @p output tensor and @p input tensor must be the same. * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois tensor. 
*/ - void configure(const ITensor *input, const ITensor *rois, const ITensor *output, const ROIPoolingLayerInfo &pool_info); + void + configure(const ITensor *input, const ITensor *rois, const ITensor *output, const ROIPoolingLayerInfo &pool_info); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; @@ -82,7 +83,10 @@ public: * * @return a Status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *rois, + const ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info); private: const ITensor *_input; diff --git a/src/core/NEON/kernels/NERangeKernel.cpp b/src/core/NEON/kernels/NERangeKernel.cpp index ec63a35de9..87b7b76b72 100644 --- a/src/core/NEON/kernels/NERangeKernel.cpp +++ b/src/core/NEON/kernels/NERangeKernel.cpp @@ -29,11 +29,12 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/common/Registrars.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/wrapper/wrapper.h" #include "src/cpu/kernels/range/list.h" namespace arm_compute @@ -55,48 +56,23 @@ struct RangeUKernel RangeUKernelPtr ukernel; }; -static const RangeUKernel available_kernels[] = -{ - { - "fp16_neon_range", - [](const RangeSelectorData & data) { return data.dt == DataType::F16; }, - REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_range_function) - }, - { - "f32_neon_range", - [](const RangeSelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_range_function) - }, - { - "u8_neon_range", - [](const RangeSelectorData & data) { return data.dt == DataType::U8; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::u8_neon_range_function) - }, - { - "u16_neon_range", - [](const RangeSelectorData & data) { return data.dt == DataType::U16; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::u16_neon_range_function) - }, - { - "u32_neon_range", - [](const RangeSelectorData & data) { return data.dt == DataType::U32; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::u32_neon_range_function) - }, - { - "s8_neon_range", - [](const RangeSelectorData & data) { return data.dt == DataType::S8; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::s8_neon_range_function) - }, - { - "s16_neon_range", - [](const RangeSelectorData & data) { return data.dt == DataType::S16; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::s16_neon_range_function) - }, - { - "s32_neon_range", - [](const RangeSelectorData & data) { return data.dt == DataType::S32; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::s32_neon_range_function) - }, +static const RangeUKernel available_kernels[] = { + {"fp16_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::F16; }, + REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_range_function)}, + {"f32_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::F32; }, + REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_range_function)}, + {"u8_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::U8; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::u8_neon_range_function)}, + {"u16_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::U16; }, + 
REGISTER_INTEGER_NEON(arm_compute::cpu::u16_neon_range_function)}, + {"u32_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::U32; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::u32_neon_range_function)}, + {"s8_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::S8; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::s8_neon_range_function)}, + {"s16_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::S16; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::s16_neon_range_function)}, + {"s32_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::S32; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::s32_neon_range_function)}, }; /** Micro-kernel selector @@ -107,9 +83,9 @@ static const RangeUKernel available_kernels[] = */ const RangeUKernel *get_implementation(const RangeSelectorData &data) { - for(const auto &uk : available_kernels) + for (const auto &uk : available_kernels) { - if(uk.is_selected(data)) + if (uk.is_selected(data)) { return &uk; } @@ -119,28 +95,31 @@ const RangeUKernel *get_implementation(const RangeSelectorData &data) Status validate_arguments(const ITensorInfo &output, const float start, const float end, const float step) { - const auto *uk = get_implementation(RangeSelectorData{ output.data_type() }); + const auto *uk = get_implementation(RangeSelectorData{output.data_type()}); ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); ARM_COMPUTE_RETURN_ERROR_ON_MSG((start == end), "start of the requested sequence must not be equal to the end"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(((start < end) && (step <= 0)), "step must be greater than 0 when start < end"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(((start > end) && (step >= 0)), "step must be less than 0 when start > end"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(start, output.data_type(), output.quantization_info()), "start value is outside the range of the data type"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(end, output.data_type(), output.quantization_info()), "end value is outside the range of the data type"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(step, output.data_type(), output.quantization_info()), "step value is outside the range of the data type"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(start, output.data_type(), output.quantization_info()), + "start value is outside the range of the data type"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(end, output.data_type(), output.quantization_info()), + "end value is outside the range of the data type"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(step, output.data_type(), output.quantization_info()), + "step value is outside the range of the data type"); ARM_COMPUTE_RETURN_ERROR_ON_MSG((start == end), "start of the requested sequence must not be equal to the end"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(output.num_dimensions() != 1, "Output has to be a 1-D tensor"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output.tensor_shape().total_size() < num_of_elements_in_range(start, end, step), "Output tensor size is incorrect"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output.tensor_shape().total_size() < num_of_elements_in_range(start, end, step), + "Output tensor size is incorrect"); return Status{}; } } // namespace -NERangeKernel::NERangeKernel() - : _start(0), _end(1), _step(1), _output(nullptr) +NERangeKernel::NERangeKernel() : _start(0), _end(1), _step(1), _output(nullptr) { } @@ -151,7 +130,8 @@ void NERangeKernel::configure(ITensor 
*output, float start, float end, float ste ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*(output->info()), start, end, step)); // Auto initialize output if not initialized - auto_init_if_empty(*output->info(), TensorShape(num_of_elements_in_range(start, end, step)), 1, output->info()->data_type(), output->info()->quantization_info()); + auto_init_if_empty(*output->info(), TensorShape(num_of_elements_in_range(start, end, step)), 1, + output->info()->data_type(), output->info()->quantization_info()); // Configure kernel window Window win = calculate_max_window(*output->info(), Steps()); @@ -178,7 +158,7 @@ void NERangeKernel::run(const Window &window, const ThreadInfo &info) ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - const auto *uk = get_implementation(RangeSelectorData{ _output->info()->data_type() }); + const auto *uk = get_implementation(RangeSelectorData{_output->info()->data_type()}); uk->ukernel(_output, _start, _step, window); } diff --git a/src/core/NEON/kernels/NERangeKernel.h b/src/core/NEON/kernels/NERangeKernel.h index 90560995e6..fa555c2c2e 100644 --- a/src/core/NEON/kernels/NERangeKernel.h +++ b/src/core/NEON/kernels/NERangeKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_NERANGEKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/NEON/INEKernel.h" namespace arm_compute diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp index 19955af493..455d604b3b 100644 --- a/src/core/NEON/kernels/NEReductionOperationKernel.cpp +++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp @@ -28,16 +28,17 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + #include "src/core/CPP/Validate.h" -#include "src/core/NEON/INEKernel.h" -#include "src/core/NEON/NEMath.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/INEKernel.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" #include "support/SaturateCast.h" -#include "src/core/NEON/wrapper/wrapper.h" #include namespace arm_compute @@ -48,7 +49,7 @@ namespace template void combine_and_store(int16x8_t t1, int16x8_t t2, Iterator &output, int offset = 0) { - if(std::is_same::value) + if (std::is_same::value) { auto res = wrapper::vcombine(wrapper::vqmovun(t1), wrapper::vqmovun(t2)); wrapper::vstore(output.ptr() + offset, res); @@ -63,8 +64,8 @@ void combine_and_store(int16x8_t t1, int16x8_t t2, Iterator &output, int offset template uint32x4x4_t calculate_index(uint32_t idx, T a, T b, uint32x4x4_t c, ReductionOperation op, int axis) { - uint32x4_t mask{ 0 }; - if(op == ReductionOperation::ARG_IDX_MIN) + uint32x4_t mask{0}; + if (op == ReductionOperation::ARG_IDX_MIN) { mask = wrapper::vcgt(b, a); } @@ -73,12 +74,12 @@ uint32x4x4_t calculate_index(uint32_t idx, T a, T b, uint32x4x4_t c, ReductionOp mask = wrapper::vclt(b, a); } - uint32x4_t vec_idx = { idx, idx + 1, idx + 2, idx + 3 }; - if(axis != 0) + uint32x4_t vec_idx = {idx, idx + 1, idx + 2, idx + 3}; + if (axis != 0) { vec_idx = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); } - uint32x4x4_t res = { { wrapper::vbsl(mask, vec_idx, c.val[0]), 0, 0, 0 } }; + uint32x4x4_t res = {{wrapper::vbsl(mask, vec_idx, 
c.val[0]), 0, 0, 0}}; return res; } @@ -86,9 +87,9 @@ uint32x4x4_t calculate_index(uint32_t idx, T a, T b, uint32x4x4_t c, ReductionOp template uint32x4x4_t calculate_index_quantized(uint32_t idx, T a, T b, uint32x4x4_t c, ReductionOperation op, int axis) { - uint32x4x4_t mask{ { 0 } }; - uint8x16_t mask_u8{ 0 }; - if(op == ReductionOperation::ARG_IDX_MIN) + uint32x4x4_t mask{{0}}; + uint8x16_t mask_u8{0}; + if (op == ReductionOperation::ARG_IDX_MIN) { mask_u8 = wrapper::vcgt(b, a); } @@ -96,44 +97,43 @@ uint32x4x4_t calculate_index_quantized(uint32_t idx, T a, T b, uint32x4x4_t c, R { mask_u8 = wrapper::vclt(b, a); } - auto wide_u16_1 = wrapper::vorr(vshll_n_u8(wrapper::vgetlow(mask_u8), 8), wrapper::vmovl(wrapper::vgetlow(mask_u8))); - auto wide_u16_2 = wrapper::vorr(vshll_n_u8(wrapper::vgethigh(mask_u8), 8), wrapper::vmovl(wrapper::vgethigh(mask_u8))); - mask.val[0] = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_1), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_1))); - mask.val[1] = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_1), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_1))); - mask.val[2] = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_2), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_2))); - mask.val[3] = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_2), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_2))); - - uint32x4x4_t vec_idx = { { { idx + 0, idx + 1, idx + 2, idx + 3 }, - { idx + 4, idx + 5, idx + 6, idx + 7 }, - { idx + 8, idx + 9, idx + 10, idx + 11 }, - { idx + 12, idx + 13, idx + 14, idx + 15 } - } - }; - if(axis != 0) + auto wide_u16_1 = + wrapper::vorr(vshll_n_u8(wrapper::vgetlow(mask_u8), 8), wrapper::vmovl(wrapper::vgetlow(mask_u8))); + auto wide_u16_2 = + wrapper::vorr(vshll_n_u8(wrapper::vgethigh(mask_u8), 8), wrapper::vmovl(wrapper::vgethigh(mask_u8))); + mask.val[0] = + wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_1), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_1))); + mask.val[1] = + wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_1), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_1))); + mask.val[2] = + wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_2), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_2))); + mask.val[3] = + wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_2), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_2))); + + uint32x4x4_t vec_idx = {{{idx + 0, idx + 1, idx + 2, idx + 3}, + {idx + 4, idx + 5, idx + 6, idx + 7}, + {idx + 8, idx + 9, idx + 10, idx + 11}, + {idx + 12, idx + 13, idx + 14, idx + 15}}}; + if (axis != 0) { vec_idx.val[0] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); vec_idx.val[1] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); vec_idx.val[2] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); vec_idx.val[3] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); } - uint32x4x4_t res = - { - { - vbslq_u32(mask.val[0], vec_idx.val[0], c.val[0]), - vbslq_u32(mask.val[1], vec_idx.val[1], c.val[1]), - vbslq_u32(mask.val[2], vec_idx.val[2], c.val[2]), - vbslq_u32(mask.val[3], vec_idx.val[3], c.val[3]) - } - }; + uint32x4x4_t res = { + {vbslq_u32(mask.val[0], vec_idx.val[0], c.val[0]), vbslq_u32(mask.val[1], vec_idx.val[1], c.val[1]), + vbslq_u32(mask.val[2], vec_idx.val[2], c.val[2]), vbslq_u32(mask.val[3], vec_idx.val[3], c.val[3])}}; return res; } // Helper function to calculate the minimum value of the input vector. All the elements in the output vector contain the min value. 
template -inline typename std::enable_if < std::is_same::value || std::is_same::value, - typename std::conditional::value, float32x2_t, int32x2_t>::type >::type - calculate_min(T in) +inline typename std::enable_if< + std::is_same::value || std::is_same::value, + typename std::conditional::value, float32x2_t, int32x2_t>::type>::type +calculate_min(T in) { auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in)); return wrapper::vpmin(pmin, pmin); @@ -141,9 +141,10 @@ inline typename std::enable_if < std::is_same::value || std::is_ // Helper function to calculate the minimum value of the input vector. All the elements in the output vector contain the min value. template -inline typename std::enable_if < std::is_same::value || std::is_same::value, - typename std::conditional::value, uint8x8_t, int8x8_t>::type >::type - calculate_min(T in) +inline typename std::enable_if< + std::is_same::value || std::is_same::value, + typename std::conditional::value, uint8x8_t, int8x8_t>::type>::type +calculate_min(T in) { auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in)); pmin = wrapper::vpmin(pmin, pmin); @@ -153,9 +154,10 @@ inline typename std::enable_if < std::is_same::value || std::is_s // Helper function to calculate the maximum value of the input vector. All the elements in the output vector contain the max value. template -inline typename std::enable_if < std::is_same::value || std::is_same::value, - typename std::conditional::value, float32x2_t, int32x2_t>::type >::type - calculate_max(T in) +inline typename std::enable_if< + std::is_same::value || std::is_same::value, + typename std::conditional::value, float32x2_t, int32x2_t>::type>::type +calculate_max(T in) { auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in)); return wrapper::vpmax(pmax, pmax); @@ -163,9 +165,10 @@ inline typename std::enable_if < std::is_same::value || std::is_ // Helper function to calculate the maximum value of the input vector. All the elements in the output vector contain the max value. 
template -inline typename std::enable_if < std::is_same::value || std::is_same::value, - typename std::conditional::value, uint8x8_t, int8x8_t>::type >::type - calculate_max(T in) +inline typename std::enable_if< + std::is_same::value || std::is_same::value, + typename std::conditional::value, uint8x8_t, int8x8_t>::type>::type +calculate_max(T in) { auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in)); pmax = wrapper::vpmax(pmax, pmax); @@ -176,10 +179,10 @@ inline typename std::enable_if < std::is_same::value || std::is_s template uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, T vec_res_value, ReductionOperation op) { - uint32x4_t res_idx_mask{ 0 }; + uint32x4_t res_idx_mask{0}; uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF); - if(op == ReductionOperation::ARG_IDX_MIN) + if (op == ReductionOperation::ARG_IDX_MIN) { auto pmin = calculate_min(vec_res_value); auto mask = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin)); @@ -203,10 +206,10 @@ uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, T vec_res_value, Reduc template uint32_t calculate_vector_index_quantized(uint32x4x4_t vec_res_idx, T vec_res_value, ReductionOperation op) { - uint32x4x4_t res_idx_mask{ { 0 } }; + uint32x4x4_t res_idx_mask{{0}}; uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF); - uint8x16_t mask_u8{ 0 }; - if(op == ReductionOperation::ARG_IDX_MIN) + uint8x16_t mask_u8{0}; + if (op == ReductionOperation::ARG_IDX_MIN) { auto pmin = calculate_min(vec_res_value); mask_u8 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin)); @@ -218,12 +221,18 @@ uint32_t calculate_vector_index_quantized(uint32x4x4_t vec_res_idx, T vec_res_va } // Widen vectors - auto wide_u16_1 = wrapper::vorr(vshll_n_u8(wrapper::vgetlow(mask_u8), 8), wrapper::vmovl(wrapper::vgetlow(mask_u8))); - auto wide_u16_2 = wrapper::vorr(vshll_n_u8(wrapper::vgethigh(mask_u8), 8), wrapper::vmovl(wrapper::vgethigh(mask_u8))); - auto wide_u32_1 = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_1), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_1))); - auto wide_u32_2 = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_1), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_1))); - auto wide_u32_3 = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_2), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_2))); - auto wide_u32_4 = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_2), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_2))); + auto wide_u16_1 = + wrapper::vorr(vshll_n_u8(wrapper::vgetlow(mask_u8), 8), wrapper::vmovl(wrapper::vgetlow(mask_u8))); + auto wide_u16_2 = + wrapper::vorr(vshll_n_u8(wrapper::vgethigh(mask_u8), 8), wrapper::vmovl(wrapper::vgethigh(mask_u8))); + auto wide_u32_1 = + wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_1), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_1))); + auto wide_u32_2 = + wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_1), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_1))); + auto wide_u32_3 = + wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_2), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_2))); + auto wide_u32_4 = + wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_2), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_2))); res_idx_mask.val[0] = wrapper::vand(vec_res_idx.val[0], wide_u32_1); res_idx_mask.val[1] = wrapper::vand(vec_res_idx.val[1], wide_u32_2); res_idx_mask.val[2] = wrapper::vand(vec_res_idx.val[2], wide_u32_3); @@ -241,19 +250,19 @@ uint32_t calculate_vector_index_quantized(uint32x4x4_t vec_res_idx, T vec_res_va pmin = 
wrapper::vpmin(pmin, pmin); res = std::min(wrapper::vgetlane(pmin, 0), res); iter++; - } - while(iter < 4); + } while (iter < 4); return (res - 0xFFFFFFFF); } #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC template <> -uint32x4x4_t calculate_index(uint32_t idx, float16x8_t a, float16x8_t b, uint32x4x4_t c, ReductionOperation op, int axis) +uint32x4x4_t +calculate_index(uint32_t idx, float16x8_t a, float16x8_t b, uint32x4x4_t c, ReductionOperation op, int axis) { - uint32x4x2_t mask{ 0 }; - uint16x8_t mask_u16{ 0 }; - if(op == ReductionOperation::ARG_IDX_MIN) + uint32x4x2_t mask{0}; + uint16x8_t mask_u16{0}; + if (op == ReductionOperation::ARG_IDX_MIN) { mask_u16 = wrapper::vcgt(b, a); } @@ -263,19 +272,14 @@ uint32x4x4_t calculate_index(uint32_t idx, float16x8_t a, float16x8_t b, uint32x } mask.val[0] = wrapper::vmovl(wrapper::vgetlow(mask_u16)); mask.val[1] = wrapper::vmovl(wrapper::vgethigh(mask_u16)); - uint32x4x2_t vec_idx = { { { idx + 0, idx + 1, idx + 2, idx + 3 }, - { idx + 4, idx + 5, idx + 6, idx + 7 } - } - }; - if(axis != 0) + uint32x4x2_t vec_idx = {{{idx + 0, idx + 1, idx + 2, idx + 3}, {idx + 4, idx + 5, idx + 6, idx + 7}}}; + if (axis != 0) { vec_idx.val[0] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); vec_idx.val[1] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{}); } - uint32x4x4_t res = { wrapper::vbsl(mask.val[0], vec_idx.val[0], c.val[0]), - wrapper::vbsl(mask.val[1], vec_idx.val[1], c.val[1]), - 0, 0 - }; + uint32x4x4_t res = {wrapper::vbsl(mask.val[0], vec_idx.val[0], c.val[0]), + wrapper::vbsl(mask.val[1], vec_idx.val[1], c.val[1]), 0, 0}; return res; } @@ -298,10 +302,10 @@ inline float16x4_t calculate_max(float16x8_t in) template <> uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, float16x8_t vec_res_value, ReductionOperation op) { - uint32x4x2_t res_idx_mask{ 0 }; + uint32x4x2_t res_idx_mask{0}; uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF); uint16x8_t mask_u16; - if(op == ReductionOperation::ARG_IDX_MIN) + if (op == ReductionOperation::ARG_IDX_MIN) { auto pmin = calculate_min(vec_res_value); mask_u16 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin)); @@ -313,8 +317,10 @@ uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, float16x8_t vec_res_va } // Widen vectors - auto wide_u32_1 = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(mask_u16), 8), wrapper::vmovl(wrapper::vgetlow(mask_u16))); - auto wide_u32_2 = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(mask_u16), 8), wrapper::vmovl(wrapper::vgethigh(mask_u16))); + auto wide_u32_1 = + wrapper::vorr(vshll_n_u16(wrapper::vgetlow(mask_u16), 8), wrapper::vmovl(wrapper::vgetlow(mask_u16))); + auto wide_u32_2 = + wrapper::vorr(vshll_n_u16(wrapper::vgethigh(mask_u16), 8), wrapper::vmovl(wrapper::vgethigh(mask_u16))); res_idx_mask.val[0] = wrapper::vand(vec_res_idx.val[0], wide_u32_1); res_idx_mask.val[1] = wrapper::vand(vec_res_idx.val[1], wide_u32_2); res_idx_mask.val[0] = wrapper::vadd(res_idx_mask.val[0], mask_ones); @@ -328,8 +334,7 @@ uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, float16x8_t vec_res_va pmin = wrapper::vpmin(pmin, pmin); res = std::min(wrapper::vgetlane(pmin, 0), res); iter++; - } - while(iter < 2); + } while (iter < 2); return (res - 0xFFFFFFFF); } @@ -388,7 +393,8 @@ struct RedOpX /** SIMD vector tag type. 
*/ using ExactTagType = typename wrapper::traits::neon_vector::tag_type; - inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, const ReductionOperation op) + inline void operator()( + const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, const ReductionOperation op) { const size_t input_dim_0 = in->info()->dimension(0); const int window_step_x = 16 / sizeof(T); @@ -402,211 +408,217 @@ struct RedOpX Iterator output(out, out_window); execute_window_loop( - in_win_no_pad, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - - auto init_res_value = static_cast(0.f); - switch(op) + in_win_no_pad, + [&](const Coordinates &) { - case ReductionOperation::ARG_IDX_MAX: - case ReductionOperation::ARG_IDX_MIN: - case ReductionOperation::MIN: - case ReductionOperation::MAX: - { - init_res_value = static_cast(*input_ptr); - break; - } - case ReductionOperation::PROD: - { - init_res_value = static_cast(1.f); - break; - } - default: - break; - } - auto vec_res_value = wrapper::vdup_n(init_res_value, ExactTagType{}); - uint32x4x4_t vec_res_idx{ { 0 } }; + const auto input_ptr = reinterpret_cast(input.ptr()); - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vec_elements = wrapper::vloadq(input_ptr + x); - switch(op) + auto init_res_value = static_cast(0.f); + switch (op) { - case ReductionOperation::SUM_SQUARE: - vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value); - break; - case ReductionOperation::MEAN_SUM: - case ReductionOperation::SUM: - vec_res_value = wrapper::vadd(vec_elements, vec_res_value); - break; - case ReductionOperation::PROD: - vec_res_value = wrapper::vmul(vec_elements, vec_res_value); - break; - case ReductionOperation::ARG_IDX_MIN: - { - auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); - vec_res_idx = calculate_index(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0); - vec_res_value = temp_vec_res_value; - break; - } case ReductionOperation::ARG_IDX_MAX: - { - auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); - vec_res_idx = calculate_index(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0); - vec_res_value = temp_vec_res_value; - break; - } + case ReductionOperation::ARG_IDX_MIN: case ReductionOperation::MIN: + case ReductionOperation::MAX: { - vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + init_res_value = static_cast(*input_ptr); break; } - case ReductionOperation::MAX: + case ReductionOperation::PROD: { - vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + init_res_value = static_cast(1.f); break; } default: - ARM_COMPUTE_ERROR("Not supported"); + break; } - } + auto vec_res_value = wrapper::vdup_n(init_res_value, ExactTagType{}); + uint32x4x4_t vec_res_idx{{0}}; - switch(op) - { - case ReductionOperation::SUM: - case ReductionOperation::MEAN_SUM: - case ReductionOperation::SUM_SQUARE: + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { -#ifdef ARM_COMPUTE_DEBUG_ENABLED - auto res = static_cast(0.f); - for(int i = 0; i < S; ++i) + const auto vec_elements = wrapper::vloadq(input_ptr + x); + switch (op) { - res += wrapper::vgetlane(vec_res_value, i); + case ReductionOperation::SUM_SQUARE: + vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), 
vec_res_value); + break; + case ReductionOperation::MEAN_SUM: + case ReductionOperation::SUM: + vec_res_value = wrapper::vadd(vec_elements, vec_res_value); + break; + case ReductionOperation::PROD: + vec_res_value = wrapper::vmul(vec_elements, vec_res_value); + break; + case ReductionOperation::ARG_IDX_MIN: + { + auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + vec_res_idx = calculate_index(x, temp_vec_res_value, vec_res_value, + vec_res_idx, op, 0); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + vec_res_idx = calculate_index(x, temp_vec_res_value, vec_res_value, + vec_res_idx, op, 0); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::MIN: + { + vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + break; + } + case ReductionOperation::MAX: + { + vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); } -#else // ARM_COMPUTE_DEBUG_ENABLED - auto carry_res = wrapper::vpadd(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value)); - for(int i = 0; i < S / 4; ++i) + } + + switch (op) + { + case ReductionOperation::SUM: + case ReductionOperation::MEAN_SUM: + case ReductionOperation::SUM_SQUARE: { - carry_res = wrapper::vpadd(carry_res, carry_res); - } - auto res = wrapper::vgetlane(carry_res, 0); +#ifdef ARM_COMPUTE_DEBUG_ENABLED + auto res = static_cast(0.f); + for (int i = 0; i < S; ++i) + { + res += wrapper::vgetlane(vec_res_value, i); + } +#else // ARM_COMPUTE_DEBUG_ENABLED + auto carry_res = + wrapper::vpadd(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value)); + for (int i = 0; i < S / 4; ++i) + { + carry_res = wrapper::vpadd(carry_res, carry_res); + } + auto res = wrapper::vgetlane(carry_res, 0); #endif // ARM_COMPUTE_DEBUG_ENABLED - if(op == ReductionOperation::SUM_SQUARE) - { - // Compute left-over elements - for(; x < window_end_x; ++x) + if (op == ReductionOperation::SUM_SQUARE) { - res += (*(input_ptr + x)) * (*(input_ptr + x)); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + res += (*(input_ptr + x)) * (*(input_ptr + x)); + } + } + else + { + // Compute left-over elements + for (; x < window_end_x; ++x) + { + res += *(input_ptr + x); + } + } + + if (op == ReductionOperation::MEAN_SUM) + { + res /= input_dim_0; } + + *(reinterpret_cast(output.ptr())) = res; + break; } - else + case ReductionOperation::PROD: { + auto carry_res = + wrapper::vmul(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value)); + T res = 1; + for (int i = 0; i < S / 2; ++i) + { + res *= wrapper::vgetlane(carry_res, i); + } + // Compute left-over elements - for(; x < window_end_x; ++x) + for (; x < window_end_x; ++x) { - res += *(input_ptr + x); + res *= *(input_ptr + x); } - } - if(op == ReductionOperation::MEAN_SUM) - { - res /= input_dim_0; + *(reinterpret_cast(output.ptr())) = res; + break; } - - *(reinterpret_cast(output.ptr())) = res; - break; - } - case ReductionOperation::PROD: - { - auto carry_res = wrapper::vmul(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value)); - T res = 1; - for(int i = 0; i < S / 2; ++i) + case ReductionOperation::ARG_IDX_MIN: { - res *= wrapper::vgetlane(carry_res, i); - } + auto idx = calculate_vector_index(vec_res_idx, vec_res_value, op); + auto res = static_cast(wrapper::vgetlane(calculate_min(vec_res_value), 0)); - // Compute left-over elements - for(; x < 
window_end_x; ++x) - { - res *= *(input_ptr + x); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + if (*(input_ptr + x) < res) + { + idx = x; + res = *(input_ptr + x); + } + } + *(reinterpret_cast(output.ptr())) = idx; + break; } - - *(reinterpret_cast(output.ptr())) = res; - break; - } - case ReductionOperation::ARG_IDX_MIN: - { - auto idx = calculate_vector_index(vec_res_idx, vec_res_value, op); - auto res = static_cast(wrapper::vgetlane(calculate_min(vec_res_value), 0)); - - // Compute left-over elements - for(; x < window_end_x; ++x) + case ReductionOperation::ARG_IDX_MAX: { - if(*(input_ptr + x) < res) + auto idx = calculate_vector_index(vec_res_idx, vec_res_value, op); + auto res = static_cast(wrapper::vgetlane(calculate_max(vec_res_value), 0)); + + // Compute left-over elements + for (; x < window_end_x; ++x) { - idx = x; - res = *(input_ptr + x); + if (*(input_ptr + x) > res) + { + idx = x; + res = *(input_ptr + x); + } } + *(reinterpret_cast(output.ptr())) = idx; + break; } - *(reinterpret_cast(output.ptr())) = idx; - break; - } - case ReductionOperation::ARG_IDX_MAX: - { - auto idx = calculate_vector_index(vec_res_idx, vec_res_value, op); - auto res = static_cast(wrapper::vgetlane(calculate_max(vec_res_value), 0)); - - // Compute left-over elements - for(; x < window_end_x; ++x) + case ReductionOperation::MIN: { - if(*(input_ptr + x) > res) + auto res = static_cast(wrapper::vgetlane(calculate_min(vec_res_value), 0)); + + // Compute left-over elements + for (; x < window_end_x; ++x) { - idx = x; - res = *(input_ptr + x); + res = *(input_ptr + x) < res ? *(input_ptr + x) : res; } + *(reinterpret_cast(output.ptr())) = res; + break; } - *(reinterpret_cast(output.ptr())) = idx; - break; - } - case ReductionOperation::MIN: - { - auto res = static_cast(wrapper::vgetlane(calculate_min(vec_res_value), 0)); - - // Compute left-over elements - for(; x < window_end_x; ++x) + case ReductionOperation::MAX: { - res = *(input_ptr + x) < res ? *(input_ptr + x) : res; - } - *(reinterpret_cast(output.ptr())) = res; - break; - } - case ReductionOperation::MAX: - { - auto res = static_cast(wrapper::vgetlane(calculate_max(vec_res_value), 0)); + auto res = static_cast(wrapper::vgetlane(calculate_max(vec_res_value), 0)); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - res = *(input_ptr + x) > res ? *(input_ptr + x) : res; + // Compute left-over elements + for (; x < window_end_x; ++x) + { + res = *(input_ptr + x) > res ? 
*(input_ptr + x) : res; + } + *(reinterpret_cast(output.ptr())) = res; + break; } - *(reinterpret_cast(output.ptr())) = res; - break; + default: + ARM_COMPUTE_ERROR("Not supported"); } - default: - ARM_COMPUTE_ERROR("Not supported"); - } - }, - input, output); + }, + input, output); } }; template struct RedOpX_quantized { - inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, const ReductionOperation op) + inline void operator()( + const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, const ReductionOperation op) { using PromotedType = typename wrapper::traits::promote::type>::type; @@ -637,246 +649,257 @@ struct RedOpX_quantized const float B = out_offset - (in_scale * in_offset) / (out_scale); execute_window_loop( - in_win_no_pad, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); + in_win_no_pad, + [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast(input.ptr()); + + auto vec_res_value1 = + wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); + auto vec_res_value2 = + wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); + auto vec_res_value3 = + wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); + auto vec_res_value4 = + wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); + + auto vec_res_value1_f = vdupq_n_f32(static_cast(1.f)); + auto vec_res_value2_f = vdupq_n_f32(static_cast(1.f)); + auto vec_res_value3_f = vdupq_n_f32(static_cast(1.f)); + auto vec_res_value4_f = vdupq_n_f32(static_cast(1.f)); + + typename wrapper::traits::neon_vector::type vec_res_value = {0}; + + if (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN || + op == ReductionOperation::MIN || op == ReductionOperation::MAX) + { + vec_res_value = wrapper::vdup_n(*input_ptr, wrapper::traits::vector_128_tag{}); + } - auto vec_res_value1 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); - auto vec_res_value2 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); - auto vec_res_value3 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); - auto vec_res_value4 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); + uint32x4x4_t vec_res_idx{{0}}; + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vec_elements = wrapper::vloadq(input_ptr + x); + switch (op) + { + case ReductionOperation::SUM: + case ReductionOperation::MEAN_SUM: + { + const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); + const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); - auto vec_res_value1_f = vdupq_n_f32(static_cast(1.f)); - auto vec_res_value2_f = vdupq_n_f32(static_cast(1.f)); - auto vec_res_value3_f = vdupq_n_f32(static_cast(1.f)); - auto vec_res_value4_f = vdupq_n_f32(static_cast(1.f)); + const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); + const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); + const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); + const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); - typename wrapper::traits::neon_vector::type vec_res_value = { 0 }; + vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1); + vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2); + vec_res_value3 = 
wrapper::vadd(temp32x4t_3, vec_res_value3); + vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4); + break; + } + case ReductionOperation::PROD: + { + const auto offset32x4f_4 = vdupq_n_f32(iq_info.offset); + const auto scale32x4f_4 = vdupq_n_f32(iq_info.scale); - if(op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::MIN || op == ReductionOperation::MAX) - { - vec_res_value = wrapper::vdup_n(*input_ptr, wrapper::traits::vector_128_tag{}); - } + const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); + const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); - uint32x4x4_t vec_res_idx{ { 0 } }; - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vec_elements = wrapper::vloadq(input_ptr + x); - switch(op) - { - case ReductionOperation::SUM: - case ReductionOperation::MEAN_SUM: - { - const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); - const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); - - const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); - const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); - const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); - const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); - - vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1); - vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2); - vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3); - vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4); - break; - } - case ReductionOperation::PROD: - { - const auto offset32x4f_4 = vdupq_n_f32(iq_info.offset); - const auto scale32x4f_4 = vdupq_n_f32(iq_info.scale); - - const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); - const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); - - const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); - const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); - const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); - const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); - - auto temp32x4f_1 = wrapper::vcvt(temp32x4t_1); - auto temp32x4f_2 = wrapper::vcvt(temp32x4t_2); - auto temp32x4f_3 = wrapper::vcvt(temp32x4t_3); - auto temp32x4f_4 = wrapper::vcvt(temp32x4t_4); - - //de-quantize vec_elements - temp32x4f_1 = vmulq_f32(vsubq_f32(temp32x4f_1, offset32x4f_4), scale32x4f_4); - temp32x4f_2 = vmulq_f32(vsubq_f32(temp32x4f_2, offset32x4f_4), scale32x4f_4); - temp32x4f_3 = vmulq_f32(vsubq_f32(temp32x4f_3, offset32x4f_4), scale32x4f_4); - temp32x4f_4 = vmulq_f32(vsubq_f32(temp32x4f_4, offset32x4f_4), scale32x4f_4); - - vec_res_value1_f = vmulq_f32(temp32x4f_1, vec_res_value1_f); - vec_res_value2_f = vmulq_f32(temp32x4f_2, vec_res_value2_f); - vec_res_value3_f = vmulq_f32(temp32x4f_3, vec_res_value3_f); - vec_res_value4_f = vmulq_f32(temp32x4f_4, vec_res_value4_f); - break; + const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); + const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); + const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); + const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); + + auto temp32x4f_1 = wrapper::vcvt(temp32x4t_1); + auto temp32x4f_2 = wrapper::vcvt(temp32x4t_2); + auto temp32x4f_3 = 
wrapper::vcvt(temp32x4t_3); + auto temp32x4f_4 = wrapper::vcvt(temp32x4t_4); + + //de-quantize vec_elements + temp32x4f_1 = vmulq_f32(vsubq_f32(temp32x4f_1, offset32x4f_4), scale32x4f_4); + temp32x4f_2 = vmulq_f32(vsubq_f32(temp32x4f_2, offset32x4f_4), scale32x4f_4); + temp32x4f_3 = vmulq_f32(vsubq_f32(temp32x4f_3, offset32x4f_4), scale32x4f_4); + temp32x4f_4 = vmulq_f32(vsubq_f32(temp32x4f_4, offset32x4f_4), scale32x4f_4); + + vec_res_value1_f = vmulq_f32(temp32x4f_1, vec_res_value1_f); + vec_res_value2_f = vmulq_f32(temp32x4f_2, vec_res_value2_f); + vec_res_value3_f = vmulq_f32(temp32x4f_3, vec_res_value3_f); + vec_res_value4_f = vmulq_f32(temp32x4f_4, vec_res_value4_f); + break; + } + case ReductionOperation::ARG_IDX_MIN: + { + auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + vec_res_idx = calculate_index_quantized( + x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + vec_res_idx = calculate_index_quantized( + x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::MIN: + { + vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + break; + } + case ReductionOperation::MAX: + { + vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); } + } + + switch (op) + { case ReductionOperation::ARG_IDX_MIN: { - auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); - vec_res_idx = calculate_index_quantized(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0); - vec_res_value = temp_vec_res_value; + auto idx = + calculate_vector_index_quantized(vec_res_idx, vec_res_value, op); + auto res = static_cast(wrapper::vgetlane(calculate_min(vec_res_value), 0)); + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + if (*(input_ptr + x) < res) + { + idx = x; + res = *(input_ptr + x); + } + } + *(reinterpret_cast(output.ptr())) = idx; break; } case ReductionOperation::ARG_IDX_MAX: { - auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); - vec_res_idx = calculate_index_quantized(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0); - vec_res_value = temp_vec_res_value; + auto idx = + calculate_vector_index_quantized(vec_res_idx, vec_res_value, op); + auto res = static_cast(wrapper::vgetlane(calculate_max(vec_res_value), 0)); + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + if (*(input_ptr + x) > res) + { + idx = x; + res = *(input_ptr + x); + } + } + *(reinterpret_cast(output.ptr())) = idx; break; } case ReductionOperation::MIN: { - vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + auto res = static_cast(wrapper::vgetlane(calculate_min(vec_res_value), 0)); + + // Compute left-over elements + for (; x < window_end_x; ++x) + { + res = *(input_ptr + x) < res ? 
*(input_ptr + x) : res; + } + *(reinterpret_cast(output.ptr())) = res; break; } case ReductionOperation::MAX: { - vec_res_value = wrapper::vmax(vec_elements, vec_res_value); - break; - } - default: - ARM_COMPUTE_ERROR("Not supported"); - } - } - - switch(op) - { - case ReductionOperation::ARG_IDX_MIN: - { - auto idx = calculate_vector_index_quantized(vec_res_idx, vec_res_value, op); - auto res = static_cast(wrapper::vgetlane(calculate_min(vec_res_value), 0)); - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - if(*(input_ptr + x) < res) - { - idx = x; - res = *(input_ptr + x); - } - } - *(reinterpret_cast(output.ptr())) = idx; - break; - } - case ReductionOperation::ARG_IDX_MAX: - { - auto idx = calculate_vector_index_quantized(vec_res_idx, vec_res_value, op); - auto res = static_cast(wrapper::vgetlane(calculate_max(vec_res_value), 0)); + auto res = static_cast(wrapper::vgetlane(calculate_max(vec_res_value), 0)); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - if(*(input_ptr + x) > res) + // Compute left-over elements + for (; x < window_end_x; ++x) { - idx = x; - res = *(input_ptr + x); + res = *(input_ptr + x) > res ? *(input_ptr + x) : res; } + *(reinterpret_cast(output.ptr())) = res; + break; } - *(reinterpret_cast(output.ptr())) = idx; - break; - } - case ReductionOperation::MIN: - { - auto res = static_cast(wrapper::vgetlane(calculate_min(vec_res_value), 0)); - - // Compute left-over elements - for(; x < window_end_x; ++x) + case ReductionOperation::PROD: { - res = *(input_ptr + x) < res ? *(input_ptr + x) : res; - } - *(reinterpret_cast(output.ptr())) = res; - break; - } - case ReductionOperation::MAX: - { - auto res = static_cast(wrapper::vgetlane(calculate_max(vec_res_value), 0)); + auto carry_res = wrapper::vmul(vec_res_value1_f, vec_res_value2_f); + carry_res = wrapper::vmul(carry_res, vec_res_value3_f); + carry_res = wrapper::vmul(carry_res, vec_res_value4_f); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - res = *(input_ptr + x) > res ? 
*(input_ptr + x) : res; - } - *(reinterpret_cast(output.ptr())) = res; - break; - } - case ReductionOperation::PROD: - { - auto carry_res = wrapper::vmul(vec_res_value1_f, vec_res_value2_f); - carry_res = wrapper::vmul(carry_res, vec_res_value3_f); - carry_res = wrapper::vmul(carry_res, vec_res_value4_f); + float res = wrapper::vgetlane(carry_res, 0); + res *= wrapper::vgetlane(carry_res, 1); + res *= wrapper::vgetlane(carry_res, 2); + res *= wrapper::vgetlane(carry_res, 3); - float res = wrapper::vgetlane(carry_res, 0); - res *= wrapper::vgetlane(carry_res, 1); - res *= wrapper::vgetlane(carry_res, 2); - res *= wrapper::vgetlane(carry_res, 3); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + //de-quantize input + if (std::is_same::value) + { + res *= dequantize_qasymm8(*(input_ptr + x), iq_info); + } + else + { + res *= dequantize_qasymm8_signed(*(input_ptr + x), iq_info); + } + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - //de-quantize input - if(std::is_same::value) + //re-quantize result + if (std::is_same::value) { - res *= dequantize_qasymm8(*(input_ptr + x), iq_info); + res = quantize_qasymm8(res, iq_info); } else { - res *= dequantize_qasymm8_signed(*(input_ptr + x), iq_info); + res = quantize_qasymm8_signed(res, iq_info); } - } - //re-quantize result - if(std::is_same::value) - { - res = quantize_qasymm8(res, iq_info); + *reinterpret_cast(output.ptr()) = static_cast(res); + break; } - else + case ReductionOperation::SUM: + case ReductionOperation::MEAN_SUM: { - res = quantize_qasymm8_signed(res, iq_info); - } + auto carry_res = wrapper::vadd(vec_res_value1, vec_res_value2); + carry_res = wrapper::vadd(carry_res, vec_res_value3); + carry_res = wrapper::vadd(carry_res, vec_res_value4); - *reinterpret_cast(output.ptr()) = static_cast(res); - break; - } - case ReductionOperation::SUM: - case ReductionOperation::MEAN_SUM: - { - auto carry_res = wrapper::vadd(vec_res_value1, vec_res_value2); - carry_res = wrapper::vadd(carry_res, vec_res_value3); - carry_res = wrapper::vadd(carry_res, vec_res_value4); + auto carry_paddition = + wrapper::vpadd(wrapper::vgethigh(carry_res), wrapper::vgetlow(carry_res)); + carry_paddition = wrapper::vpadd(carry_paddition, carry_paddition); + auto res = static_cast(wrapper::vgetlane(carry_paddition, 0)); - auto carry_paddition = wrapper::vpadd(wrapper::vgethigh(carry_res), wrapper::vgetlow(carry_res)); - carry_paddition = wrapper::vpadd(carry_paddition, carry_paddition); - auto res = static_cast(wrapper::vgetlane(carry_paddition, 0)); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + res += *(input_ptr + x); + } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - res += *(input_ptr + x); - } + if (op == ReductionOperation::MEAN_SUM) + { + const int32_t resFinal = A * (static_cast(res)) + B; - if(op == ReductionOperation::MEAN_SUM) - { - const int32_t resFinal = A * (static_cast(res)) + B; + *reinterpret_cast(output.ptr()) = utils::cast::saturate_cast(resFinal); + } + else + { + // Subtract accumulated offsets + res -= (in_info.dimension(0) - 1) * iq_info.offset; + *reinterpret_cast(output.ptr()) = utils::cast::saturate_cast(res); + } - *reinterpret_cast(output.ptr()) = utils::cast::saturate_cast(resFinal); - } - else - { - // Subtract accumulated offsets - res -= (in_info.dimension(0) - 1) * iq_info.offset; - *reinterpret_cast(output.ptr()) = utils::cast::saturate_cast(res); + break; } - - break; + default: + ARM_COMPUTE_ERROR("Not supported"); } - default: - 
ARM_COMPUTE_ERROR("Not supported"); - } - }, - input, output); + }, + input, output); } }; @@ -887,7 +910,12 @@ struct RedOpYZW using ExactTagType = typename wrapper::traits::neon_vector::tag_type; using neon_vector = typename wrapper::traits::neon_vector::type; - inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, int axis, const ReductionOperation op) + inline void operator()(const Window &in_window, + Window &out_window, + const ITensor *in, + ITensor *out, + int axis, + const ReductionOperation op) { const TensorInfo in_info = *(in->info()); const int window_step_x = 16 / sizeof(T); @@ -900,203 +928,210 @@ struct RedOpYZW Window in_win_no_pad = in_window; in_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x())); Window out_win_no_pad = out_window; - out_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x())); + out_win_no_pad.set(Window::DimX, + Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x())); Iterator input(in, in_win_no_pad); Iterator output(out, out_win_no_pad); execute_window_loop( - in_win_no_pad, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + in_win_no_pad, + [&](const Coordinates &) { - neon_vector vec_res_value = { 0 }; - switch(op) - { - case ReductionOperation::ARG_IDX_MAX: - case ReductionOperation::ARG_IDX_MIN: - case ReductionOperation::MIN: - case ReductionOperation::MAX: - { - vec_res_value = wrapper::vloadq(input_ptr + x); - break; - } - case ReductionOperation::PROD: - { - vec_res_value = wrapper::vdup_n(static_cast(1.f), ExactTagType{}); - break; - } - default: - { - vec_res_value = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); - break; - } - } - uint32x4x4_t vec_res_idx{ { 0 } }; + const auto input_ptr = reinterpret_cast(input.ptr()); - for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - const T *in_ptr = reinterpret_cast(input.ptr() + x * sizeof(T) + in_info.strides_in_bytes()[axis] * dim); - const auto vec_elements = wrapper::vloadq(in_ptr); - switch(op) + neon_vector vec_res_value = {0}; + switch (op) { - case ReductionOperation::SUM: - case ReductionOperation::MEAN_SUM: - vec_res_value = wrapper::vadd(vec_elements, vec_res_value); - break; - case ReductionOperation::SUM_SQUARE: - vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value); - break; - case ReductionOperation::PROD: - vec_res_value = wrapper::vmul(vec_elements, vec_res_value); - break; - case ReductionOperation::ARG_IDX_MIN: - { - auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); - vec_res_idx = calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis); - vec_res_value = temp_vec_res_value; - break; - } case ReductionOperation::ARG_IDX_MAX: + case ReductionOperation::ARG_IDX_MIN: + case ReductionOperation::MIN: + case ReductionOperation::MAX: { - auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); - vec_res_idx = calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis); - vec_res_value = temp_vec_res_value; + vec_res_value = wrapper::vloadq(input_ptr + x); break; } 
- case ReductionOperation::MIN: + case ReductionOperation::PROD: { - vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + vec_res_value = wrapper::vdup_n(static_cast(1.f), ExactTagType{}); break; } - case ReductionOperation::MAX: + default: { - vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + vec_res_value = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); break; } - default: - ARM_COMPUTE_ERROR("Not supported"); } - } + uint32x4x4_t vec_res_idx{{0}}; - if(op == ReductionOperation::MEAN_SUM) - { - auto vec_width_inv = wrapper::vinv(wrapper::vdup_n(static_cast(in_info.dimension(axis)), ExactTagType{})); - vec_res_value = wrapper::vmul(vec_res_value, vec_width_inv); - } - - if(op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX) - { - wrapper::vstore(reinterpret_cast(output.ptr()) + x, vec_res_idx.val[0]); -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - if(std::is_same::value) + for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) { - wrapper::vstore(reinterpret_cast(output.ptr()) + x + 4, vec_res_idx.val[1]); + const T *in_ptr = + reinterpret_cast(input.ptr() + x * sizeof(T) + in_info.strides_in_bytes()[axis] * dim); + const auto vec_elements = wrapper::vloadq(in_ptr); + switch (op) + { + case ReductionOperation::SUM: + case ReductionOperation::MEAN_SUM: + vec_res_value = wrapper::vadd(vec_elements, vec_res_value); + break; + case ReductionOperation::SUM_SQUARE: + vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value); + break; + case ReductionOperation::PROD: + vec_res_value = wrapper::vmul(vec_elements, vec_res_value); + break; + case ReductionOperation::ARG_IDX_MIN: + { + auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + vec_res_idx = + calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + vec_res_idx = + calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::MIN: + { + vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + break; + } + case ReductionOperation::MAX: + { + vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } } -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - } - else - { - wrapper::vstore(reinterpret_cast(output.ptr() + x * sizeof(T)), vec_res_value); - } - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - auto res_value = 0.f; - switch(op) - { - case ReductionOperation::ARG_IDX_MAX: - case ReductionOperation::ARG_IDX_MIN: - case ReductionOperation::MIN: - case ReductionOperation::MAX: + if (op == ReductionOperation::MEAN_SUM) { - res_value = *(input_ptr + x); - break; + auto vec_width_inv = + wrapper::vinv(wrapper::vdup_n(static_cast(in_info.dimension(axis)), ExactTagType{})); + vec_res_value = wrapper::vmul(vec_res_value, vec_width_inv); } - case ReductionOperation::PROD: + + if (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX) { - res_value = static_cast(1.f); - break; + wrapper::vstore(reinterpret_cast(output.ptr()) + x, vec_res_idx.val[0]); +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + if (std::is_same::value) + { + wrapper::vstore(reinterpret_cast(output.ptr()) + x + 4, vec_res_idx.val[1]); + } +#endif // 
__ARM_FEATURE_FP16_VECTOR_ARITHMETIC } - default: + else { - res_value = static_cast(0.f); - break; + wrapper::vstore(reinterpret_cast(output.ptr() + x * sizeof(T)), vec_res_value); } } - uint32_t res_idx = 0; - for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) + // Compute left-over elements + for (; x < window_end_x; ++x) { - const T *in_ptr = reinterpret_cast(input.ptr() + x * sizeof(T) + in_info.strides_in_bytes()[axis] * dim); - - switch(op) + auto res_value = 0.f; + switch (op) { - case ReductionOperation::SUM: - case ReductionOperation::MEAN_SUM: - res_value += *in_ptr; - break; - case ReductionOperation::SUM_SQUARE: - res_value += *in_ptr * *in_ptr; - break; - case ReductionOperation::PROD: - res_value *= *in_ptr; - break; + case ReductionOperation::ARG_IDX_MAX: case ReductionOperation::ARG_IDX_MIN: + case ReductionOperation::MIN: + case ReductionOperation::MAX: { - if(*in_ptr < res_value) - { - res_value = *in_ptr; - res_idx = dim; - } + res_value = *(input_ptr + x); break; } - case ReductionOperation::ARG_IDX_MAX: + case ReductionOperation::PROD: { - if(*in_ptr > res_value) - { - res_value = *in_ptr; - res_idx = dim; - } + res_value = static_cast(1.f); break; } - case ReductionOperation::MIN: + default: { - res_value = *in_ptr < res_value ? *in_ptr : res_value; + res_value = static_cast(0.f); break; } - case ReductionOperation::MAX: + } + + uint32_t res_idx = 0; + for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) + { + const T *in_ptr = + reinterpret_cast(input.ptr() + x * sizeof(T) + in_info.strides_in_bytes()[axis] * dim); + + switch (op) { - res_value = *in_ptr > res_value ? *in_ptr : res_value; - break; + case ReductionOperation::SUM: + case ReductionOperation::MEAN_SUM: + res_value += *in_ptr; + break; + case ReductionOperation::SUM_SQUARE: + res_value += *in_ptr * *in_ptr; + break; + case ReductionOperation::PROD: + res_value *= *in_ptr; + break; + case ReductionOperation::ARG_IDX_MIN: + { + if (*in_ptr < res_value) + { + res_value = *in_ptr; + res_idx = dim; + } + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + if (*in_ptr > res_value) + { + res_value = *in_ptr; + res_idx = dim; + } + break; + } + case ReductionOperation::MIN: + { + res_value = *in_ptr < res_value ? *in_ptr : res_value; + break; + } + case ReductionOperation::MAX: + { + res_value = *in_ptr > res_value ? 
*in_ptr : res_value; + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); } - default: - ARM_COMPUTE_ERROR("Not supported"); } - } - if(op == ReductionOperation::MEAN_SUM) - { - res_value /= in_info.dimension(axis); - } + if (op == ReductionOperation::MEAN_SUM) + { + res_value /= in_info.dimension(axis); + } - if(op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX) - { - *(reinterpret_cast(output.ptr()) + x) = res_idx; - } - else - { - *(reinterpret_cast(output.ptr() + x * sizeof(T))) = res_value; + if (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX) + { + *(reinterpret_cast(output.ptr()) + x) = res_idx; + } + else + { + *(reinterpret_cast(output.ptr() + x * sizeof(T))) = res_value; + } } - } - }, - input, output); + }, + input, output); } }; @@ -1107,7 +1142,8 @@ struct RedOpYZW_complex using ExactTagType = typename wrapper::traits::neon_vector::tag_type; using neon_vector = typename wrapper::traits::neon_vector::type; - inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, int, const ReductionOperation) + inline void operator()( + const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, int, const ReductionOperation) { ARM_COMPUTE_ERROR_ON(axis != 2); ARM_COMPUTE_ERROR_ON(op != ReductionOperation::SUM); @@ -1124,70 +1160,77 @@ struct RedOpYZW_complex Window in_win_no_pad = in_window; in_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x())); Window out_win_no_pad = out_window; - out_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x())); + out_win_no_pad.set(Window::DimX, + Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x())); Iterator input(in, in_win_no_pad); Iterator output(out, out_win_no_pad); execute_window_loop( - in_win_no_pad, [&](const Coordinates &) - { - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + in_win_no_pad, + [&](const Coordinates &) { - neon_vector vec_res_value_0 = { 0 }; - neon_vector vec_res_value_1 = { 0 }; - - vec_res_value_0 = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); - vec_res_value_1 = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); - - T *out_ptr = reinterpret_cast(output.ptr() + 2 * x * sizeof(T)); - for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - T *in_ptr_0 = reinterpret_cast(input.ptr() + 2 * x * sizeof(T) + stride_z * dim); - T *in_ptr_1 = reinterpret_cast(input.ptr() + 2 * x * sizeof(T) + 16 + stride_z * dim); + neon_vector vec_res_value_0 = {0}; + neon_vector vec_res_value_1 = {0}; - const auto vec_elements_0 = wrapper::vloadq(in_ptr_0); - const auto vec_elements_1 = wrapper::vloadq(in_ptr_1); + vec_res_value_0 = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); + vec_res_value_1 = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); - vec_res_value_0 = wrapper::vadd(vec_elements_0, vec_res_value_0); - vec_res_value_1 = wrapper::vadd(vec_elements_1, vec_res_value_1); - } + T *out_ptr = reinterpret_cast(output.ptr() + 2 * x * sizeof(T)); + for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) + { + T *in_ptr_0 = reinterpret_cast(input.ptr() + 2 * x * sizeof(T) + stride_z * dim); + T *in_ptr_1 = 
reinterpret_cast(input.ptr() + 2 * x * sizeof(T) + 16 + stride_z * dim); - wrapper::vstore(out_ptr, vec_res_value_0); - wrapper::vstore(out_ptr + 4, vec_res_value_1); - } + const auto vec_elements_0 = wrapper::vloadq(in_ptr_0); + const auto vec_elements_1 = wrapper::vloadq(in_ptr_1); - // Compute left-over elements - for(; x < window_end_x; ++x) - { - auto res_value_0 = 0.f; - auto res_value_1 = 0.f; + vec_res_value_0 = wrapper::vadd(vec_elements_0, vec_res_value_0); + vec_res_value_1 = wrapper::vadd(vec_elements_1, vec_res_value_1); + } - T *out_ptr = reinterpret_cast(output.ptr() + 2 * x * sizeof(T)); - for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) + wrapper::vstore(out_ptr, vec_res_value_0); + wrapper::vstore(out_ptr + 4, vec_res_value_1); + } + + // Compute left-over elements + for (; x < window_end_x; ++x) { - T *in_ptr = reinterpret_cast(input.ptr() + 2 * x * sizeof(T) + stride_z * dim); - res_value_0 += *in_ptr; - res_value_1 += *(in_ptr + 1); + auto res_value_0 = 0.f; + auto res_value_1 = 0.f; + + T *out_ptr = reinterpret_cast(output.ptr() + 2 * x * sizeof(T)); + for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) + { + T *in_ptr = reinterpret_cast(input.ptr() + 2 * x * sizeof(T) + stride_z * dim); + res_value_0 += *in_ptr; + res_value_1 += *(in_ptr + 1); + } + *out_ptr = res_value_0; + *(out_ptr + 1) = res_value_1; } - *out_ptr = res_value_0; - *(out_ptr + 1) = res_value_1; - } - }, - input, output); + }, + input, output); } }; template struct RedOpYZW_quantized { - inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, int axis, const ReductionOperation op) + inline void operator()(const Window &in_window, + Window &out_window, + const ITensor *in, + ITensor *out, + int axis, + const ReductionOperation op) { const TensorInfo in_info = *(in->info()); const UniformQuantizationInfo iq_info = in_info.quantization_info().uniform(); - using PromotedType = typename wrapper::traits::promote::type>::type; + using PromotedType = typename wrapper::traits::promote::type>::type; const auto oq_info = out->info()->quantization_info().uniform(); @@ -1201,12 +1244,14 @@ struct RedOpYZW_quantized Window in_win_no_pad = in_window; in_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x())); Window out_win_no_pad = out_window; - out_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x())); + out_win_no_pad.set(Window::DimX, + Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x())); Iterator input(in, in_win_no_pad); Iterator output(out, out_win_no_pad); - using vector_type = typename wrapper::traits::neon_bitvector::type; + using vector_type = + typename wrapper::traits::neon_bitvector::type; using vector_type_f = typename wrapper::traits::neon_vector::type; vector_type vec_res_value1{}; @@ -1234,362 +1279,384 @@ struct RedOpYZW_quantized const auto vec_B = wrapper::vdup_n(static_cast(B), wrapper::traits::vector_128_tag{}); execute_window_loop( - in_win_no_pad, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - - // Compute window_step_x elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + in_win_no_pad, + [&](const Coordinates &) { - uint32x4x4_t vec_res_idx{ { 0 } }; - vec_res_value1 = wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}); - vec_res_value2 = 
wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}); - vec_res_value3 = wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}); - vec_res_value4 = wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}); + const auto input_ptr = reinterpret_cast(input.ptr()); + + // Compute window_step_x elements per iteration + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) + { + uint32x4x4_t vec_res_idx{{0}}; + vec_res_value1 = wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}); + vec_res_value2 = wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}); + vec_res_value3 = wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}); + vec_res_value4 = wrapper::vdup_n(static_cast(0), wrapper::traits::vector_128_tag{}); - vec_res_value1_f = wrapper::vdup_n(static_cast(1), wrapper::traits::vector_128_tag{}); - vec_res_value2_f = wrapper::vdup_n(static_cast(1), wrapper::traits::vector_128_tag{}); - vec_res_value3_f = wrapper::vdup_n(static_cast(1), wrapper::traits::vector_128_tag{}); - vec_res_value4_f = wrapper::vdup_n(static_cast(1), wrapper::traits::vector_128_tag{}); + vec_res_value1_f = wrapper::vdup_n(static_cast(1), wrapper::traits::vector_128_tag{}); + vec_res_value2_f = wrapper::vdup_n(static_cast(1), wrapper::traits::vector_128_tag{}); + vec_res_value3_f = wrapper::vdup_n(static_cast(1), wrapper::traits::vector_128_tag{}); + vec_res_value4_f = wrapper::vdup_n(static_cast(1), wrapper::traits::vector_128_tag{}); - auto vec_res_value = wrapper::vloadq(input_ptr + x); + auto vec_res_value = wrapper::vloadq(input_ptr + x); - for(unsigned int index_dim = 0; index_dim < in_info.dimension(axis); ++index_dim) - { - const T *in_ptr = input_ptr + x + in_info.strides_in_bytes()[axis] * index_dim; - const auto vec_elements = wrapper::vloadq(in_ptr); - switch(op) + for (unsigned int index_dim = 0; index_dim < in_info.dimension(axis); ++index_dim) { - case ReductionOperation::SUM: - case ReductionOperation::MEAN_SUM: + const T *in_ptr = input_ptr + x + in_info.strides_in_bytes()[axis] * index_dim; + const auto vec_elements = wrapper::vloadq(in_ptr); + switch (op) { - const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); - const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); - - const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); - const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); - const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); - const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); - - vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1); - vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2); - vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3); - vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4); - break; + case ReductionOperation::SUM: + case ReductionOperation::MEAN_SUM: + { + const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); + const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); + + const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); + const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); + const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); + const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); + + vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1); + vec_res_value2 = wrapper::vadd(temp32x4t_2, 
vec_res_value2); + vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3); + vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4); + break; + } + case ReductionOperation::PROD: + { + const auto offset32x4f_4 = wrapper::vdup_n(static_cast(iq_info.offset), + wrapper::traits::vector_128_tag{}); + const auto scale32x4f_4 = + wrapper::vdup_n(iq_info.scale, wrapper::traits::vector_128_tag{}); + + const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); + const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); + + const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); + const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); + const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); + const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); + + auto temp32x4f_1 = wrapper::vcvt(temp32x4t_1); + auto temp32x4f_2 = wrapper::vcvt(temp32x4t_2); + auto temp32x4f_3 = wrapper::vcvt(temp32x4t_3); + auto temp32x4f_4 = wrapper::vcvt(temp32x4t_4); + + //de-quantize vec_elements + temp32x4f_1 = wrapper::vmul(wrapper::vsub(temp32x4f_1, offset32x4f_4), scale32x4f_4); + temp32x4f_2 = wrapper::vmul(wrapper::vsub(temp32x4f_2, offset32x4f_4), scale32x4f_4); + temp32x4f_3 = wrapper::vmul(wrapper::vsub(temp32x4f_3, offset32x4f_4), scale32x4f_4); + temp32x4f_4 = wrapper::vmul(wrapper::vsub(temp32x4f_4, offset32x4f_4), scale32x4f_4); + + vec_res_value1_f = wrapper::vmul(temp32x4f_1, vec_res_value1_f); + vec_res_value2_f = wrapper::vmul(temp32x4f_2, vec_res_value2_f); + vec_res_value3_f = wrapper::vmul(temp32x4f_3, vec_res_value3_f); + vec_res_value4_f = wrapper::vmul(temp32x4f_4, vec_res_value4_f); + break; + } + case ReductionOperation::ARG_IDX_MIN: + { + auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + vec_res_idx = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value, + vec_res_idx, op, axis); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + vec_res_idx = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value, + vec_res_idx, op, axis); + vec_res_value = temp_vec_res_value; + break; + } + case ReductionOperation::MIN: + { + vec_res_value = wrapper::vmin(vec_elements, vec_res_value); + break; + } + case ReductionOperation::MAX: + { + vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); } - case ReductionOperation::PROD: - { - const auto offset32x4f_4 = wrapper::vdup_n(static_cast(iq_info.offset), wrapper::traits::vector_128_tag{}); - const auto scale32x4f_4 = wrapper::vdup_n(iq_info.scale, wrapper::traits::vector_128_tag{}); - - const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements)); - const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements)); - - const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1)); - const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1)); - const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2)); - const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2)); - - auto temp32x4f_1 = wrapper::vcvt(temp32x4t_1); - auto temp32x4f_2 = wrapper::vcvt(temp32x4t_2); - auto temp32x4f_3 = wrapper::vcvt(temp32x4t_3); - auto temp32x4f_4 = wrapper::vcvt(temp32x4t_4); + } - //de-quantize vec_elements - temp32x4f_1 = wrapper::vmul(wrapper::vsub(temp32x4f_1, offset32x4f_4), 
scale32x4f_4); - temp32x4f_2 = wrapper::vmul(wrapper::vsub(temp32x4f_2, offset32x4f_4), scale32x4f_4); - temp32x4f_3 = wrapper::vmul(wrapper::vsub(temp32x4f_3, offset32x4f_4), scale32x4f_4); - temp32x4f_4 = wrapper::vmul(wrapper::vsub(temp32x4f_4, offset32x4f_4), scale32x4f_4); - - vec_res_value1_f = wrapper::vmul(temp32x4f_1, vec_res_value1_f); - vec_res_value2_f = wrapper::vmul(temp32x4f_2, vec_res_value2_f); - vec_res_value3_f = wrapper::vmul(temp32x4f_3, vec_res_value3_f); - vec_res_value4_f = wrapper::vmul(temp32x4f_4, vec_res_value4_f); - break; - } + switch (op) + { case ReductionOperation::ARG_IDX_MIN: - { - auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value); - vec_res_idx = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis); - vec_res_value = temp_vec_res_value; - break; - } case ReductionOperation::ARG_IDX_MAX: { - auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value); - vec_res_idx = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis); - vec_res_value = temp_vec_res_value; + wrapper::vstore(reinterpret_cast(output.ptr() + 4 * x), vec_res_idx.val[0]); + wrapper::vstore(reinterpret_cast(output.ptr() + 4 * x) + 4, vec_res_idx.val[1]); + wrapper::vstore(reinterpret_cast(output.ptr() + 4 * x) + 8, vec_res_idx.val[2]); + wrapper::vstore(reinterpret_cast(output.ptr() + 4 * x) + 12, + vec_res_idx.val[3]); break; } case ReductionOperation::MIN: - { - vec_res_value = wrapper::vmin(vec_elements, vec_res_value); - break; - } case ReductionOperation::MAX: { - vec_res_value = wrapper::vmax(vec_elements, vec_res_value); + wrapper::vstore(reinterpret_cast(output.ptr() + x), vec_res_value); break; } - default: - ARM_COMPUTE_ERROR("Not supported"); - } - } - - switch(op) - { - case ReductionOperation::ARG_IDX_MIN: - case ReductionOperation::ARG_IDX_MAX: - { - wrapper::vstore(reinterpret_cast(output.ptr() + 4 * x), vec_res_idx.val[0]); - wrapper::vstore(reinterpret_cast(output.ptr() + 4 * x) + 4, vec_res_idx.val[1]); - wrapper::vstore(reinterpret_cast(output.ptr() + 4 * x) + 8, vec_res_idx.val[2]); - wrapper::vstore(reinterpret_cast(output.ptr() + 4 * x) + 12, vec_res_idx.val[3]); - break; - } - case ReductionOperation::MIN: - case ReductionOperation::MAX: - { - wrapper::vstore(reinterpret_cast(output.ptr() + x), vec_res_value); - break; - } - case ReductionOperation::SUM: - { - // Subtract offsets - auto offsets = vdupq_n_s32((in_info.dimension(axis) - 1) * iq_info.offset); + case ReductionOperation::SUM: + { + // Subtract offsets + auto offsets = vdupq_n_s32((in_info.dimension(axis) - 1) * iq_info.offset); - auto vec_res_s_value1 = wrapper::vreinterpret(vec_res_value1); - auto vec_res_s_value2 = wrapper::vreinterpret(vec_res_value2); - auto vec_res_s_value3 = wrapper::vreinterpret(vec_res_value3); - auto vec_res_s_value4 = wrapper::vreinterpret(vec_res_value4); + auto vec_res_s_value1 = wrapper::vreinterpret(vec_res_value1); + auto vec_res_s_value2 = wrapper::vreinterpret(vec_res_value2); + auto vec_res_s_value3 = wrapper::vreinterpret(vec_res_value3); + auto vec_res_s_value4 = wrapper::vreinterpret(vec_res_value4); - vec_res_s_value1 = wrapper::vsub(vec_res_s_value1, offsets); - vec_res_s_value2 = wrapper::vsub(vec_res_s_value2, offsets); - vec_res_s_value3 = wrapper::vsub(vec_res_s_value3, offsets); - vec_res_s_value4 = wrapper::vsub(vec_res_s_value4, offsets); + vec_res_s_value1 = wrapper::vsub(vec_res_s_value1, offsets); + vec_res_s_value2 = wrapper::vsub(vec_res_s_value2, 
offsets); + vec_res_s_value3 = wrapper::vsub(vec_res_s_value3, offsets); + vec_res_s_value4 = wrapper::vsub(vec_res_s_value4, offsets); - const auto temp16x8t_1 = wrapper::vcombine(wrapper::vqmovn(vec_res_s_value1), wrapper::vqmovn(vec_res_s_value2)); - const auto temp16x8t_2 = wrapper::vcombine(wrapper::vqmovn(vec_res_s_value3), wrapper::vqmovn(vec_res_s_value4)); + const auto temp16x8t_1 = + wrapper::vcombine(wrapper::vqmovn(vec_res_s_value1), wrapper::vqmovn(vec_res_s_value2)); + const auto temp16x8t_2 = + wrapper::vcombine(wrapper::vqmovn(vec_res_s_value3), wrapper::vqmovn(vec_res_s_value4)); - combine_and_store(temp16x8t_1, temp16x8t_2, output, x); - break; - } - case ReductionOperation::MEAN_SUM: - { - vec_res_value1_f = wrapper::vmla(vec_B, wrapper::vcvt(vec_res_value1), vec_A); - vec_res_value2_f = wrapper::vmla(vec_B, wrapper::vcvt(vec_res_value2), vec_A); - vec_res_value3_f = wrapper::vmla(vec_B, wrapper::vcvt(vec_res_value3), vec_A); - vec_res_value4_f = wrapper::vmla(vec_B, wrapper::vcvt(vec_res_value4), vec_A); + combine_and_store(temp16x8t_1, temp16x8t_2, output, x); + break; + } + case ReductionOperation::MEAN_SUM: + { + vec_res_value1_f = wrapper::vmla(vec_B, wrapper::vcvt(vec_res_value1), vec_A); + vec_res_value2_f = wrapper::vmla(vec_B, wrapper::vcvt(vec_res_value2), vec_A); + vec_res_value3_f = wrapper::vmla(vec_B, wrapper::vcvt(vec_res_value3), vec_A); + vec_res_value4_f = wrapper::vmla(vec_B, wrapper::vcvt(vec_res_value4), vec_A); #ifdef __aarch64__ - vec_res_value1 = wrapper::vcvta(vec_res_value1_f); - vec_res_value2 = wrapper::vcvta(vec_res_value2_f); - vec_res_value3 = wrapper::vcvta(vec_res_value3_f); - vec_res_value4 = wrapper::vcvta(vec_res_value4_f); + vec_res_value1 = wrapper::vcvta(vec_res_value1_f); + vec_res_value2 = wrapper::vcvta(vec_res_value2_f); + vec_res_value3 = wrapper::vcvta(vec_res_value3_f); + vec_res_value4 = wrapper::vcvta(vec_res_value4_f); #else // defined(__aarch64__) - vec_res_value1 = wrapper::vcvt(vec_res_value1_f); - vec_res_value2 = wrapper::vcvt(vec_res_value2_f); - vec_res_value3 = wrapper::vcvt(vec_res_value3_f); - vec_res_value4 = wrapper::vcvt(vec_res_value4_f); + vec_res_value1 = wrapper::vcvt(vec_res_value1_f); + vec_res_value2 = wrapper::vcvt(vec_res_value2_f); + vec_res_value3 = wrapper::vcvt(vec_res_value3_f); + vec_res_value4 = wrapper::vcvt(vec_res_value4_f); #endif // __aarch64__ - const auto temp16x8t_1 = wrapper::vcombine(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2)); - const auto temp16x8t_2 = wrapper::vcombine(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4)); - auto res = wrapper::vcombine(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2)); + const auto temp16x8t_1 = + wrapper::vcombine(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2)); + const auto temp16x8t_2 = + wrapper::vcombine(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4)); + auto res = wrapper::vcombine(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2)); - wrapper::vstore(reinterpret_cast(output.ptr() + x), res); - break; - } - case ReductionOperation::PROD: - { - const auto offset32x4f_4 = wrapper::vdup_n(static_cast(iq_info.offset), wrapper::traits::vector_128_tag{}); - const auto iscale32x4f_4 = vinvq_f32(vdupq_n_f32(iq_info.scale)); - - //re-quantize - vec_res_value1_f = wrapper::vadd(wrapper::vmul(vec_res_value1_f, iscale32x4f_4), offset32x4f_4); - vec_res_value2_f = wrapper::vadd(wrapper::vmul(vec_res_value2_f, iscale32x4f_4), offset32x4f_4); - vec_res_value3_f = 
wrapper::vadd(wrapper::vmul(vec_res_value3_f, iscale32x4f_4), offset32x4f_4); - vec_res_value4_f = wrapper::vadd(wrapper::vmul(vec_res_value4_f, iscale32x4f_4), offset32x4f_4); - - vec_res_value1 = wrapper::vcvt(vec_res_value1_f); - vec_res_value2 = wrapper::vcvt(vec_res_value2_f); - vec_res_value3 = wrapper::vcvt(vec_res_value3_f); - vec_res_value4 = wrapper::vcvt(vec_res_value4_f); - - const auto temp16x8t_1 = wrapper::vcombine(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2)); - const auto temp16x8t_2 = wrapper::vcombine(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4)); - auto res = wrapper::vcombine(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2)); - - wrapper::vstore(reinterpret_cast(output.ptr() + x), res); - break; + wrapper::vstore(reinterpret_cast(output.ptr() + x), res); + break; + } + case ReductionOperation::PROD: + { + const auto offset32x4f_4 = + wrapper::vdup_n(static_cast(iq_info.offset), wrapper::traits::vector_128_tag{}); + const auto iscale32x4f_4 = vinvq_f32(vdupq_n_f32(iq_info.scale)); + + //re-quantize + vec_res_value1_f = + wrapper::vadd(wrapper::vmul(vec_res_value1_f, iscale32x4f_4), offset32x4f_4); + vec_res_value2_f = + wrapper::vadd(wrapper::vmul(vec_res_value2_f, iscale32x4f_4), offset32x4f_4); + vec_res_value3_f = + wrapper::vadd(wrapper::vmul(vec_res_value3_f, iscale32x4f_4), offset32x4f_4); + vec_res_value4_f = + wrapper::vadd(wrapper::vmul(vec_res_value4_f, iscale32x4f_4), offset32x4f_4); + + vec_res_value1 = wrapper::vcvt(vec_res_value1_f); + vec_res_value2 = wrapper::vcvt(vec_res_value2_f); + vec_res_value3 = wrapper::vcvt(vec_res_value3_f); + vec_res_value4 = wrapper::vcvt(vec_res_value4_f); + + const auto temp16x8t_1 = + wrapper::vcombine(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2)); + const auto temp16x8t_2 = + wrapper::vcombine(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4)); + auto res = wrapper::vcombine(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2)); + + wrapper::vstore(reinterpret_cast(output.ptr() + x), res); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); } - default: - ARM_COMPUTE_ERROR("Not supported"); } - } - // Compute left-over elements - for(; x < window_end_x; ++x) - { - float res_value = 0.f; - int32_t res_value_q = 0; - - switch(op) + // Compute left-over elements + for (; x < window_end_x; ++x) { - case ReductionOperation::ARG_IDX_MAX: - case ReductionOperation::ARG_IDX_MIN: - case ReductionOperation::MIN: - case ReductionOperation::MAX: - { - res_value = *(input_ptr + x); - break; - } - case ReductionOperation::PROD: - { - res_value = static_cast(1.0f); - break; - } - default: - { - res_value = static_cast(0.0f); - break; - } - } - uint32_t res_idx = 0; + float res_value = 0.f; + int32_t res_value_q = 0; - for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) - { - const T *in_ptr = reinterpret_cast(input.ptr() + x + in_info.strides_in_bytes()[axis] * dim); - switch(op) + switch (op) { - case ReductionOperation::SUM: + case ReductionOperation::ARG_IDX_MAX: + case ReductionOperation::ARG_IDX_MIN: + case ReductionOperation::MIN: + case ReductionOperation::MAX: { - res_value += *in_ptr; + res_value = *(input_ptr + x); break; } - case ReductionOperation::MEAN_SUM: + case ReductionOperation::PROD: { - res_value_q += *in_ptr; + res_value = static_cast(1.0f); break; } - case ReductionOperation::SUM_SQUARE: + default: { - res_value += *in_ptr * *in_ptr; + res_value = static_cast(0.0f); break; } - case 
ReductionOperation::PROD: + } + uint32_t res_idx = 0; + + for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim) + { + const T *in_ptr = + reinterpret_cast(input.ptr() + x + in_info.strides_in_bytes()[axis] * dim); + switch (op) { - //de-quantize input - if(std::is_same::value) + case ReductionOperation::SUM: { - res_value *= dequantize_qasymm8(*in_ptr, iq_info); + res_value += *in_ptr; + break; } - else + case ReductionOperation::MEAN_SUM: { - res_value *= dequantize_qasymm8_signed(*in_ptr, iq_info); + res_value_q += *in_ptr; + break; } - break; - } - case ReductionOperation::ARG_IDX_MIN: - { - if(*in_ptr < res_value) + case ReductionOperation::SUM_SQUARE: { - res_value = *in_ptr; - res_idx = dim; + res_value += *in_ptr * *in_ptr; + break; } - break; - } - case ReductionOperation::ARG_IDX_MAX: - { - if(*in_ptr > res_value) + case ReductionOperation::PROD: { - res_value = *in_ptr; - res_idx = dim; + //de-quantize input + if (std::is_same::value) + { + res_value *= dequantize_qasymm8(*in_ptr, iq_info); + } + else + { + res_value *= dequantize_qasymm8_signed(*in_ptr, iq_info); + } + break; } - break; + case ReductionOperation::ARG_IDX_MIN: + { + if (*in_ptr < res_value) + { + res_value = *in_ptr; + res_idx = dim; + } + break; + } + case ReductionOperation::ARG_IDX_MAX: + { + if (*in_ptr > res_value) + { + res_value = *in_ptr; + res_idx = dim; + } + break; + } + case ReductionOperation::MIN: + { + res_value = *in_ptr < res_value ? *in_ptr : res_value; + break; + } + case ReductionOperation::MAX: + { + res_value = *in_ptr > res_value ? *in_ptr : res_value; + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); } - case ReductionOperation::MIN: + } + + switch (op) + { + case ReductionOperation::MEAN_SUM: { - res_value = *in_ptr < res_value ? *in_ptr : res_value; + // Apply previously calculated coefficients (with rounding on aarch64) +#ifdef __aarch64__ + const int32_t res = + arm_compute::support::cpp11::round(A * (static_cast(res_value_q)) + B); +#else // defined(__aarch64__) + const int32_t res = A * (static_cast(res_value_q)) + B; +#endif // __aarch64__ + *reinterpret_cast(output.ptr() + x) = utils::cast::saturate_cast(res); break; } - case ReductionOperation::MAX: + case ReductionOperation::SUM: { - res_value = *in_ptr > res_value ? 
*in_ptr : res_value; + // Subtract accumulated offsets + res_value -= (in_info.dimension(axis) - 1) * iq_info.offset; + *reinterpret_cast(output.ptr() + x) = utils::cast::saturate_cast(res_value); break; } - default: - ARM_COMPUTE_ERROR("Not supported"); - } - } - - switch(op) - { - case ReductionOperation::MEAN_SUM: - { - // Apply previously calculated coefficients (with rounding on aarch64) -#ifdef __aarch64__ - const int32_t res = arm_compute::support::cpp11::round(A * (static_cast(res_value_q)) + B); -#else // defined(__aarch64__) - const int32_t res = A * (static_cast(res_value_q)) + B; -#endif // __aarch64__ - *reinterpret_cast(output.ptr() + x) = utils::cast::saturate_cast(res); - break; - } - case ReductionOperation::SUM: - { - // Subtract accumulated offsets - res_value -= (in_info.dimension(axis) - 1) * iq_info.offset; - *reinterpret_cast(output.ptr() + x) = utils::cast::saturate_cast(res_value); - break; - } - case ReductionOperation::PROD: - { - //re-quantize result - T res = 0; - if(std::is_same::value) + case ReductionOperation::PROD: { - res = quantize_qasymm8(res_value, iq_info); + //re-quantize result + T res = 0; + if (std::is_same::value) + { + res = quantize_qasymm8(res_value, iq_info); + } + else + { + res = quantize_qasymm8_signed(res_value, iq_info); + } + *(reinterpret_cast(output.ptr() + x)) = res; + break; } - else + case ReductionOperation::ARG_IDX_MIN: + case ReductionOperation::ARG_IDX_MAX: { - res = quantize_qasymm8_signed(res_value, iq_info); + *(reinterpret_cast(output.ptr() + x * 4)) = res_idx; + break; } - *(reinterpret_cast(output.ptr() + x)) = res; - break; - } - case ReductionOperation::ARG_IDX_MIN: - case ReductionOperation::ARG_IDX_MAX: - { - *(reinterpret_cast(output.ptr() + x * 4)) = res_idx; - break; + default: + *(reinterpret_cast(output.ptr() + x)) = res_value; } - default: - *(reinterpret_cast(output.ptr() + x)) = res_value; } - } - }, - input, output); + }, + input, output); } }; -void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsigned int axis, const ReductionOperation op) +void reduce_op( + const Window &window, const ITensor *input, ITensor *output, unsigned int axis, const ReductionOperation op) { const bool is_complex = (input->info()->num_channels() == 2); - if(is_complex) + if (is_complex) { - switch(axis) + switch (axis) { case 2: - switch(input->info()->data_type()) + switch (input->info()->data_type()) { case DataType::F32: - switch(op) + switch (op) { case ReductionOperation::SUM: - return Reducer>::reduceZ(window, input, output, RedOpYZW_complex(), op); + return Reducer>::reduceZ( + window, input, output, RedOpYZW_complex(), + op); default: ARM_COMPUTE_ERROR("Not supported"); } @@ -1602,19 +1669,21 @@ void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsi return; } - switch(axis) + switch (axis) { case 0: { - switch(input->info()->data_type()) + switch (input->info()->data_type()) { case DataType::QASYMM8: { - return Reducer>::reduceX(window, input, output, RedOpX_quantized(), op); + return Reducer>::reduceX(window, input, output, + RedOpX_quantized(), op); } case DataType::QASYMM8_SIGNED: { - return Reducer>::reduceX(window, input, output, RedOpX_quantized(), op); + return Reducer>::reduceX(window, input, output, RedOpX_quantized(), + op); } #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: @@ -1635,19 +1704,22 @@ void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsi } } case 1: - switch(input->info()->data_type()) + switch 
(input->info()->data_type()) { case DataType::QASYMM8: { - return Reducer>::reduceY(window, input, output, RedOpYZW_quantized(), op); + return Reducer>::reduceY(window, input, output, + RedOpYZW_quantized(), op); } case DataType::QASYMM8_SIGNED: { - return Reducer>::reduceY(window, input, output, RedOpYZW_quantized(), op); + return Reducer>::reduceY(window, input, output, + RedOpYZW_quantized(), op); } #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: - return Reducer>::reduceY(window, input, output, RedOpYZW(), op); + return Reducer>::reduceY(window, input, output, RedOpYZW(), + op); #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F32: return Reducer>::reduceY(window, input, output, RedOpYZW(), op); @@ -1657,15 +1729,18 @@ void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsi ARM_COMPUTE_ERROR("Not supported"); } case 2: - switch(input->info()->data_type()) + switch (input->info()->data_type()) { case DataType::QASYMM8: - return Reducer>::reduceZ(window, input, output, RedOpYZW_quantized(), op); + return Reducer>::reduceZ(window, input, output, + RedOpYZW_quantized(), op); case DataType::QASYMM8_SIGNED: - return Reducer>::reduceZ(window, input, output, RedOpYZW_quantized(), op); + return Reducer>::reduceZ(window, input, output, + RedOpYZW_quantized(), op); #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: - return Reducer>::reduceZ(window, input, output, RedOpYZW(), op); + return Reducer>::reduceZ(window, input, output, RedOpYZW(), + op); #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F32: return Reducer>::reduceZ(window, input, output, RedOpYZW(), op); @@ -1675,15 +1750,18 @@ void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsi ARM_COMPUTE_ERROR("Not supported"); } case 3: - switch(input->info()->data_type()) + switch (input->info()->data_type()) { case DataType::QASYMM8: - return Reducer>::reduceW(window, input, output, RedOpYZW_quantized(), op); + return Reducer>::reduceW(window, input, output, + RedOpYZW_quantized(), op); case DataType::QASYMM8_SIGNED: - return Reducer>::reduceW(window, input, output, RedOpYZW_quantized(), op); + return Reducer>::reduceW(window, input, output, + RedOpYZW_quantized(), op); #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: - return Reducer>::reduceW(window, input, output, RedOpYZW(), op); + return Reducer>::reduceW(window, input, output, RedOpYZW(), + op); #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F32: return Reducer>::reduceW(window, input, output, RedOpYZW(), op); @@ -1704,9 +1782,10 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); - if(input->num_channels() == 1) + if (input->num_channels() == 1) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, + DataType::S32, DataType::F16, DataType::F32); } else { @@ -1715,13 +1794,14 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u ARM_COMPUTE_RETURN_ERROR_ON(axis != 2); } - ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, + 
"Reduction axis greater than max number of dimensions"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); - if(output->total_size() != 0) + if (output->total_size() != 0) { bool is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN); - if(!is_arg_min_max) + if (!is_arg_min_max) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != output->num_channels()); @@ -1731,8 +1811,9 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32); } - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis); - const TensorInfo tensor_info_reshaped = input->clone()->set_tensor_shape(output_shape); + const TensorShape output_shape = + arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis); + const TensorInfo tensor_info_reshaped = input->clone()->set_tensor_shape(output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_reshaped); } @@ -1745,7 +1826,10 @@ NEReductionOperationKernel::NEReductionOperationKernel() { } -void NEReductionOperationKernel::configure(const ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op) +void NEReductionOperationKernel::configure(const ITensor *input, + ITensor *output, + unsigned int axis, + ReductionOperation op) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); @@ -1761,14 +1845,23 @@ void NEReductionOperationKernel::configure(const ITensor *input, ITensor *output INEKernel::configure(win); // Calculate output shape and set if empty - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis); + const TensorShape output_shape = + arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis); // Output auto initialization if not yet initialized const bool is_arg_min_max = (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX); DataType output_data_type = is_arg_min_max ? 
DataType::S32 : input->info()->data_type(); - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true)); + auto_init_if_empty(*output->info(), input->info() + ->clone() + ->set_tensor_shape(output_shape) + .set_data_type(output_data_type) + .reset_padding() + .set_is_resizable(true)); } -Status NEReductionOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op) +Status NEReductionOperationKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + unsigned int axis, + ReductionOperation op) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op)); diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.h b/src/core/NEON/kernels/NEReductionOperationKernel.h index 08e654fd21..78bec62c14 100644 --- a/src/core/NEON/kernels/NEReductionOperationKernel.h +++ b/src/core/NEON/kernels/NEReductionOperationKernel.h @@ -77,7 +77,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op); + static Status + validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/src/core/NEON/kernels/NEReorderKernel.cpp b/src/core/NEON/kernels/NEReorderKernel.cpp index 1a7f58bb08..f92a4c87da 100644 --- a/src/core/NEON/kernels/NEReorderKernel.cpp +++ b/src/core/NEON/kernels/NEReorderKernel.cpp @@ -24,11 +24,13 @@ #if defined(__aarch64__) #include "src/core/NEON/kernels/NEReorderKernel.h" -#include "src/common/utils/Log.h" -#include "src/core/NEON/kernels/arm_gemm/transform.hpp" + #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Validate.h" +#include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/arm_gemm/transform.hpp" + namespace arm_compute { @@ -37,29 +39,32 @@ void NEReorderKernel::run(const Window &window, const ThreadInfo &info) ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - switch(_input->info()->data_type()) + switch (_input->info()->data_type()) { case DataType::F32: { const int ksize_rows_elements = _xmax * _ksize; - const int jump_rows = ksize_rows_elements * window.x().start(); - const int k_start = window.x().start() * _ksize; - const int k_end = std::min(window.x().end() * _ksize, _kmax); - const int stride = _kmax; - if(k_start < k_end) + const int jump_rows = ksize_rows_elements * window.x().start(); + const int k_start = window.x().start() * _ksize; + const int k_end = std::min(window.x().end() * _ksize, _kmax); + const int stride = _kmax; + if (k_start < k_end) { - - switch(_output_wf) + switch (_output_wf) { case WeightFormat::OHWIo4: { - arm_gemm::Transform<4, 1, true, arm_gemm::VLType::None>(reinterpret_cast(_output->buffer()) + jump_rows, reinterpret_cast(_input->buffer()), stride, k_start, k_end, 0, _xmax); + arm_gemm::Transform<4, 1, true, arm_gemm::VLType::None>( + reinterpret_cast(_output->buffer()) + jump_rows, + reinterpret_cast(_input->buffer()), stride, k_start, k_end, 0, _xmax); break; } #if defined(ARM_COMPUTE_ENABLE_SVE) case WeightFormat::OHWIo8: { - arm_gemm::Transform<1, 1, true, arm_gemm::VLType::SVE>(reinterpret_cast(_output->buffer()) + jump_rows, reinterpret_cast(_input->buffer()), stride, k_start, 
k_end, 0, _xmax); + arm_gemm::Transform<1, 1, true, arm_gemm::VLType::SVE>( + reinterpret_cast(_output->buffer()) + jump_rows, + reinterpret_cast(_input->buffer()), stride, k_start, k_end, 0, _xmax); break; } #endif /* ARM_COMPUTE_ENABLE_SVE */ @@ -78,11 +83,20 @@ void NEReorderKernel::run(const Window &window, const ThreadInfo &info) } NEReorderKernel::NEReorderKernel() - : _input(nullptr), _output(nullptr), _ksize(0), _kmax(0), _xmax(0), _input_wf(WeightFormat::ANY), _output_wf(WeightFormat::ANY) + : _input(nullptr), + _output(nullptr), + _ksize(0), + _kmax(0), + _xmax(0), + _input_wf(WeightFormat::ANY), + _output_wf(WeightFormat::ANY) { } -void NEReorderKernel::configure(const ITensor *input, ITensor *output, arm_compute::WeightFormat input_wf, arm_compute::WeightFormat output_wf) +void NEReorderKernel::configure(const ITensor *input, + ITensor *output, + arm_compute::WeightFormat input_wf, + arm_compute::WeightFormat output_wf) { ARM_COMPUTE_LOG_PARAMS(input, output, input_wf, output_wf); ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); @@ -96,7 +110,7 @@ void NEReorderKernel::configure(const ITensor *input, ITensor *output, arm_compu // Setting parameters for transform auto dims = input->info()->num_dimensions(); - switch(dims) + switch (dims) { case 2: { @@ -120,7 +134,7 @@ void NEReorderKernel::configure(const ITensor *input, ITensor *output, arm_compu // Window size is set by rows / _ksize Window win; int window_size = 0; - switch(_output_wf) + switch (_output_wf) { #if defined(ARM_COMPUTE_ENABLE_SVE) case WeightFormat::OHWIo8: @@ -142,7 +156,7 @@ void NEReorderKernel::configure(const ITensor *input, ITensor *output, arm_compu break; } } - if(_kmax % _ksize != 0) + if (_kmax % _ksize != 0) { window_size += 1; } @@ -152,11 +166,14 @@ void NEReorderKernel::configure(const ITensor *input, ITensor *output, arm_compu INEKernel::configure(win); } -Status NEReorderKernel::validate(const ITensorInfo *input, const ITensorInfo *output, arm_compute::WeightFormat input_wf, arm_compute::WeightFormat output_wf) +Status NEReorderKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + arm_compute::WeightFormat input_wf, + arm_compute::WeightFormat output_wf) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); - if(output->tensor_shape().total_size() != 0) + if (output->tensor_shape().total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); @@ -167,20 +184,20 @@ Status NEReorderKernel::validate(const ITensorInfo *input, const ITensorInfo *ou int output_x_dim; int output_k_dim; auto dims = output->num_dimensions(); - switch(dims) + switch (dims) { case 2: { - input_x_dim = input->dimension(0); // Number of columns in input matrix - input_k_dim = input->dimension(1); // Number of rows in input matrix + input_x_dim = input->dimension(0); // Number of columns in input matrix + input_k_dim = input->dimension(1); // Number of rows in input matrix output_x_dim = output->dimension(0); // Number of columns in output matrix output_k_dim = output->dimension(1); // Number of rows in output matrix break; } case 4: { - input_x_dim = input->dimension(2); // Number of columns in input matrix - input_k_dim = input->dimension(3); // Number of rows in input matrix + input_x_dim = input->dimension(2); // Number of columns in input matrix + input_k_dim = input->dimension(3); // Number of rows in input matrix output_x_dim = 
output->dimension(2); // Number of columns in output matrix output_k_dim = output->dimension(3); // Number of rows in output matrix break; @@ -192,7 +209,7 @@ Status NEReorderKernel::validate(const ITensorInfo *input, const ITensorInfo *ou } int ksize; - switch(output_wf) + switch (output_wf) { case WeightFormat::OHWIo8: { @@ -216,11 +233,10 @@ Status NEReorderKernel::validate(const ITensorInfo *input, const ITensorInfo *ou ARM_COMPUTE_RETURN_ERROR_ON(rnd_up_input_kdim != output_k_dim); // output x_dim needs to be same as input ARM_COMPUTE_RETURN_ERROR_ON(input_x_dim != output_x_dim); - } return Status{}; } } // namespace arm_compute -#endif // defined(__aarch64__) \ No newline at end of file +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/NEReorderKernel.h b/src/core/NEON/kernels/NEReorderKernel.h index 07908890f4..4528b25245 100644 --- a/src/core/NEON/kernels/NEReorderKernel.h +++ b/src/core/NEON/kernels/NEReorderKernel.h @@ -26,9 +26,10 @@ #ifndef ACL_SRC_CORE_NEON_KERNELS_NEREORDERKERNEL #define ACL_SRC_CORE_NEON_KERNELS_NEREORDERKERNEL -#include "src/core/NEON/INEKernel.h" #include "arm_compute/core/Types.h" +#include "src/core/NEON/INEKernel.h" + namespace arm_compute { @@ -36,7 +37,6 @@ namespace arm_compute class NEReorderKernel : public INEKernel { public: - const char *name() const override { return "NEReorderKernel"; @@ -62,7 +62,10 @@ public: * @param[in] input_wf WeightFormat of input. * @param[in] output_wf WeightFormat of output. */ - void configure(const ITensor *input, ITensor *output, arm_compute::WeightFormat input_wf, arm_compute::WeightFormat output_wf); + void configure(const ITensor *input, + ITensor *output, + arm_compute::WeightFormat input_wf, + arm_compute::WeightFormat output_wf); /** Static function to check if given info will lead to a valid configuration of @ref NEReorderKernel * @@ -73,25 +76,27 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, arm_compute::WeightFormat input_wf, arm_compute::WeightFormat output_wf); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + arm_compute::WeightFormat input_wf, + arm_compute::WeightFormat output_wf); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; - -/*****************************************************************************/ + /*****************************************************************************/ private: - const ITensor *_input{nullptr}; // Input tensor - ITensor *_output{nullptr}; // Output tensor - int32_t _ksize{0}; // Blocking parameter, how many rows kernel reorders on each call - int32_t _kmax{0}; // Rows in input tensor - int32_t _xmax{0}; // Columns in input tensor - WeightFormat _input_wf{WeightFormat::UNSPECIFIED}; // WeightFormat of input tensor - WeightFormat _output_wf{WeightFormat::UNSPECIFIED}; // WeightFormat of output tensor + const ITensor *_input{nullptr}; // Input tensor + ITensor *_output{nullptr}; // Output tensor + int32_t _ksize{0}; // Blocking parameter, how many rows kernel reorders on each call + int32_t _kmax{0}; // Rows in input tensor + int32_t _xmax{0}; // Columns in input tensor + WeightFormat _input_wf{WeightFormat::UNSPECIFIED}; // WeightFormat of input tensor + WeightFormat _output_wf{WeightFormat::UNSPECIFIED}; // WeightFormat of output tensor }; } // namespace arm_compute #endif /* ACL_SRC_CORE_NEON_KERNELS_NEREORDERKERNEL */ -#endif // defined(__aarch64__) \ No newline at end of file +#endif // 
defined(__aarch64__) diff --git a/src/core/NEON/kernels/NEReorgLayerKernel.cpp b/src/core/NEON/kernels/NEReorgLayerKernel.cpp index a7b830c066..227570405c 100644 --- a/src/core/NEON/kernels/NEReorgLayerKernel.cpp +++ b/src/core/NEON/kernels/NEReorgLayerKernel.cpp @@ -28,8 +28,9 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -50,13 +51,16 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); ARM_COMPUTE_RETURN_ERROR_ON(stride <= 0); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_width] % stride) != 0, "The width of the input tensor must be a multiple of stride"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_height] % stride) != 0, "The height of the input tensor must be a multiple of stride"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_width] % stride) != 0, + "The width of the input tensor must be a multiple of stride"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_height] % stride) != 0, + "The height of the input tensor must be a multiple of stride"); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { - const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input, stride)); + const TensorInfo tensor_info_output = + output->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input, stride)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); } @@ -65,8 +69,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i } } // namespace -NEReorgLayerKernel::NEReorgLayerKernel() - : _input(nullptr), _output(nullptr), _stride(1) +NEReorgLayerKernel::NEReorgLayerKernel() : _input(nullptr), _output(nullptr), _stride(1) { } @@ -121,23 +124,26 @@ void NEReorgLayerKernel::run(const Window &window, const ThreadInfo &info) Iterator out(_output, collapsed_window); // Perform reorg - execute_window_loop(collapsed_window, [&](const Coordinates & id) - { - // Get spatial coords and channels - const unsigned int w = id[idx_w]; - const unsigned int h = id[idx_h]; - const unsigned int c = id[idx_c]; - - // Calculate mapping - const unsigned int offset = c / out_c; - Coordinates map_coords = id; - map_coords.set(idx_w, w * stride + offset % stride); - map_coords.set(idx_h, h * stride + offset / stride); - map_coords.set(idx_c, c % out_c); - - // Perform mapping - std::memcpy(out.ptr(), in_ptr + _input->info()->offset_element_in_bytes(map_coords), _input->info()->element_size()); - }, - out); + execute_window_loop( + collapsed_window, + [&](const Coordinates &id) + { + // Get spatial coords and channels + const unsigned int w = id[idx_w]; + const unsigned int h = id[idx_h]; + const unsigned int c = id[idx_c]; + + // Calculate mapping + const unsigned int offset = c / out_c; + Coordinates map_coords = id; + map_coords.set(idx_w, w * stride + offset % stride); + map_coords.set(idx_h, h * stride + offset / stride); + map_coords.set(idx_c, c % out_c); + + 
// Perform mapping + std::memcpy(out.ptr(), in_ptr + _input->info()->offset_element_in_bytes(map_coords), + _input->info()->element_size()); + }, + out); } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NEReverseKernel.cpp b/src/core/NEON/kernels/NEReverseKernel.cpp index ca6c117882..d2437eecd0 100644 --- a/src/core/NEON/kernels/NEReverseKernel.cpp +++ b/src/core/NEON/kernels/NEReverseKernel.cpp @@ -26,15 +26,17 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis) +Status +validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis) { ARM_COMPUTE_UNUSED(use_inverted_axis); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, axis); @@ -42,11 +44,12 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(axis, 1, DataType::U32, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis->num_dimensions() > 1, "Axis must be a 1D tensor"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 4, "Current implementation only supports up to 4 dimensions."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 4, + "Current implementation only supports up to 4 dimensions."); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis->dimension(0) > 4, "Only up to 4 dimensions can be reversed"); // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -57,8 +60,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c } } // namespace -NEReverseKernel::NEReverseKernel() - : _input(nullptr), _output(nullptr), _axis(nullptr), _use_inverted_axis(false) +NEReverseKernel::NEReverseKernel() : _input(nullptr), _output(nullptr), _axis(nullptr), _use_inverted_axis(false) { } @@ -80,7 +82,10 @@ void NEReverseKernel::configure(const ITensor *input, ITensor *output, const ITe INEKernel::configure(calculate_max_window(*output->info())); } -Status NEReverseKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis) +Status NEReverseKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *axis, + bool use_inverted_axis) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, use_inverted_axis)); @@ -88,29 +93,30 @@ Status NEReverseKernel::validate(const ITensorInfo *input, const ITensorInfo *ou } template -void run_reverse(const Window &window, const ITensor *input, const ITensor *axis, ITensor *output, bool use_inverted_axis) +void run_reverse( + const Window &window, const ITensor *input, const ITensor *axis, ITensor *output, bool use_inverted_axis) { unsigned int axis_bit = 0; const int rank = input->info()->num_dimensions(); - for(unsigned int i = 0; i < axis->info()->dimension(0); ++i) + for (unsigned int i = 0; i < axis->info()->dimension(0); ++i) { int axis_i = 
*(reinterpret_cast(axis->buffer()) + i); // The values of axis tensor must be between [-rank, rank-1]. - if((axis_i < -rank) || (axis_i >= rank)) + if ((axis_i < -rank) || (axis_i >= rank)) { ARM_COMPUTE_ERROR("the valuses of the axis tensor must be within [-rank, rank-1]."); } // In case of negative axis value i.e targeted axis(i) = rank + axis(i) - if(axis_i < 0) + if (axis_i < 0) { axis_i = rank + axis_i; } // Reverse ACL axis indices convention i.e. (inverted)axis = (tensor_rank - 1) - axis - if(use_inverted_axis) + if (use_inverted_axis) { axis_i = (rank - 1) - axis_i; } @@ -127,43 +133,47 @@ void run_reverse(const Window &window, const ITensor *input, const ITensor *axis win.set(Window::DimX, Window::Dimension(0, 1, 1)); Iterator input_it(input, win); - execute_window_loop(win, [&](const Coordinates & id) - { - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win, + [&](const Coordinates &id) { - auto in = wrapper::vloadq(reinterpret_cast(input_it.ptr()) + x); - - // Reverse 0 axis - if(axis_bit & 0x1) + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - in = wrapper::vrev64(in); - in = wrapper::vcombine(wrapper::vgethigh(in), wrapper::vgetlow(in)); + auto in = wrapper::vloadq(reinterpret_cast(input_it.ptr()) + x); + + // Reverse 0 axis + if (axis_bit & 0x1) + { + in = wrapper::vrev64(in); + in = wrapper::vcombine(wrapper::vgethigh(in), wrapper::vgetlow(in)); + } + + const int offset_x = (axis_bit & 0x1) ? output->info()->dimension(0) - x - window_step_x : x; + const int offset_y = (axis_bit & 0x2) ? output->info()->dimension(1) - id.y() - 1 : id.y(); + const int offset_z = (axis_bit & 0x4) ? output->info()->dimension(2) - id.z() - 1 : id.z(); + const int offset_w = (axis_bit & 0x8) ? output->info()->dimension(3) - id[3] - 1 : id[3]; + + auto out_ptr = + reinterpret_cast(output->ptr_to_element(Coordinates(offset_x, offset_y, offset_z, offset_w))); + wrapper::vstore(out_ptr, in); } - const int offset_x = (axis_bit & 0x1) ? output->info()->dimension(0) - x - window_step_x : x; - const int offset_y = (axis_bit & 0x2) ? output->info()->dimension(1) - id.y() - 1 : id.y(); - const int offset_z = (axis_bit & 0x4) ? output->info()->dimension(2) - id.z() - 1 : id.z(); - const int offset_w = (axis_bit & 0x8) ? output->info()->dimension(3) - id[3] - 1 : id[3]; - - auto out_ptr = reinterpret_cast(output->ptr_to_element(Coordinates(offset_x, offset_y, offset_z, offset_w))); - wrapper::vstore(out_ptr, in); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const auto in = *(reinterpret_cast(input_it.ptr()) + x); + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto in = *(reinterpret_cast(input_it.ptr()) + x); - const int offset_x = (axis_bit & 0x1) ? output->info()->dimension(0) - x - 1 : x; - const int offset_y = (axis_bit & 0x2) ? output->info()->dimension(1) - id.y() - 1 : id.y(); - const int offset_z = (axis_bit & 0x4) ? output->info()->dimension(2) - id.z() - 1 : id.z(); - const int offset_w = (axis_bit & 0x8) ? output->info()->dimension(3) - id[3] - 1 : id[3]; + const int offset_x = (axis_bit & 0x1) ? output->info()->dimension(0) - x - 1 : x; + const int offset_y = (axis_bit & 0x2) ? output->info()->dimension(1) - id.y() - 1 : id.y(); + const int offset_z = (axis_bit & 0x4) ? output->info()->dimension(2) - id.z() - 1 : id.z(); + const int offset_w = (axis_bit & 0x8) ? 
output->info()->dimension(3) - id[3] - 1 : id[3]; - *reinterpret_cast(output->ptr_to_element(Coordinates(offset_x, offset_y, offset_z, offset_w))) = in; - } - }, - input_it); + *reinterpret_cast(output->ptr_to_element(Coordinates(offset_x, offset_y, offset_z, offset_w))) = + in; + } + }, + input_it); } void NEReverseKernel::run(const Window &window, const ThreadInfo &info) @@ -172,7 +182,7 @@ void NEReverseKernel::run(const Window &window, const ThreadInfo &info) ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - switch(_input->info()->element_size()) + switch (_input->info()->element_size()) { case 4: run_reverse(window, _input, _axis, _output, _use_inverted_axis); diff --git a/src/core/NEON/kernels/NEReverseKernel.h b/src/core/NEON/kernels/NEReverseKernel.h index 7d9ec4691c..92261887f4 100644 --- a/src/core/NEON/kernels/NEReverseKernel.h +++ b/src/core/NEON/kernels/NEReverseKernel.h @@ -68,7 +68,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis); + static Status + validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/src/core/NEON/kernels/NESelectKernel.cpp b/src/core/NEON/kernels/NESelectKernel.cpp index b8c9b244ee..7789b828ea 100644 --- a/src/core/NEON/kernels/NESelectKernel.cpp +++ b/src/core/NEON/kernels/NESelectKernel.cpp @@ -29,13 +29,12 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" + +#include "src/core/common/Registrars.h" #include "src/core/CPP/Validate.h" -#include "src/core/NEON/wrapper/wrapper.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - -#include "src/core/common/Registrars.h" - +#include "src/core/NEON/wrapper/wrapper.h" #include "src/cpu/kernels/select/list.h" #include @@ -54,7 +53,8 @@ struct SelectKernelSelectorData }; using SelectorPtr = std::add_pointer::type; -using KernelPtr = std::add_pointer::type; +using KernelPtr = + std::add_pointer::type; struct SelectKernelSelector { @@ -63,95 +63,62 @@ struct SelectKernelSelector KernelPtr ukernel; }; -static const SelectKernelSelector available_kernels[] = -{ - { - "neon_s8_same_rank", - [](const SelectKernelSelectorData & data) { return data.dt == DataType::S8 && data.is_same_rank == true; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s8_select_same_rank) - }, - { - "neon_s16_same_rank", - [](const SelectKernelSelectorData & data) { return data.dt == DataType::S16 && data.is_same_rank == true; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s16_select_same_rank) - }, - { - "neon_s32_same_rank", - [](const SelectKernelSelectorData & data) { return data.dt == DataType::S32 && data.is_same_rank == true; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s32_select_same_rank) - }, - { - "neon_u8_same_rank", - [](const SelectKernelSelectorData & data) { return data.dt == DataType::U8 && data.is_same_rank == true; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u8_select_same_rank) - }, - { - "neon_u16_same_rank", - [](const SelectKernelSelectorData & data) { return data.dt == DataType::U16 && data.is_same_rank == true; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u16_select_same_rank) - }, - { - "neon_u32_same_rank", - [](const 
SelectKernelSelectorData & data) { return data.dt == DataType::U32 && data.is_same_rank == true; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u32_select_same_rank) - }, - { - "neon_s8_not_same_rank", - [](const SelectKernelSelectorData & data) { return data.dt == DataType::S8 && data.is_same_rank == false; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s8_select_not_same_rank) - }, - { - "neon_s16_not_same_rank", - [](const SelectKernelSelectorData & data) { return data.dt == DataType::S16 && data.is_same_rank == false; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s16_select_not_same_rank) - }, - { - "neon_s32_not_same_rank", - [](const SelectKernelSelectorData & data) { return data.dt == DataType::S32 && data.is_same_rank == false; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s32_select_not_same_rank) - }, - { - "neon_u8_not_same_rank", - [](const SelectKernelSelectorData & data) { return data.dt == DataType::U8 && data.is_same_rank == false; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u8_select_not_same_rank) - }, - { - "neon_u16_not_same_rank", - [](const SelectKernelSelectorData & data) { return data.dt == DataType::U16 && data.is_same_rank == false; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u16_select_not_same_rank) - }, - { - "neon_u32_not_same_rank", - [](const SelectKernelSelectorData & data) { return data.dt == DataType::U32 && data.is_same_rank == false; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u32_select_not_same_rank) - }, - { - "neon_f16_same_rank", - [](const SelectKernelSelectorData & data) { return data.dt == DataType::F16 && data.is_same_rank == true; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_f16_select_same_rank) - }, - { - "neon_f16_not_same_rank", - [](const SelectKernelSelectorData & data) { return data.dt == DataType::F16 && data.is_same_rank == false; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_f16_select_not_same_rank) - }, - { - "neon_f32_same_rank", - [](const SelectKernelSelectorData & data) { return data.dt == DataType::F32 && data.is_same_rank == true; }, - REGISTER_FP32_NEON(arm_compute::cpu::neon_f32_select_same_rank) - }, - { - "neon_f32_not_same_rank", - [](const SelectKernelSelectorData & data) { return data.dt == DataType::F32 && data.is_same_rank == false; }, - REGISTER_FP32_NEON(arm_compute::cpu::neon_f32_select_not_same_rank) - }, +static const SelectKernelSelector available_kernels[] = { + {"neon_s8_same_rank", + [](const SelectKernelSelectorData &data) { return data.dt == DataType::S8 && data.is_same_rank == true; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s8_select_same_rank)}, + {"neon_s16_same_rank", + [](const SelectKernelSelectorData &data) { return data.dt == DataType::S16 && data.is_same_rank == true; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s16_select_same_rank)}, + {"neon_s32_same_rank", + [](const SelectKernelSelectorData &data) { return data.dt == DataType::S32 && data.is_same_rank == true; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s32_select_same_rank)}, + {"neon_u8_same_rank", + [](const SelectKernelSelectorData &data) { return data.dt == DataType::U8 && data.is_same_rank == true; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u8_select_same_rank)}, + {"neon_u16_same_rank", + [](const SelectKernelSelectorData &data) { return data.dt == DataType::U16 && data.is_same_rank == true; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u16_select_same_rank)}, + {"neon_u32_same_rank", + [](const SelectKernelSelectorData &data) { return data.dt == DataType::U32 && 
data.is_same_rank == true; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u32_select_same_rank)}, + {"neon_s8_not_same_rank", + [](const SelectKernelSelectorData &data) { return data.dt == DataType::S8 && data.is_same_rank == false; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s8_select_not_same_rank)}, + {"neon_s16_not_same_rank", + [](const SelectKernelSelectorData &data) { return data.dt == DataType::S16 && data.is_same_rank == false; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s16_select_not_same_rank)}, + {"neon_s32_not_same_rank", + [](const SelectKernelSelectorData &data) { return data.dt == DataType::S32 && data.is_same_rank == false; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s32_select_not_same_rank)}, + {"neon_u8_not_same_rank", + [](const SelectKernelSelectorData &data) { return data.dt == DataType::U8 && data.is_same_rank == false; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u8_select_not_same_rank)}, + {"neon_u16_not_same_rank", + [](const SelectKernelSelectorData &data) { return data.dt == DataType::U16 && data.is_same_rank == false; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u16_select_not_same_rank)}, + {"neon_u32_not_same_rank", + [](const SelectKernelSelectorData &data) { return data.dt == DataType::U32 && data.is_same_rank == false; }, + REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u32_select_not_same_rank)}, + {"neon_f16_same_rank", + [](const SelectKernelSelectorData &data) { return data.dt == DataType::F16 && data.is_same_rank == true; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_f16_select_same_rank)}, + {"neon_f16_not_same_rank", + [](const SelectKernelSelectorData &data) { return data.dt == DataType::F16 && data.is_same_rank == false; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_f16_select_not_same_rank)}, + {"neon_f32_same_rank", + [](const SelectKernelSelectorData &data) { return data.dt == DataType::F32 && data.is_same_rank == true; }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_f32_select_same_rank)}, + {"neon_f32_not_same_rank", + [](const SelectKernelSelectorData &data) { return data.dt == DataType::F32 && data.is_same_rank == false; }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_f32_select_not_same_rank)}, }; const SelectKernelSelector *get_implementation(const SelectKernelSelectorData &data) { - for(const auto &uk : available_kernels) + for (const auto &uk : available_kernels) { - if(uk.is_selected(data)) + if (uk.is_selected(data)) { return &uk; } @@ -184,7 +151,8 @@ void NESelectKernel::configure(const ITensor *c, const ITensor *x, const ITensor INEKernel::configure(win); } -Status NESelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output) +Status +NESelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(c, x, y); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(x); @@ -195,9 +163,11 @@ Status NESelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, cons const bool is_same_rank = (c->tensor_shape().num_dimensions() == x->tensor_shape().num_dimensions()); ARM_COMPUTE_RETURN_ERROR_ON(is_same_rank && (x->tensor_shape() != c->tensor_shape())); - ARM_COMPUTE_RETURN_ERROR_ON(!is_same_rank && ((c->tensor_shape().num_dimensions() > 1) || (c->tensor_shape().x() != x->tensor_shape()[x->tensor_shape().num_dimensions() - 1]))); + ARM_COMPUTE_RETURN_ERROR_ON(!is_same_rank && + ((c->tensor_shape().num_dimensions() > 1) || + (c->tensor_shape().x() != 
x->tensor_shape()[x->tensor_shape().num_dimensions() - 1]))); - if(output != nullptr && output->total_size() != 0) + if (output != nullptr && output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(x, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(x, output); @@ -214,7 +184,7 @@ void NESelectKernel::run(const Window &window, const ThreadInfo &info) ARM_COMPUTE_ERROR_ON(_output == nullptr); ARM_COMPUTE_ERROR_ON(_output->info() == nullptr); - const auto *uk = get_implementation(SelectKernelSelectorData{ _output->info()->data_type(), _has_same_rank }); + const auto *uk = get_implementation(SelectKernelSelectorData{_output->info()->data_type(), _has_same_rank}); ARM_COMPUTE_ERROR_ON(uk == nullptr); ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr); uk->ukernel(_c, _x, _y, _output, window); diff --git a/src/core/NEON/kernels/NESelectKernel.h b/src/core/NEON/kernels/NESelectKernel.h index e82105a68e..4fec42b536 100644 --- a/src/core/NEON/kernels/NESelectKernel.h +++ b/src/core/NEON/kernels/NESelectKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_NESELECTKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/NEON/INEKernel.h" namespace arm_compute @@ -82,7 +83,6 @@ public: void run(const Window &window, const ThreadInfo &info) override; private: - const ITensor *_c; /**< Condition tensor */ const ITensor *_x; /**< Source tensor 1 */ const ITensor *_y; /**< Source tensor 2 */ diff --git a/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp index 673eace3c1..da023aeb96 100644 --- a/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp +++ b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp @@ -26,11 +26,12 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" #include #include @@ -41,19 +42,22 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_info, const ITensorInfo *paddings, const ITensorInfo *output) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *block_info, + const ITensorInfo *paddings, + const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_info, paddings, output); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(block_info, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); ARM_COMPUTE_RETURN_ERROR_ON(block_info->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(block_info->tensor_shape(), TensorShape{ 2 }); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(block_info->tensor_shape(), TensorShape{2}); ARM_COMPUTE_RETURN_ERROR_ON(paddings->num_dimensions() > 2); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(paddings->tensor_shape(), TensorShape{ 2, 2 }); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(paddings->tensor_shape(), TensorShape{2, 2}); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { const DataLayout data_layout = input->data_layout(); const int idx_channel = get_data_layout_dimension_index(data_layout, 
DataLayoutDimension::CHANNEL); @@ -64,7 +68,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf return Status{}; } -Status validate_arguments_static(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, +Status validate_arguments_static(const ITensorInfo *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); @@ -73,9 +81,10 @@ Status validate_arguments_static(const ITensorInfo *input, const int block_shape ARM_COMPUTE_RETURN_ERROR_ON(block_shape_x < 1 || block_shape_y < 1); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { - TensorShape expected_output_shape = misc::shape_calculator::compute_space_to_batch_shape(input, block_shape_x, block_shape_y, padding_left, padding_right); + TensorShape expected_output_shape = misc::shape_calculator::compute_space_to_batch_shape( + input, block_shape_x, block_shape_y, padding_left, padding_right); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), expected_output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); @@ -86,14 +95,25 @@ Status validate_arguments_static(const ITensorInfo *input, const int block_shape } // namespace NESpaceToBatchLayerKernel::NESpaceToBatchLayerKernel() - : _input(nullptr), _block_shape(nullptr), _paddings(nullptr), _output(nullptr), _data_layout(DataLayout::UNKNOWN), _padding_left(), _block_shape_x(), _block_shape_y() + : _input(nullptr), + _block_shape(nullptr), + _paddings(nullptr), + _output(nullptr), + _data_layout(DataLayout::UNKNOWN), + _padding_left(), + _block_shape_x(), + _block_shape_y() { } -void NESpaceToBatchLayerKernel::configure(const ITensor *input, const ITensor *block_shape, const ITensor *paddings, ITensor *output) +void NESpaceToBatchLayerKernel::configure(const ITensor *input, + const ITensor *block_shape, + const ITensor *paddings, + ITensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), block_shape->info(), paddings->info(), output->info())); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input->info(), block_shape->info(), paddings->info(), output->info())); _input = input; _block_shape = block_shape; @@ -106,15 +126,22 @@ void NESpaceToBatchLayerKernel::configure(const ITensor *input, const ITensor *b ICPPKernel::configure(win); } -void NESpaceToBatchLayerKernel::configure(const ITensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, - ITensor *output) +void NESpaceToBatchLayerKernel::configure(const ITensor *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + ITensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - TensorShape output_shape = misc::shape_calculator::compute_space_to_batch_shape(input->info(), block_shape_x, block_shape_y, padding_left, padding_right); - auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->quantization_info()); + TensorShape output_shape = misc::shape_calculator::compute_space_to_batch_shape( + input->info(), block_shape_x, block_shape_y, padding_left, 
padding_right); + auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, padding_left, padding_right, output->info())); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, padding_left, + padding_right, output->info())); _input = input; _output = output; @@ -128,15 +155,23 @@ void NESpaceToBatchLayerKernel::configure(const ITensor *input, const int block_ INEKernel::configure(win); } -Status NESpaceToBatchLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output) +Status NESpaceToBatchLayerKernel::validate(const ITensorInfo *input, + const ITensorInfo *block_shape, + const ITensorInfo *paddings, + const ITensorInfo *output) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, block_shape, paddings, output)); return Status{}; } -Status NESpaceToBatchLayerKernel::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, +Status NESpaceToBatchLayerKernel::validate(const ITensorInfo *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, const ITensorInfo *output) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_static(input, block_shape_x, block_shape_y, padding_left, padding_right, output)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_arguments_static(input, block_shape_x, block_shape_y, padding_left, padding_right, output)); return Status{}; } @@ -146,17 +181,17 @@ void NESpaceToBatchLayerKernel::run(const Window &window, const ThreadInfo &info ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window); - if(_block_shape != nullptr) + if (_block_shape != nullptr) { // Retrieve the block shapes dynamically _block_shape_x = *(reinterpret_cast(_block_shape->ptr_to_element(0))); _block_shape_y = *(reinterpret_cast(_block_shape->ptr_to_element(1))); } - if(_paddings != nullptr) + if (_paddings != nullptr) { - const size_t pad_left_x = *reinterpret_cast(_paddings->ptr_to_element({ 0, 0 })); - const size_t pad_left_y = *reinterpret_cast(_paddings->ptr_to_element({ 1, 0 })); + const size_t pad_left_x = *reinterpret_cast(_paddings->ptr_to_element({0, 0})); + const size_t pad_left_y = *reinterpret_cast(_paddings->ptr_to_element({1, 0})); _padding_left = Size2D(pad_left_x, pad_left_y); } const int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); @@ -173,57 +208,61 @@ void NESpaceToBatchLayerKernel::run(const Window &window, const ThreadInfo &info int batch_id = 0; // Main loop for NCHW and NHWC - if(_data_layout == DataLayout::NCHW) + if (_data_layout == DataLayout::NCHW) { do { Iterator out(_output, slice_out); - execute_window_loop(slice_out, [&](const Coordinates & id) - { - const size_t out_x = id.x(); - const size_t out_y = id.y(); - const size_t z = id.z(); - const size_t pos_x = out_x * _block_shape_x + (batch_id / batch_size) % _block_shape_x; - const size_t pos_y = out_y * _block_shape_y + (batch_id / batch_size) / _block_shape_x; - if(pos_y >= _padding_left.y() && pos_y < _padding_left.y() + height && pos_x >= _padding_left.x() && pos_x < _padding_left.x() + width) + execute_window_loop( + slice_out, + [&](const Coordinates &id) { - const int w = batch_id % 
batch_size; - const int in_x = pos_x - _padding_left.x(); - const int in_y = pos_y - _padding_left.y(); - Coordinates input_coords{ in_x, in_y, z, w }; - memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size); - } - }, - out); + const size_t out_x = id.x(); + const size_t out_y = id.y(); + const size_t z = id.z(); + const size_t pos_x = out_x * _block_shape_x + (batch_id / batch_size) % _block_shape_x; + const size_t pos_y = out_y * _block_shape_y + (batch_id / batch_size) / _block_shape_x; + if (pos_y >= _padding_left.y() && pos_y < _padding_left.y() + height && + pos_x >= _padding_left.x() && pos_x < _padding_left.x() + width) + { + const int w = batch_id % batch_size; + const int in_x = pos_x - _padding_left.x(); + const int in_y = pos_y - _padding_left.y(); + Coordinates input_coords{in_x, in_y, z, w}; + memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size); + } + }, + out); ++batch_id; - } - while(window.slide_window_slice_3D(slice_out)); + } while (window.slide_window_slice_3D(slice_out)); } else { do { Iterator out(_output, slice_out); - execute_window_loop(slice_out, [&](const Coordinates & id) - { - const size_t out_x = id.y(); - const size_t out_y = id.z(); - const size_t z = id.x(); - const size_t pos_x = out_x * _block_shape_x + (batch_id / batch_size) % _block_shape_x; - const size_t pos_y = out_y * _block_shape_y + (batch_id / batch_size) / _block_shape_x; - if(pos_y >= _padding_left.y() && pos_y < _padding_left.y() + height && pos_x >= _padding_left.x() && pos_x < _padding_left.x() + width) + execute_window_loop( + slice_out, + [&](const Coordinates &id) { - const int w = batch_id % batch_size; - const int in_x = pos_x - _padding_left.x(); - const int in_y = pos_y - _padding_left.y(); - Coordinates input_coords{ z, in_x, in_y, w }; - memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size); - } - }, - out); + const size_t out_x = id.y(); + const size_t out_y = id.z(); + const size_t z = id.x(); + const size_t pos_x = out_x * _block_shape_x + (batch_id / batch_size) % _block_shape_x; + const size_t pos_y = out_y * _block_shape_y + (batch_id / batch_size) / _block_shape_x; + if (pos_y >= _padding_left.y() && pos_y < _padding_left.y() + height && + pos_x >= _padding_left.x() && pos_x < _padding_left.x() + width) + { + const int w = batch_id % batch_size; + const int in_x = pos_x - _padding_left.x(); + const int in_y = pos_y - _padding_left.y(); + Coordinates input_coords{z, in_x, in_y, w}; + memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size); + } + }, + out); ++batch_id; - } - while(window.slide_window_slice_3D(slice_out)); + } while (window.slide_window_slice_3D(slice_out)); } } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NESpaceToBatchLayerKernel.h b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.h index 44b8cbb514..6292c07136 100644 --- a/src/core/NEON/kernels/NESpaceToBatchLayerKernel.h +++ b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_NESPACETOBATCHLAYERKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/NEON/INEKernel.h" namespace arm_compute @@ -69,7 +70,12 @@ public: * @param[in] padding_right The padding at the end of every dimension of the output tensor. * @param[out] output Tensor output. 
Data types supported: same as @p input */ - void configure(const ITensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ITensor *output); + void configure(const ITensor *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + ITensor *output); /** Static function to check if given info will lead to a valid configuration of @ref NESpaceToBatchLayerKernel * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. @@ -79,7 +85,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output); + static Status validate(const ITensorInfo *input, + const ITensorInfo *block_shape, + const ITensorInfo *paddings, + const ITensorInfo *output); /** Static function to check if given info will lead to a valid configuration of @ref NESpaceToBatchLayerKernel (Static block shape and paddings) * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. @@ -91,7 +100,12 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, const ITensorInfo *output); + static Status validate(const ITensorInfo *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + const ITensorInfo *output); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp index 7687c50c40..b49c5ee344 100644 --- a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp +++ b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp @@ -26,11 +26,12 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "src/core/NEON/wrapper/wrapper.h" +#include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" #include #include @@ -50,7 +51,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i ARM_COMPUTE_RETURN_ERROR_ON(block_shape < 1); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { const DataLayout data_layout = input->data_layout(); const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); @@ -115,43 +116,45 @@ void NESpaceToDepthLayerKernel::run(const Window &window, const ThreadInfo &info int batch_id = 0; // Main loop for NCHW and NHWC - if(_data_layout == DataLayout::NCHW) + if (_data_layout == DataLayout::NCHW) { do { Iterator out(_output, slice_out); - execute_window_loop(slice_out, [&](const Coordinates & id) - { - const size_t channel_id = id.z(); - const size_t in_x = id.x() * _block_shape + (channel_id / channel_size) % _block_shape; - const size_t in_y = id.y() * _block_shape + (channel_id / channel_size) / _block_shape; - const int z = channel_id % channel_size; - Coordinates input_coords{ in_x, in_y, z, batch_id }; - memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size); - }, - out); 
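As an aside on the NCHW space-to-depth loop in this hunk: the index arithmetic maps every output element back to the input element it is copied from. The following standalone sketch (hypothetical names, not part of the patch; `channel_size` in the loop denotes the input channel count) restates that mapping:

    #include <cstddef>

    struct Coord { std::size_t x, y, c; };

    // Map an output element (out_x, out_y, out_c) of a space-to-depth result back to
    // the input element it was copied from, for a given block size and input channel count.
    inline Coord space_to_depth_src(std::size_t out_x, std::size_t out_y, std::size_t out_c,
                                    std::size_t block, std::size_t in_channels)
    {
        const std::size_t block_offset = out_c / in_channels;  // which cell of the block
        return Coord{out_x * block + block_offset % block,     // in_x
                     out_y * block + block_offset / block,     // in_y
                     out_c % in_channels};                     // input channel
    }

    // Example: block = 2, in_channels = 3, output element (1, 0, 4)
    // -> block_offset = 1, so the source element is (in_x = 3, in_y = 0, c = 1).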
+ execute_window_loop( + slice_out, + [&](const Coordinates &id) + { + const size_t channel_id = id.z(); + const size_t in_x = id.x() * _block_shape + (channel_id / channel_size) % _block_shape; + const size_t in_y = id.y() * _block_shape + (channel_id / channel_size) / _block_shape; + const int z = channel_id % channel_size; + Coordinates input_coords{in_x, in_y, z, batch_id}; + memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size); + }, + out); ++batch_id; - } - while(window.slide_window_slice_3D(slice_out)); + } while (window.slide_window_slice_3D(slice_out)); } else { do { Iterator out(_output, slice_out); - execute_window_loop(slice_out, [&](const Coordinates & id) - { - const size_t channel_id = id.x(); - const size_t in_x = id.y() * _block_shape + (channel_id / channel_size) % _block_shape; - const size_t in_y = id.z() * _block_shape + (channel_id / channel_size) / _block_shape; - const int z = channel_id % channel_size; - Coordinates input_coords{ z, in_x, in_y, batch_id }; - memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size); - }, - out); + execute_window_loop( + slice_out, + [&](const Coordinates &id) + { + const size_t channel_id = id.x(); + const size_t in_x = id.y() * _block_shape + (channel_id / channel_size) % _block_shape; + const size_t in_y = id.z() * _block_shape + (channel_id / channel_size) / _block_shape; + const int z = channel_id % channel_size; + Coordinates input_coords{z, in_x, in_y, batch_id}; + memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size); + }, + out); ++batch_id; - } - while(window.slide_window_slice_3D(slice_out)); + } while (window.slide_window_slice_3D(slice_out)); } } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.h b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.h index 953b68a401..7d147c5b94 100644 --- a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.h +++ b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_NESPACETODEPTHLAYERKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/NEON/INEKernel.h" namespace arm_compute diff --git a/src/core/NEON/kernels/NEStackLayerKernel.cpp b/src/core/NEON/kernels/NEStackLayerKernel.cpp index 93080e2ac7..e23b40a9aa 100644 --- a/src/core/NEON/kernels/NEStackLayerKernel.cpp +++ b/src/core/NEON/kernels/NEStackLayerKernel.cpp @@ -25,13 +25,13 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -41,7 +41,11 @@ using namespace arm_compute::misc::shape_calculator; namespace { -Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output) +Status validate_arguments(const ITensorInfo *input, + unsigned int axis, + unsigned int idx_input, + unsigned int num_tensors, + const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions. 
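The NEStackLayerKernel hunks that follow validate the output against compute_stack_shape. In framework-agnostic terms the rule is simply that stacking num_tensors inputs along axis inserts a new dimension of that size at that position; the helper below is a hypothetical illustration of this, not the library's implementation (ACL's TensorShape orders dimensions innermost-first, which this sketch ignores):

    #include <cstddef>
    #include <vector>

    // Stack semantics: a new dimension of size `num_tensors` is inserted at `axis`.
    inline std::vector<std::size_t> stacked_shape(std::vector<std::size_t> in_shape,
                                                  std::size_t axis,
                                                  std::size_t num_tensors)
    {
        in_shape.insert(in_shape.begin() + static_cast<std::ptrdiff_t>(axis), num_tensors);
        return in_shape;
    }

    // Example: stacking 4 tensors of shape {2, 3} along axis 1 gives {2, 4, 3}.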
@@ -50,9 +54,10 @@ Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned ARM_COMPUTE_RETURN_ERROR_ON(axis > input->num_dimensions()); ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); - if(output->total_size() != 0) + if (output->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_stack_shape(*input, axis, num_tensors)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), + compute_stack_shape(*input, axis, num_tensors)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); } @@ -60,7 +65,8 @@ Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned return Status{}; } -std::pair validate_and_configure_window(ITensorInfo *input, unsigned int axis, unsigned int num_tensors, ITensorInfo *output) +std::pair +validate_and_configure_window(ITensorInfo *input, unsigned int axis, unsigned int num_tensors, ITensorInfo *output) { // Output auto inizialitation if not yet initialized auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_stack_shape(*input, axis, num_tensors))); @@ -71,11 +77,12 @@ std::pair validate_and_configure_window(ITensorInfo *input, unsi return std::make_pair(Status{}, win); } -inline Coordinates shift_from_axis_and_replace_coordinate(const Coordinates &id, unsigned int axis, unsigned int idx_input) +inline Coordinates +shift_from_axis_and_replace_coordinate(const Coordinates &id, unsigned int axis, unsigned int idx_input) { constexpr int max_out_coord = 5; // Input shape is max a 4D shape, output is max 5D Coordinates id_out = id; - for(unsigned int i = max_out_coord - 1; i > axis; --i) + for (unsigned int i = max_out_coord - 1; i > axis; --i) { id_out.set(i, id[i - 1]); } @@ -84,12 +91,12 @@ inline Coordinates shift_from_axis_and_replace_coordinate(const Coordinates &id, } } // namespace -NEStackLayerKernel::NEStackLayerKernel() - : _input(nullptr), _output(nullptr), _axis(), _idx_input() +NEStackLayerKernel::NEStackLayerKernel() : _input(nullptr), _output(nullptr), _axis(), _idx_input() { } -void NEStackLayerKernel::configure(const ITensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ITensor *output) +void NEStackLayerKernel::configure( + const ITensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ITensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), axis, idx_input, num_tensors, output->info())); @@ -106,10 +113,15 @@ void NEStackLayerKernel::configure(const ITensor *input, unsigned int axis, unsi INEKernel::configure(win_config.second); } -Status NEStackLayerKernel::validate(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output) +Status NEStackLayerKernel::validate(const ITensorInfo *input, + unsigned int axis, + unsigned int idx_input, + unsigned int num_tensors, + const ITensorInfo *output) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, axis, idx_input, num_tensors, output)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), axis, num_tensors, output->clone().get()).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(input->clone().get(), axis, num_tensors, output->clone().get()).first); return Status{}; } @@ -131,12 +143,15 @@ void NEStackLayerKernel::run(const Window &window, 
const ThreadInfo &info) const int stride_w = _output->info()->num_dimensions() >= 3 ? _output->info()->strides_in_bytes()[3] : 0; const int stride_k = _output->info()->num_dimensions() >= 4 ? _output->info()->strides_in_bytes()[4] : 0; - execute_window_loop(window, [&](const Coordinates & id) - { - Coordinates id_out = shift_from_axis_and_replace_coordinate(id, _axis, _idx_input); - const int idx = id_out[0] * stride_x + id_out[1] * stride_y + id_out[2] * stride_z + id_out[3] * stride_w + id_out[4] * stride_k; - std::memcpy(output.ptr() + idx, input.ptr(), _input->info()->element_size()); - }, - input); + execute_window_loop( + window, + [&](const Coordinates &id) + { + Coordinates id_out = shift_from_axis_and_replace_coordinate(id, _axis, _idx_input); + const int idx = id_out[0] * stride_x + id_out[1] * stride_y + id_out[2] * stride_z + id_out[3] * stride_w + + id_out[4] * stride_k; + std::memcpy(output.ptr() + idx, input.ptr(), _input->info()->element_size()); + }, + input); } } // namespace arm_compute diff --git a/src/core/NEON/kernels/NEStackLayerKernel.h b/src/core/NEON/kernels/NEStackLayerKernel.h index 9b36518e4d..685812b56d 100644 --- a/src/core/NEON/kernels/NEStackLayerKernel.h +++ b/src/core/NEON/kernels/NEStackLayerKernel.h @@ -26,6 +26,7 @@ #define ARM_COMPUTE_NESTACKLAYERKERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/NEON/INEKernel.h" namespace arm_compute @@ -64,7 +65,8 @@ public: * @param[out] output Output tensor. Data types supported: Same as @p input. * */ - void configure(const ITensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ITensor *output); + void configure( + const ITensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ITensor *output); /** Static function to check if given info will lead to a valid configuration of @ref NEStackLayerKernel * * @note Supported input tensor rank: up to 4 @@ -78,7 +80,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output); + static Status validate(const ITensorInfo *input, + unsigned int axis, + unsigned int idx_input, + unsigned int num_tensors, + const ITensorInfo *output); // Inherited methods overridden void run(const Window &window, const ThreadInfo &info) override; diff --git a/src/core/NEON/kernels/NEStridedSliceKernel.cpp b/src/core/NEON/kernels/NEStridedSliceKernel.cpp index 2b406a8b8b..efff51be9d 100644 --- a/src/core/NEON/kernels/NEStridedSliceKernel.cpp +++ b/src/core/NEON/kernels/NEStridedSliceKernel.cpp @@ -26,9 +26,10 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Window.h" #include "arm_compute/core/utils/helpers/tensor_transform.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Window.h" + #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -38,9 +39,14 @@ namespace arm_compute { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, 
+ int32_t end_mask, + int32_t shrink_axis_mask) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); @@ -49,19 +55,16 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, ARM_COMPUTE_RETURN_ERROR_ON(starts.num_dimensions() > input->num_dimensions()); ARM_COMPUTE_RETURN_ERROR_ON(ends.num_dimensions() > input->num_dimensions()); ARM_COMPUTE_RETURN_ERROR_ON(strides.num_dimensions() > input->num_dimensions()); - ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(strides.cbegin(), strides.cbegin() + strides.num_dimensions(), [](int i) - { - return i == 0; - })); + ARM_COMPUTE_RETURN_ERROR_ON( + std::any_of(strides.cbegin(), strides.cbegin() + strides.num_dimensions(), [](int i) { return i == 0; })); // Get expected output shape - const TensorShape exp_output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(*input, - starts, ends, strides, - begin_mask, end_mask, shrink_axis_mask); + const TensorShape exp_output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape( + *input, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); ARM_COMPUTE_RETURN_ERROR_ON(exp_output_shape.total_size() == 0); // Checks output if configured - if(output->total_size() != 0) + if (output->total_size() != 0) { const TensorInfo exp_output_info = output->clone()->set_tensor_shape(exp_output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &exp_output_info); @@ -71,14 +74,18 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, return Status{}; } -std::pair validate_and_configure_window(const ITensorInfo *input, ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +std::pair validate_and_configure_window(const ITensorInfo *input, + ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { // Output tensor auto initialization if not yet initialized - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(*input, - starts, ends, strides, - begin_mask, end_mask, shrink_axis_mask); + const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape( + *input, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape)); // Create window @@ -88,38 +95,49 @@ std::pair validate_and_configure_window(const ITensorInfo *input } } // namespace -NEStridedSliceKernel::NEStridedSliceKernel() - : _starts_abs(), _final_strides(), _shrink_mask() +NEStridedSliceKernel::NEStridedSliceKernel() : _starts_abs(), _final_strides(), _shrink_mask() { } -void NEStridedSliceKernel::configure(const ITensorInfo *input, ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +void NEStridedSliceKernel::configure(const ITensorInfo *input, + ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, 
shrink_axis_mask)); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask)); _shrink_mask = shrink_axis_mask; const TensorShape &input_shape = input->tensor_shape(); Coordinates ends_abs; - std::tie(_starts_abs, ends_abs, _final_strides) = arm_compute::helpers::tensor_transform::calculate_strided_slice_coords( - input_shape, - starts, ends, strides, - begin_mask, end_mask, shrink_axis_mask); + std::tie(_starts_abs, ends_abs, _final_strides) = + arm_compute::helpers::tensor_transform::calculate_strided_slice_coords(input_shape, starts, ends, strides, + begin_mask, end_mask, shrink_axis_mask); // Configure kernel window - auto win_config = validate_and_configure_window(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); + auto win_config = + validate_and_configure_window(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); INEKernel::configure(win_config.second); } -Status NEStridedSliceKernel::validate(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +Status NEStridedSliceKernel::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), - starts, ends, strides, begin_mask, end_mask, shrink_axis_mask) - .first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), starts, ends, + strides, begin_mask, end_mask, shrink_axis_mask) + .first); return Status{}; } @@ -156,7 +174,7 @@ void NEStridedSliceKernel::run_op(ITensorPack &tensors, const Window &window, co size_t length_x = win.shape()[0]; - if(_final_strides[0] == 1 && !is_shrink_x) + if (_final_strides[0] == 1 && !is_shrink_x) { win.set(Window::DimX, Window::Dimension(0, 1, 1)); width_size = width_size * length_x; @@ -183,16 +201,17 @@ void NEStridedSliceKernel::run_op(ITensorPack &tensors, const Window &window, co uint8_t *cur_ptr; execute_window_loop( - win, [&](const Coordinates & id) - { - cur_ptr = input_base; - cur_ptr += (start_0 + (id[idx_x] * shrinked_stride_0)) * byte_increment_0; - cur_ptr += (start_1 + (id[idx_y] * shrinked_stride_1)) * byte_increment_1; - cur_ptr += (start_2 + (id[idx_z] * shrinked_stride_2)) * byte_increment_2; - cur_ptr += (start_3 + (id[idx_w] * shrinked_stride_3)) * byte_increment_3; - - std::copy_n(cur_ptr, width_size, output_it.ptr()); - }, - output_it); + win, + [&](const Coordinates &id) + { + cur_ptr = input_base; + cur_ptr += (start_0 + (id[idx_x] * shrinked_stride_0)) * byte_increment_0; + cur_ptr += (start_1 + (id[idx_y] * shrinked_stride_1)) * byte_increment_1; + cur_ptr += (start_2 + (id[idx_z] * shrinked_stride_2)) * byte_increment_2; + cur_ptr += (start_3 + (id[idx_w] * shrinked_stride_3)) * byte_increment_3; + + std::copy_n(cur_ptr, width_size, output_it.ptr()); + }, + output_it); } } // namespace 
arm_compute diff --git a/src/core/NEON/kernels/NEStridedSliceKernel.h b/src/core/NEON/kernels/NEStridedSliceKernel.h index 9ce517417d..a475f09a17 100644 --- a/src/core/NEON/kernels/NEStridedSliceKernel.h +++ b/src/core/NEON/kernels/NEStridedSliceKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_NE_STRIDED_SLICE_KERNEL_H #include "arm_compute/core/Types.h" + #include "src/core/NEON/INEKernel.h" #include @@ -68,9 +69,14 @@ public: * @param[in] shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1. * A slice of size 1 starting from starts[i] in the dimension must be preserved. */ - void configure(const ITensorInfo *input, ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask); + void configure(const ITensorInfo *input, + ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask); /** Static function to check if given info will lead to a valid configuration of @ref NEStridedSliceKernel * @@ -86,9 +92,14 @@ public: * @param[in] shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1. * A slice of size 1 starting from starts[i] in the dimension must be preserved. */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; diff --git a/src/core/NEON/kernels/NETileKernel.cpp b/src/core/NEON/kernels/NETileKernel.cpp index 94256dc12d..577ce5b69e 100644 --- a/src/core/NEON/kernels/NETileKernel.cpp +++ b/src/core/NEON/kernels/NETileKernel.cpp @@ -27,9 +27,10 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -43,15 +44,13 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON(multiples.size() > 4); ARM_COMPUTE_RETURN_ERROR_ON(multiples.empty()); - ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(multiples.begin(), multiples.end(), [](uint32_t e) - { - return e == 0; - })); + ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(multiples.begin(), multiples.end(), [](uint32_t e) { return e == 0; })); // Validate output if initialized - if(output->total_size() != 0) + if (output->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(misc::shape_calculator::compute_tiled_shape(input->tensor_shape(), multiples), output->tensor_shape()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + 
misc::shape_calculator::compute_tiled_shape(input->tensor_shape(), multiples), output->tensor_shape()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); } @@ -59,8 +58,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c } } // namespace -NETileKernel::NETileKernel() - : _input(nullptr), _output(nullptr) +NETileKernel::NETileKernel() : _input(nullptr), _output(nullptr) { } @@ -95,8 +93,9 @@ void NETileKernel::run(const Window &window, const ThreadInfo &info) ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - Window output_window{ window }; - output_window.set(Window::DimX, Window::Dimension(output_window.x().start(), output_window.x().end(), _input->info()->dimension(0))); + Window output_window{window}; + output_window.set(Window::DimX, Window::Dimension(output_window.x().start(), output_window.x().end(), + _input->info()->dimension(0))); Window out_slice = output_window.first_slice_window_1D(); const auto src_shape = _input->info()->tensor_shape(); @@ -104,17 +103,19 @@ void NETileKernel::run(const Window &window, const ThreadInfo &info) { Iterator output_it(_output, out_slice); - execute_window_loop(out_slice, [&](const Coordinates & id) - { - const size_t x = id.x(); - const size_t y = id.y(); - const size_t z = id.z(); - const size_t w = id[3]; - Coordinates input_coords{ x % src_shape[0], y % src_shape[1], z % src_shape[2], w % src_shape[3] }; - memcpy(output_it.ptr(), _input->ptr_to_element(input_coords), _input->info()->dimension(0) * _input->info()->element_size()); - }, - output_it); - } - while(output_window.slide_window_slice_1D(out_slice)); + execute_window_loop( + out_slice, + [&](const Coordinates &id) + { + const size_t x = id.x(); + const size_t y = id.y(); + const size_t z = id.z(); + const size_t w = id[3]; + Coordinates input_coords{x % src_shape[0], y % src_shape[1], z % src_shape[2], w % src_shape[3]}; + memcpy(output_it.ptr(), _input->ptr_to_element(input_coords), + _input->info()->dimension(0) * _input->info()->element_size()); + }, + output_it); + } while (output_window.slide_window_slice_1D(out_slice)); } } // namespace arm_compute diff --git a/src/core/NEON/kernels/assembly/depthwise.hpp b/src/core/NEON/kernels/assembly/depthwise.hpp index dbd47ccfa9..13c2d314e4 100644 --- a/src/core/NEON/kernels/assembly/depthwise.hpp +++ b/src/core/NEON/kernels/assembly/depthwise.hpp @@ -38,9 +38,8 @@ struct DepthwiseConfig DepthwiseMethod method = DepthwiseMethod::DEFAULT; std::string filter = ""; - DepthwiseConfig(DepthwiseMethod method) - : method(method) {}; - DepthwiseConfig() {}; + DepthwiseConfig(DepthwiseMethod method) : method(method){}; + DepthwiseConfig(){}; }; struct DepthwiseArgs @@ -63,18 +62,24 @@ struct DepthwiseArgs bool fast_mode = false; - DepthwiseArgs( - const CPUInfo *cpu_info, - unsigned int kernel_rows, unsigned int kernel_cols, - unsigned int stride_rows, unsigned int stride_cols, - unsigned int dilation_rows, unsigned int dilation_cols, - unsigned int n_batches, unsigned int input_rows, unsigned int input_cols, - unsigned int input_channels, - unsigned int output_rows, unsigned int output_cols, - unsigned int channel_multiplier, - PaddingValues padding, arm_gemm::Activation activation, - - const DepthwiseConfig *config) + DepthwiseArgs(const CPUInfo *cpu_info, + unsigned int kernel_rows, + unsigned int kernel_cols, + unsigned int stride_rows, + unsigned int stride_cols, + unsigned int dilation_rows, + unsigned int dilation_cols, + 
unsigned int n_batches, + unsigned int input_rows, + unsigned int input_cols, + unsigned int input_channels, + unsigned int output_rows, + unsigned int output_cols, + unsigned int channel_multiplier, + PaddingValues padding, + arm_gemm::Activation activation, + + const DepthwiseConfig *config) : cpu_info(cpu_info), kernel_rows(kernel_rows), kernel_cols(kernel_cols), @@ -95,20 +100,38 @@ struct DepthwiseArgs { } - DepthwiseArgs( - const CPUInfo *cpu_info, - unsigned int kernel_rows, unsigned int kernel_cols, - unsigned int stride_rows, unsigned int stride_cols, - unsigned int n_batches, unsigned int input_rows, unsigned int input_cols, - unsigned int input_channels, - unsigned int output_rows, unsigned int output_cols, - unsigned int channel_multiplier, - PaddingValues padding, arm_gemm::Activation activation, - const DepthwiseConfig *config) - : DepthwiseArgs(cpu_info, kernel_rows, kernel_cols, stride_rows, - stride_cols, 1, 1, n_batches, input_rows, input_cols, - input_channels, output_rows, output_cols, - channel_multiplier, padding, activation, config) + DepthwiseArgs(const CPUInfo *cpu_info, + unsigned int kernel_rows, + unsigned int kernel_cols, + unsigned int stride_rows, + unsigned int stride_cols, + unsigned int n_batches, + unsigned int input_rows, + unsigned int input_cols, + unsigned int input_channels, + unsigned int output_rows, + unsigned int output_cols, + unsigned int channel_multiplier, + PaddingValues padding, + arm_gemm::Activation activation, + const DepthwiseConfig *config) + : DepthwiseArgs(cpu_info, + kernel_rows, + kernel_cols, + stride_rows, + stride_cols, + 1, + 1, + n_batches, + input_rows, + input_cols, + input_channels, + output_rows, + output_cols, + channel_multiplier, + padding, + activation, + config) { } }; @@ -127,17 +150,18 @@ struct Tile { } - Tile() - : Tile(nullptr, 0, 0, 0) + Tile() : Tile(nullptr, 0, 0, 0) { } - void load_from( - const TInput *input, - const unsigned int ld_row, const unsigned int ld_col, - const unsigned int n_rows, const unsigned int n_cols, - const int input_i, const int input_j, - const unsigned int channel_multiplier) const + void load_from(const TInput *input, + const unsigned int ld_row, + const unsigned int ld_col, + const unsigned int n_rows, + const unsigned int n_cols, + const int input_i, + const int input_j, + const unsigned int channel_multiplier) const { const auto pad_top = input_i < 0 ? -input_i : 0; const auto pad_left = input_j < 0 ? 
-input_j : 0; @@ -145,18 +169,15 @@ struct Tile const auto padded_rows = std::min(n_rows - input_i, tile_rows) - pad_top; const auto padded_cols = std::min(n_cols - input_j, tile_cols) - pad_left; - if(padded_rows < tile_rows || padded_cols < tile_cols) + if (padded_rows < tile_rows || padded_cols < tile_cols) { memset(array, 0, tile_rows * tile_cols * tile_channels * sizeof(TInput)); } - do_premultiply( - (TInput *)input + std::max(input_i, 0) * ld_row + std::max(input_j, 0) * ld_col, - ld_row, ld_col, - array + pad_top * tile_cols * tile_channels + pad_left * tile_channels, - tile_cols * tile_channels, tile_channels, - padded_rows, padded_cols, tile_channels / channel_multiplier, - channel_multiplier); + do_premultiply((TInput *)input + std::max(input_i, 0) * ld_row + std::max(input_j, 0) * ld_col, ld_row, + ld_col, array + pad_top * tile_cols * tile_channels + pad_left * tile_channels, + tile_cols * tile_channels, tile_channels, padded_rows, padded_cols, + tile_channels / channel_multiplier, channel_multiplier); } }; @@ -168,9 +189,8 @@ protected: std::string m_name{}; public: - DepthwiseCommon(const DepthwiseArgs &args) - : m_args(args) {}; - DepthwiseCommon(DepthwiseCommon &) = delete; + DepthwiseCommon(const DepthwiseArgs &args) : m_args(args){}; + DepthwiseCommon(DepthwiseCommon &) = delete; DepthwiseCommon &operator=(DepthwiseCommon &) = delete; std::string name() const override @@ -181,19 +201,18 @@ public: void set_name(std::string name) { // Only allow the name to be set once - if(m_name.empty()) + if (m_name.empty()) { m_name = name; } } - void execute( - const void *const input, - const void *const parameters, - void *const output, - void *const working_space, - const unsigned int thread_id, - const unsigned int n_threads) const override final + void execute(const void *const input, + const void *const parameters, + void *const output, + void *const working_space, + const unsigned int thread_id, + const unsigned int n_threads) const override final { const size_t ld_input_col = m_args.input_channels; const size_t ld_input_row = ld_input_col * m_args.input_cols; @@ -202,56 +221,47 @@ public: const size_t ld_output_row = ld_output_col * m_args.output_cols; const size_t ld_output_batch = ld_output_row * m_args.output_rows; - execute( - input, ld_input_col, ld_input_row, ld_input_batch, - parameters, output, ld_output_col, ld_output_row, ld_output_batch, - working_space, thread_id, n_threads); + execute(input, ld_input_col, ld_input_row, ld_input_batch, parameters, output, ld_output_col, ld_output_row, + ld_output_batch, working_space, thread_id, n_threads); } - void execute( - const void *const input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - const void *const parameters, - void *const output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *const working_space, - const unsigned int thread_id, - const unsigned int n_threads) const override final + void execute(const void *const input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + const void *const parameters, + void *const output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *const working_space, + const unsigned int thread_id, + const unsigned int n_threads) const override final { - execute( - m_args.n_batches, m_args.input_rows, m_args.input_cols, - m_args.input_channels, m_args.padding, - input, ld_input_col, ld_input_row, ld_input_batch, - parameters, - m_args.output_rows, m_args.output_cols, - output, 
ld_output_col, ld_output_row, ld_output_batch, - working_space, thread_id, n_threads); + execute(m_args.n_batches, m_args.input_rows, m_args.input_cols, m_args.input_channels, m_args.padding, input, + ld_input_col, ld_input_row, ld_input_batch, parameters, m_args.output_rows, m_args.output_cols, output, + ld_output_col, ld_output_row, ld_output_batch, working_space, thread_id, n_threads); } - void execute( - unsigned int batches, - unsigned int input_height, - unsigned int input_width, - unsigned int channels, - const PaddingValues &padding, - const void *input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - const void *parameters, - unsigned int output_height, - unsigned int output_width, - void *output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *working_space, - unsigned int thread_id, - unsigned int n_threads) const override final + void execute(unsigned int batches, + unsigned int input_height, + unsigned int input_width, + unsigned int channels, + const PaddingValues &padding, + const void *input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + const void *parameters, + unsigned int output_height, + unsigned int output_width, + void *output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int n_threads) const override final { // Construct a new set of arguments to reflect that we might have been // passed different input/output tensors. Dilation is handled at this @@ -271,38 +281,33 @@ public: auto ld_output_col_d = ld_output_col * m_args.dilation_cols; auto ld_output_row_d = ld_output_row * m_args.dilation_rows; - for(size_t drow = 0; drow < m_args.dilation_rows; drow++) + for (size_t drow = 0; drow < m_args.dilation_rows; drow++) { size_t start_i; - std::tie(args.output_rows, args.input_rows, start_i, - args.padding.top, args.padding.bottom) = - get_reduced_view_for_dilation( - output_height, input_height, drow, m_args.dilation_rows, - m_args.kernel_rows, m_args.stride_rows, padding.top); + std::tie(args.output_rows, args.input_rows, start_i, args.padding.top, args.padding.bottom) = + get_reduced_view_for_dilation(output_height, input_height, drow, m_args.dilation_rows, + m_args.kernel_rows, m_args.stride_rows, padding.top); auto input_row = static_cast(input) + start_i * ld_input_row; auto output_row = static_cast(output) + drow * ld_output_row; - if(args.output_rows) + if (args.output_rows) { - for(size_t dcol = 0; dcol < m_args.dilation_cols; dcol++) + for (size_t dcol = 0; dcol < m_args.dilation_cols; dcol++) { size_t start_j; - std::tie(args.output_cols, args.input_cols, start_j, - args.padding.left, args.padding.right) = - get_reduced_view_for_dilation( - output_width, input_width, dcol, m_args.dilation_cols, - m_args.kernel_cols, m_args.stride_cols, padding.left); + std::tie(args.output_cols, args.input_cols, start_j, args.padding.left, args.padding.right) = + get_reduced_view_for_dilation(output_width, input_width, dcol, m_args.dilation_cols, + m_args.kernel_cols, m_args.stride_cols, padding.left); const TInput *input_col = input_row + start_j * ld_input_col; TOutput *output_col = output_row + dcol * ld_output_col; - if(args.output_cols) + if (args.output_cols) { - this->execute_internal( - args, input_col, ld_input_col_d, ld_input_row_d, ld_input_batch, parameters, - output_col, ld_output_col_d, ld_output_row_d, ld_output_batch, - working_space, thread_id, n_threads); + this->execute_internal(args, 
input_col, ld_input_col_d, ld_input_row_d, ld_input_batch, + parameters, output_col, ld_output_col_d, ld_output_row_d, + ld_output_batch, working_space, thread_id, n_threads); } } } @@ -310,20 +315,19 @@ public: } protected: - virtual void execute_internal( - const DepthwiseArgs &instance_args, - const void *input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - const void *parameters, - void *output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *working_space, - unsigned int thread_id, - unsigned int n_threads) const = 0; + virtual void execute_internal(const DepthwiseArgs &instance_args, + const void *input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + const void *parameters, + void *output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int n_threads) const = 0; virtual bool uses_premultiply() const { diff --git a/src/core/NEON/kernels/assembly/depthwise_common.hpp b/src/core/NEON/kernels/assembly/depthwise_common.hpp index a5db793b3d..5ff848e281 100644 --- a/src/core/NEON/kernels/assembly/depthwise_common.hpp +++ b/src/core/NEON/kernels/assembly/depthwise_common.hpp @@ -49,11 +49,7 @@ struct KernelDescription bool is_default = false; uint64_t cycle_estimate = 0; - KernelDescription( - DepthwiseMethod method, - std::string name, - bool is_default, - uint64_t cycle_estimate) + KernelDescription(DepthwiseMethod method, std::string name, bool is_default, uint64_t cycle_estimate) : method(method), name(name), is_default(is_default), cycle_estimate(cycle_estimate) { } @@ -78,58 +74,51 @@ public: // pointer the bias vector (which may be nullptr in the case of no bias) and // a pointer to the array of weights (stored in HWIO order). virtual void pack_parameters( - void *buffer, - const void *biases, - const void *weights, - size_t ld_weight_col = 0, - size_t ld_weight_row = 0) = 0; + void *buffer, const void *biases, const void *weights, size_t ld_weight_col = 0, size_t ld_weight_row = 0) = 0; // Determine the amount of working space required virtual size_t get_working_size(unsigned int n_threads) const = 0; // Execute the convolution over the specified area of memory. 
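For context on the stride-less execute() overload declared below (and on the DepthwiseCommon implementation shown earlier, which derives ld_input_col/row/batch from the tensor geometry): for a densely packed NHWC tensor the leading dimensions, in elements, follow directly from the shape. A minimal sketch with hypothetical names:

    #include <cstddef>

    struct NhwcStrides { std::size_t ld_col, ld_row, ld_batch; };

    // Element strides of a densely packed NHWC tensor: one column step covers `channels`
    // elements, one row step covers `cols` columns, one batch step covers `rows` rows.
    inline NhwcStrides packed_nhwc_strides(std::size_t rows, std::size_t cols, std::size_t channels)
    {
        NhwcStrides s;
        s.ld_col   = channels;
        s.ld_row   = s.ld_col * cols;
        s.ld_batch = s.ld_row * rows;
        return s;
    }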
- virtual void execute( - const void *input, // Pointer to input tensor - const void *parameters, // Packed parameters buffer - void *output, - void *working_space, - unsigned int thread_id, - unsigned int n_threads) const = 0; - - virtual void execute( - const void *input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - const void *parameters, - void *output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *working_space, - unsigned int thread_id, - unsigned int n_threads) const = 0; - - virtual void execute( - unsigned int batches, - unsigned int input_height, - unsigned int input_width, - unsigned int channels, - const PaddingValues &, - const void *input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - const void *parameters, - unsigned int output_height, - unsigned int output_width, - void *output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *working_space, - unsigned int thread_id, - unsigned int n_threads) const = 0; + virtual void execute(const void *input, // Pointer to input tensor + const void *parameters, // Packed parameters buffer + void *output, + void *working_space, + unsigned int thread_id, + unsigned int n_threads) const = 0; + + virtual void execute(const void *input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + const void *parameters, + void *output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int n_threads) const = 0; + + virtual void execute(unsigned int batches, + unsigned int input_height, + unsigned int input_width, + unsigned int channels, + const PaddingValues &, + const void *input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + const void *parameters, + unsigned int output_height, + unsigned int output_width, + void *output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int n_threads) const = 0; }; // To handle a dilation factor of D execute the kernel once for each d in @@ -145,12 +134,13 @@ public: // - Number of valid input pixels corresponding to `d` // - Offset of the first pixel corresponding to `d` // - Amount of padding in the view for `d` -std::tuple -get_reduced_view_for_dilation( - size_t out_size, size_t in_size, - size_t d, size_t dilation_factor, - size_t kernel_size, size_t stride, - size_t pad_before); +std::tuple get_reduced_view_for_dilation(size_t out_size, + size_t in_size, + size_t d, + size_t dilation_factor, + size_t kernel_size, + size_t stride, + size_t pad_before); } // namespace depthwise } // namespace arm_conv diff --git a/src/core/NEON/kernels/assembly/pool_common.hpp b/src/core/NEON/kernels/assembly/pool_common.hpp index f1f70cf1d6..045f9f95d3 100644 --- a/src/core/NEON/kernels/assembly/pool_common.hpp +++ b/src/core/NEON/kernels/assembly/pool_common.hpp @@ -68,45 +68,42 @@ public: virtual size_t get_working_size(unsigned int num_threads) const = 0; // Execute pooling over the specified area of memory. 
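Returning briefly to the dilation note in depthwise_common.hpp above: assuming the usual convention that sub-problem d handles every D-th output pixel starting at pixel d, the "valid output pixels for d" part of the returned tuple works out as sketched below; the remaining entries (input extent, start offset, padding) depend on kernel size, stride and padding and are computed by get_reduced_view_for_dilation itself. A rough illustration only, not the library code:

    #include <cstddef>

    // Number of output pixels handled by dilation sub-problem `d`: every
    // `dilation`-th pixel starting from `d` within an output of `out_size` pixels.
    inline std::size_t outputs_for_dilation_phase(std::size_t out_size, std::size_t d, std::size_t dilation)
    {
        return (d < out_size) ? (out_size - d + dilation - 1) / dilation : 0;
    }

    // Example: out_size = 10, dilation = 3 -> phases d = 0, 1, 2 cover 4, 3 and 3 pixels.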
- virtual void execute( - const void *const input, - void *const output, - void *working_space, - unsigned int thread_id, - unsigned int num_threads) const = 0; + virtual void execute(const void *const input, + void *const output, + void *working_space, + unsigned int thread_id, + unsigned int num_threads) const = 0; - virtual void execute( - const void *const input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - void *const output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *working_space, - unsigned int thread_id, - unsigned int num_threads) const = 0; + virtual void execute(const void *const input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + void *const output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int num_threads) const = 0; - virtual void execute( - unsigned int batches, - unsigned int height, - unsigned int width, - unsigned int channels, - const void *const input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - const PaddingValues &, - unsigned int output_height, - unsigned int output_width, - void *const output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *working_space, - unsigned int thread_id, - unsigned int num_threads) const = 0; + virtual void execute(unsigned int batches, + unsigned int height, + unsigned int width, + unsigned int channels, + const void *const input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + const PaddingValues &, + unsigned int output_height, + unsigned int output_width, + void *const output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int num_threads) const = 0; }; } // namespace pooling diff --git a/src/core/NEON/kernels/assembly/pooling.hpp b/src/core/NEON/kernels/assembly/pooling.hpp index e8db35c593..89d594298e 100644 --- a/src/core/NEON/kernels/assembly/pooling.hpp +++ b/src/core/NEON/kernels/assembly/pooling.hpp @@ -36,9 +36,8 @@ struct PoolingConfig PoolingMethod method = PoolingMethod::DEFAULT; std::string filter = ""; - PoolingConfig(PoolingMethod method) - : method(method) {}; - PoolingConfig() {}; + PoolingConfig(PoolingMethod method) : method(method){}; + PoolingConfig(){}; }; struct PoolingArgs @@ -57,30 +56,40 @@ struct PoolingArgs const PoolingConfig *config; - PoolingArgs( - const CPUInfo *cpu_info, - PoolingType pool_type, - const PoolingWindow &window, - const PoolingStride &stride, - bool exclude_padding, - unsigned int n_batches, - unsigned int input_rows, - unsigned int input_cols, - unsigned int n_channels, - unsigned int output_rows, - unsigned int output_cols, - const PaddingValues &padding, - const PoolingConfig *cfg) - : cpu_info(cpu_info), pool_type(pool_type), pool_window(window), pool_stride(stride), exclude_padding(exclude_padding), n_batches(n_batches), input_rows(input_rows), input_cols(input_cols), - n_channels(n_channels), output_rows(output_rows), output_cols(output_cols), padding(padding), config(cfg) + PoolingArgs(const CPUInfo *cpu_info, + PoolingType pool_type, + const PoolingWindow &window, + const PoolingStride &stride, + bool exclude_padding, + unsigned int n_batches, + unsigned int input_rows, + unsigned int input_cols, + unsigned int n_channels, + unsigned int output_rows, + unsigned int output_cols, + const PaddingValues &padding, + const 
PoolingConfig *cfg) + : cpu_info(cpu_info), + pool_type(pool_type), + pool_window(window), + pool_stride(stride), + exclude_padding(exclude_padding), + n_batches(n_batches), + input_rows(input_rows), + input_cols(input_cols), + n_channels(n_channels), + output_rows(output_rows), + output_cols(output_cols), + padding(padding), + config(cfg) { // If either of the pooling window dimensions are set to zero, meaning // "pool everything", then replace with the corresponding input dimension. - if(pool_window.rows == 0) + if (pool_window.rows == 0) { pool_window.rows = input_rows; } - if(pool_window.cols == 0) + if (pool_window.cols == 0) { pool_window.cols = input_cols; } @@ -100,10 +109,16 @@ struct Requantize32 int32_t per_layer_right_shift = 0; int32_t per_layer_mul = 0; - Requantize32(int32_t input_offset, int32_t output_offset, - int32_t per_layer_left_shift, int32_t per_layer_right_shift, + Requantize32(int32_t input_offset, + int32_t output_offset, + int32_t per_layer_left_shift, + int32_t per_layer_right_shift, int32_t per_layer_mul) - : input_offset(input_offset), output_offset(output_offset), per_layer_left_shift(per_layer_left_shift), per_layer_right_shift(per_layer_right_shift), per_layer_mul(per_layer_mul) + : input_offset(input_offset), + output_offset(output_offset), + per_layer_left_shift(per_layer_left_shift), + per_layer_right_shift(per_layer_right_shift), + per_layer_mul(per_layer_mul) { } }; @@ -115,105 +130,88 @@ protected: const PoolingArgs m_args; public: - PoolingCommon(const PoolingArgs &args) - : m_args(args) + PoolingCommon(const PoolingArgs &args) : m_args(args) { } - PoolingCommon(PoolingCommon &) = delete; + PoolingCommon(PoolingCommon &) = delete; PoolingCommon &operator=(PoolingCommon &) = delete; size_t get_working_size(unsigned int) const override = 0; // Execute pooling over the specified area of memory. 
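A small illustration of the "pool everything" convention handled in the PoolingArgs constructor above: a pooling window dimension of zero is replaced by the corresponding input extent. The standalone helper below is hypothetical and only mirrors that normalisation:

    struct Window2D { unsigned int rows, cols; };

    // A window dimension of 0 means "pool everything" along that axis, so it is
    // replaced by the corresponding input extent.
    inline Window2D normalise_pool_window(Window2D window, unsigned int input_rows, unsigned int input_cols)
    {
        if (window.rows == 0) { window.rows = input_rows; }
        if (window.cols == 0) { window.cols = input_cols; }
        return window;
    }

    // Example: a {0, 0} window over a 7x7 input becomes a 7x7 (global) pooling window.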
- void execute( - const void *const input, - void *const output, - void *working_space, - unsigned int thread_id, - unsigned int num_threads) const override + void execute(const void *const input, + void *const output, + void *working_space, + unsigned int thread_id, + unsigned int num_threads) const override { - this->execute( - input, - m_args.n_channels, - m_args.n_channels * m_args.input_cols, - m_args.n_channels * m_args.input_cols * m_args.input_rows, - output, - m_args.n_channels, - m_args.n_channels * m_args.output_cols, - m_args.n_channels * m_args.output_cols * m_args.output_rows, - working_space, - thread_id, num_threads); + this->execute(input, m_args.n_channels, m_args.n_channels * m_args.input_cols, + m_args.n_channels * m_args.input_cols * m_args.input_rows, output, m_args.n_channels, + m_args.n_channels * m_args.output_cols, + m_args.n_channels * m_args.output_cols * m_args.output_rows, working_space, thread_id, + num_threads); } - void execute( - const void *const input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - void *const output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *working_space, - unsigned int thread_id, - unsigned int num_threads) const override + void execute(const void *const input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + void *const output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int num_threads) const override { - this->execute( - m_args.n_batches, m_args.input_rows, m_args.input_cols, m_args.n_channels, - input, ld_input_col, ld_input_row, ld_input_batch, - m_args.padding, m_args.output_rows, m_args.output_cols, - output, ld_output_col, ld_output_row, ld_output_batch, - working_space, thread_id, num_threads); + this->execute(m_args.n_batches, m_args.input_rows, m_args.input_cols, m_args.n_channels, input, ld_input_col, + ld_input_row, ld_input_batch, m_args.padding, m_args.output_rows, m_args.output_cols, output, + ld_output_col, ld_output_row, ld_output_batch, working_space, thread_id, num_threads); } - void execute( - unsigned int batches, - unsigned int height, - unsigned int width, - unsigned int channels, - const void *const input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - const PaddingValues &padding, - unsigned int output_height, - unsigned int output_width, - void *const output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *working_space, - unsigned int thread_id, - unsigned int num_threads) const override + void execute(unsigned int batches, + unsigned int height, + unsigned int width, + unsigned int channels, + const void *const input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + const PaddingValues &padding, + unsigned int output_height, + unsigned int output_width, + void *const output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int num_threads) const override { - this->execute_internal( - batches, height, width, channels, padding, - input, ld_input_col, ld_input_row, ld_input_batch, - output_height, output_width, - output, ld_output_col, ld_output_row, ld_output_batch, - working_space, thread_id, num_threads); + this->execute_internal(batches, height, width, channels, padding, input, ld_input_col, ld_input_row, + ld_input_batch, output_height, output_width, output, 
ld_output_col, ld_output_row, + ld_output_batch, working_space, thread_id, num_threads); } protected: - virtual void execute_internal( - unsigned int batches, - unsigned int height, - unsigned int width, - unsigned int channels, - const PaddingValues &, - const void *const input, - size_t ld_input_col, - size_t ld_input_row, - size_t ld_input_batch, - unsigned int output_height, - unsigned int output_width, - void *const output, - size_t ld_output_col, - size_t ld_output_row, - size_t ld_output_batch, - void *working_space, - unsigned int thread_id, - unsigned int num_threads) const = 0; + virtual void execute_internal(unsigned int batches, + unsigned int height, + unsigned int width, + unsigned int channels, + const PaddingValues &, + const void *const input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + unsigned int output_height, + unsigned int output_width, + void *const output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int num_threads) const = 0; }; template diff --git a/src/core/NEON/kernels/assembly/premultiply.hpp b/src/core/NEON/kernels/assembly/premultiply.hpp index 16f26de38a..fb97cf8baf 100644 --- a/src/core/NEON/kernels/assembly/premultiply.hpp +++ b/src/core/NEON/kernels/assembly/premultiply.hpp @@ -44,30 +44,27 @@ void do_premultiply(const T *in_ptr, const unsigned input_channels, const unsigned int channel_multiplier) { - if(sizeof(T) == 4 && channel_multiplier == 6) + if (sizeof(T) == 4 && channel_multiplier == 6) { - do_premultiply_float_6( - (const float *)in_ptr, ld_row, ld_col, - (float *)out_ptr, out_ld_row, out_ld_col, - tile_rows, tile_cols, - input_channels); + do_premultiply_float_6((const float *)in_ptr, ld_row, ld_col, (float *)out_ptr, out_ld_row, out_ld_col, + tile_rows, tile_cols, input_channels); } else { - for(unsigned int i = 0; i < tile_rows; i++) + for (unsigned int i = 0; i < tile_rows; i++) { const T *ip2 = in_ptr + i * ld_row; T *op2 = out_ptr + i * out_ld_row; - for(unsigned int j = 0; j < tile_cols; j++) + for (unsigned int j = 0; j < tile_cols; j++) { const T *ip = ip2; T *op = op2; - for(unsigned int c = 0; c < input_channels; c++) + for (unsigned int c = 0; c < input_channels; c++) { T val = *ip; ip++; - for(unsigned int r = 0; r < channel_multiplier; r++) + for (unsigned int r = 0; r < channel_multiplier; r++) { op[r] = val; } diff --git a/src/core/NEON/kernels/assembly/winograd.hpp b/src/core/NEON/kernels/assembly/winograd.hpp index 50290757ec..dbf95d23cd 100644 --- a/src/core/NEON/kernels/assembly/winograd.hpp +++ b/src/core/NEON/kernels/assembly/winograd.hpp @@ -45,17 +45,24 @@ struct ConvolutionArgs Shape2D kernel_shape; arm_gemm::Activation activation; - ConvolutionArgs( - unsigned int n_batches, - const Shape2D &input_shape, - unsigned int n_input_channels, - unsigned int pad_top, unsigned int pad_left, - const Shape2D &output_shape, - unsigned int n_output_channels, - const Shape2D kernel_shape, - const arm_gemm::Activation &activation = {}) - : n_batches(n_batches), input_shape(input_shape), n_input_channels(n_input_channels), pad_top(pad_top), pad_left(pad_left), output_shape(output_shape), n_output_channels(n_output_channels), - kernel_shape(kernel_shape), activation(activation) + ConvolutionArgs(unsigned int n_batches, + const Shape2D &input_shape, + unsigned int n_input_channels, + unsigned int pad_top, + unsigned int pad_left, + const Shape2D &output_shape, + unsigned int n_output_channels, + const Shape2D 
kernel_shape, + const arm_gemm::Activation &activation = {}) + : n_batches(n_batches), + input_shape(input_shape), + n_input_channels(n_input_channels), + pad_top(pad_top), + pad_left(pad_left), + output_shape(output_shape), + n_output_channels(n_output_channels), + kernel_shape(kernel_shape), + activation(activation) { } }; @@ -105,23 +112,30 @@ public: virtual unsigned int get_transformed_tile_rows(void) const = 0; virtual unsigned int get_transformed_tile_cols(void) const = 0; - void execute( - const ConvolutionArgs &args, - const void *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_input_channel, - void *outptr, const WinogradDomainSpec &wds, - unsigned int thread_id, unsigned int n_threads) const + void execute(const ConvolutionArgs &args, + const void *inptr, + size_t ld_in_row, + size_t ld_in_col, + size_t ld_input_channel, + void *outptr, + const WinogradDomainSpec &wds, + unsigned int thread_id, + unsigned int n_threads) const { - this->execute( - args, inptr, ld_in_row, ld_in_col, ld_input_channel, - outptr, wds.weight_ld_matrix, wds.weight_ld_row, - thread_id, n_threads); + this->execute(args, inptr, ld_in_row, ld_in_col, ld_input_channel, outptr, wds.weight_ld_matrix, + wds.weight_ld_row, thread_id, n_threads); } - virtual void execute( - const ConvolutionArgs &args, - const void *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_input_channel, - void *outptr, size_t ld_out_matrix, size_t ld_out_row, - unsigned int thread_id, unsigned int n_threads) const = 0; + virtual void execute(const ConvolutionArgs &args, + const void *inptr, + size_t ld_in_row, + size_t ld_in_col, + size_t ld_input_channel, + void *outptr, + size_t ld_out_matrix, + size_t ld_out_row, + unsigned int thread_id, + unsigned int n_threads) const = 0; }; } // namespace weight_transform @@ -136,27 +150,35 @@ public: virtual unsigned int get_input_rows(void) const = 0; virtual unsigned int get_input_cols(void) const = 0; - virtual size_t get_working_space_size( - const ConvolutionArgs &args, - unsigned int n_threads) const = 0; - - void execute( - const ConvolutionArgs &args, - const void *inptr, size_t ld_in_batch, size_t ld_in_row, size_t ld_in_col, - void *outptr, const WinogradDomainSpec &wds, - void *working_space, unsigned int thread_id, unsigned int n_threads) const + virtual size_t get_working_space_size(const ConvolutionArgs &args, unsigned int n_threads) const = 0; + + void execute(const ConvolutionArgs &args, + const void *inptr, + size_t ld_in_batch, + size_t ld_in_row, + size_t ld_in_col, + void *outptr, + const WinogradDomainSpec &wds, + void *working_space, + unsigned int thread_id, + unsigned int n_threads) const { - this->execute( - args, inptr, ld_in_batch, ld_in_row, ld_in_col, - outptr, wds.input_ld_batch, wds.input_ld_matrix, wds.input_ld_row, - working_space, thread_id, n_threads); + this->execute(args, inptr, ld_in_batch, ld_in_row, ld_in_col, outptr, wds.input_ld_batch, wds.input_ld_matrix, + wds.input_ld_row, working_space, thread_id, n_threads); } - virtual void execute( - const ConvolutionArgs &args, - const void *inptr, size_t ld_in_batch, size_t ld_in_row, size_t ld_in_col, - void *outptr, size_t ld_out_batch, size_t ld_out_matrix, size_t ld_out_row, - void *working_space, unsigned int thread_id, unsigned int n_threads) const = 0; + virtual void execute(const ConvolutionArgs &args, + const void *inptr, + size_t ld_in_batch, + size_t ld_in_row, + size_t ld_in_col, + void *outptr, + size_t ld_out_batch, + size_t ld_out_matrix, + size_t ld_out_row, + void *working_space, + 
unsigned int thread_id, + unsigned int n_threads) const = 0; }; } // namespace input_transform @@ -177,31 +199,37 @@ public: virtual unsigned int get_kernel_rows(void) const = 0; virtual unsigned int get_kernel_cols(void) const = 0; - virtual size_t get_working_space_size( - const ConvolutionArgs &args, - unsigned int n_threads) const = 0; - - void execute( - const ConvolutionArgs &args, - const void *inptr, const WinogradDomainSpec &wds, - const void *bias, - void *outptr, size_t ld_out_batch, size_t ld_out_row, size_t ld_out_col, - void *working_space, unsigned int thread_id, unsigned int n_threads) const + virtual size_t get_working_space_size(const ConvolutionArgs &args, unsigned int n_threads) const = 0; + + void execute(const ConvolutionArgs &args, + const void *inptr, + const WinogradDomainSpec &wds, + const void *bias, + void *outptr, + size_t ld_out_batch, + size_t ld_out_row, + size_t ld_out_col, + void *working_space, + unsigned int thread_id, + unsigned int n_threads) const { - this->execute( - args, - inptr, wds.output_ld_batch, wds.output_ld_matrix, wds.output_ld_row, - bias, - outptr, ld_out_batch, ld_out_row, ld_out_col, - working_space, thread_id, n_threads); + this->execute(args, inptr, wds.output_ld_batch, wds.output_ld_matrix, wds.output_ld_row, bias, outptr, + ld_out_batch, ld_out_row, ld_out_col, working_space, thread_id, n_threads); } - virtual void execute( - const ConvolutionArgs &args, - const void *inptr, size_t ld_in_batch, size_t ld_in_matrix, size_t ld_in_row, - const void *bias, - void *outptr, size_t ld_out_batch, size_t ld_out_row, size_t ld_out_col, - void *working_space, unsigned int thread_id, unsigned int n_threads) const = 0; + virtual void execute(const ConvolutionArgs &args, + const void *inptr, + size_t ld_in_batch, + size_t ld_in_matrix, + size_t ld_in_row, + const void *bias, + void *outptr, + size_t ld_out_batch, + size_t ld_out_row, + size_t ld_out_col, + void *working_space, + unsigned int thread_id, + unsigned int n_threads) const = 0; }; } // namespace output_transform @@ -210,7 +238,7 @@ struct WinogradImpl { const output_transform::ITransform *output_transform = nullptr; const weight_transform::ITransform *weight_transform = nullptr; - const input_transform::ITransform *input_transform = nullptr; + const input_transform::ITransform *input_transform = nullptr; std::unique_ptr gemm_args; WinogradDomainSpec winograd_spec; }; @@ -220,15 +248,18 @@ struct WinogradImpl * Assigns to the pointers in the `dest` struct and returns true or false to * indicate whether the given problem can be executed or not. 
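For orientation, the interfaces reformatted above compose into a three-stage Winograd pipeline (weight transform, input transform, output transform) selected at runtime through get_implementation. The following is an illustrative construction sketch, not part of the applied patch, assuming Shape2D is the small rows/cols aggregate declared earlier in this header (not shown in this excerpt); all values and variable names are made up:

    // Illustrative only: problem sizes and names are invented for the example.
    ConvolutionArgs conv_args(
        /* n_batches         */ 1,
        /* input_shape       */ Shape2D{56, 56},
        /* n_input_channels  */ 64,
        /* pad_top           */ 1,
        /* pad_left          */ 1,
        /* output_shape      */ Shape2D{56, 56},
        /* n_output_channels */ 64,
        /* kernel_shape      */ Shape2D{3, 3}); // activation defaults to none

    WinogradImpl impl;
    // get_implementation(impl, &cpu_info, conv_args, max_threads, fast_mode, nullptr, nullptr)
    // fills impl with matching weight/input/output transforms plus GEMM arguments and a
    // WinogradDomainSpec describing the strides between stages; it returns false when no
    // implementation can handle the given problem. Execution then proceeds:
    // weight transform -> input transform -> batched GEMM -> output transform.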
*/ -template -bool get_implementation( - WinogradImpl &dest, // Destination for the selected implementation - const CPUInfo *, - const ConvolutionArgs &, - int max_threads, - bool fast_mode, - const WinogradConfig *, - const arm_gemm::GemmConfig *); +template +bool get_implementation(WinogradImpl &dest, // Destination for the selected implementation + const CPUInfo *, + const ConvolutionArgs &, + int max_threads, + bool fast_mode, + const WinogradConfig *, + const arm_gemm::GemmConfig *); } // namespace winograd } // namespace arm_conv diff --git a/src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp b/src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp index ed5254a0a4..e3d9b670b3 100644 --- a/src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp +++ b/src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp @@ -24,8 +24,9 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" -#include "src/core/NEON/NEMath.h" + #include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h" +#include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" #include @@ -37,12 +38,26 @@ namespace arm_compute { namespace { -using BatchNomalizationPtr = void (*)(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, - float epsilon, ActivationLayerInfo &act_info, const Window &window); +using BatchNomalizationPtr = void (*)(ITensor *src, + ITensor *dst, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo &act_info, + const Window &window); template -void batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, - float epsilon, ActivationLayerInfo &act_info, const Window &window) +void batch_normalization(ITensor *src, + ITensor *dst, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo &act_info, + const Window &window) { /** SIMD vector tag type. */ using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; @@ -57,86 +72,99 @@ void batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const Iterator input(src, win_collapsed); Iterator output(dst, win_collapsed); - const auto input_mean = reinterpret_cast(mean->ptr_to_element(Coordinates(0, 0))); - const auto input_var = reinterpret_cast(var->ptr_to_element(Coordinates(0, 0))); - const auto input_gamma = (gamma != nullptr) ? reinterpret_cast(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; - const auto input_beta = (beta != nullptr) ? reinterpret_cast(beta->ptr_to_element(Coordinates(0, 0))) : nullptr; + const auto input_mean = reinterpret_cast(mean->ptr_to_element(Coordinates(0, 0))); + const auto input_var = reinterpret_cast(var->ptr_to_element(Coordinates(0, 0))); + const auto input_gamma = + (gamma != nullptr) ? reinterpret_cast(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; + const auto input_beta = + (beta != nullptr) ? 
reinterpret_cast(beta->ptr_to_element(Coordinates(0, 0))) : nullptr; T activation_functor(act_info); const auto epsilon_vec = wrapper::vdup_n(static_cast(epsilon), ExactTagType{}); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - // Perform core calculations using vector operations - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - // Conctruct vectors - const auto mean_vec = wrapper::vloadq(input_mean + x); - const auto var_vec = wrapper::vloadq(input_var + x); - const auto gamma_vec = (input_gamma != nullptr) ? wrapper::vloadq(input_gamma + x) : wrapper::vdup_n(static_cast(1.f), ExactTagType{}); - const auto beta_vec = (input_beta != nullptr) ? wrapper::vloadq(input_beta + x) : wrapper::vdup_n(static_cast(0.f), ExactTagType{}); - - // Calculate denominator - const auto denominator = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)); - - // Calculate x bar - const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec); - const auto x_bar = wrapper::vmul(numerator, denominator); - auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec); - - // Perform fused activation - if(act_info.enabled()) + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + // Perform core calculations using vector operations + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - activation_functor(res); + // Conctruct vectors + const auto mean_vec = wrapper::vloadq(input_mean + x); + const auto var_vec = wrapper::vloadq(input_var + x); + const auto gamma_vec = (input_gamma != nullptr) + ? wrapper::vloadq(input_gamma + x) + : wrapper::vdup_n(static_cast(1.f), ExactTagType{}); + const auto beta_vec = (input_beta != nullptr) + ? wrapper::vloadq(input_beta + x) + : wrapper::vdup_n(static_cast(0.f), ExactTagType{}); + + // Calculate denominator + const auto denominator = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)); + + // Calculate x bar + const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec); + const auto x_bar = wrapper::vmul(numerator, denominator); + auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec); + + // Perform fused activation + if (act_info.enabled()) + { + activation_functor(res); + } + + // Store results + wrapper::vstore(output_ptr + x, res); } - // Store results - wrapper::vstore(output_ptr + x, res); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - // Conctruct vectors - const float16_t gamma = (input_gamma != nullptr) ? input_gamma[x] : 1.f; - const float16_t beta = (input_beta != nullptr) ? input_beta[x] : 0.f; - - const float16_t denominator = sqrt(input_var[x] + epsilon); - const float16_t numerator = input_ptr[x] - input_mean[x]; - const float16_t x_bar = numerator / denominator; - float16_t res = beta + x_bar * gamma; - - // Perform fused activation - if(act_info.enabled()) + // Compute left-over elements + for (; x < window_end_x; ++x) { - activation_functor(res); + // Conctruct vectors + const float16_t gamma = (input_gamma != nullptr) ? input_gamma[x] : 1.f; + const float16_t beta = (input_beta != nullptr) ? 
input_beta[x] : 0.f; + + const float16_t denominator = sqrt(input_var[x] + epsilon); + const float16_t numerator = input_ptr[x] - input_mean[x]; + const float16_t x_bar = numerator / denominator; + float16_t res = beta + x_bar * gamma; + + // Perform fused activation + if (act_info.enabled()) + { + activation_functor(res); + } + + // Store results + *reinterpret_cast(output_ptr + x) = res; } - - // Store results - *reinterpret_cast(output_ptr + x) = res; - } - }, - input, output); + }, + input, output); } // Fused Batched Normalization with activation functions -static std::map fused_map = -{ - { ActivationLayerInfo::ActivationFunction::RELU, &batch_normalization> }, - { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &batch_normalization> }, - { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &batch_normalization> } -}; -} +static std::map fused_map = { + {ActivationLayerInfo::ActivationFunction::RELU, &batch_normalization>}, + {ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &batch_normalization>}, + {ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &batch_normalization>}}; +} // namespace namespace cpu { -void fp16_neon_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, - float epsilon, ActivationLayerInfo &act_info, const Window &window) +void fp16_neon_batch_normalization(ITensor *src, + ITensor *dst, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo &act_info, + const Window &window) { - if(act_info.enabled()) + if (act_info.enabled()) { fused_map[act_info.activation()](src, dst, mean, var, beta, gamma, epsilon, act_info, window); } diff --git a/src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp b/src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp index d6e22e1843..4e1654ee6b 100644 --- a/src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp +++ b/src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp @@ -24,8 +24,9 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" -#include "src/core/NEON/NEMath.h" + #include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h" +#include "src/core/NEON/NEMath.h" #include "src/core/NEON/wrapper/wrapper.h" #include @@ -36,12 +37,26 @@ namespace arm_compute { namespace { -using BatchNomalizationPtr = void (*)(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, - float epsilon, ActivationLayerInfo &act_info, const Window &window); +using BatchNomalizationPtr = void (*)(ITensor *src, + ITensor *dst, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo &act_info, + const Window &window); template -void batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, - float epsilon, ActivationLayerInfo &act_info, const Window &window) +void batch_normalization(ITensor *src, + ITensor *dst, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo &act_info, + const Window &window) { /** SIMD vector tag type. 
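For reference, the computation that both the vectorised main loop and the scalar tail above perform for every element is the textbook batch-normalisation formula followed by an optional fused activation. A minimal scalar sketch, not part of the patch, with illustrative names and float used in place of float16_t for clarity:

    #include <algorithm>
    #include <cmath>

    float batch_norm_element(float x, float mean, float var, float gamma, float beta, float epsilon)
    {
        // When the gamma/beta tensors are absent, the kernel substitutes 1.f and 0.f.
        const float x_bar = (x - mean) / std::sqrt(var + epsilon); // normalise
        float       res   = gamma * x_bar + beta;                  // scale and shift
        // One of the fused activations handled above (RELU) would simply be:
        // res = std::max(res, 0.0f);
        return res;
    }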
*/ using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; @@ -56,86 +71,99 @@ void batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const Iterator input(src, win_collapsed); Iterator output(dst, win_collapsed); - const auto input_mean = reinterpret_cast(mean->ptr_to_element(Coordinates(0, 0))); - const auto input_var = reinterpret_cast(var->ptr_to_element(Coordinates(0, 0))); - const auto input_gamma = (gamma != nullptr) ? reinterpret_cast(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; - const auto input_beta = (beta != nullptr) ? reinterpret_cast(beta->ptr_to_element(Coordinates(0, 0))) : nullptr; + const auto input_mean = reinterpret_cast(mean->ptr_to_element(Coordinates(0, 0))); + const auto input_var = reinterpret_cast(var->ptr_to_element(Coordinates(0, 0))); + const auto input_gamma = + (gamma != nullptr) ? reinterpret_cast(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; + const auto input_beta = + (beta != nullptr) ? reinterpret_cast(beta->ptr_to_element(Coordinates(0, 0))) : nullptr; T activation_functor(act_info); const auto epsilon_vec = wrapper::vdup_n(static_cast(epsilon), ExactTagType{}); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - // Perform core calculations using vector operations - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - // Conctruct vectors - const auto mean_vec = wrapper::vloadq(input_mean + x); - const auto var_vec = wrapper::vloadq(input_var + x); - const auto gamma_vec = (input_gamma != nullptr) ? wrapper::vloadq(input_gamma + x) : wrapper::vdup_n(static_cast(1.f), ExactTagType{}); - const auto beta_vec = (input_beta != nullptr) ? wrapper::vloadq(input_beta + x) : wrapper::vdup_n(static_cast(0.f), ExactTagType{}); - - // Calculate denominator - const auto denominator = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)); - - // Calculate x bar - const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec); - const auto x_bar = wrapper::vmul(numerator, denominator); - auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec); - - // Perform fused activation - if(act_info.enabled()) + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + // Perform core calculations using vector operations + int x = window_start_x; + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - activation_functor(res); + // Conctruct vectors + const auto mean_vec = wrapper::vloadq(input_mean + x); + const auto var_vec = wrapper::vloadq(input_var + x); + const auto gamma_vec = (input_gamma != nullptr) + ? wrapper::vloadq(input_gamma + x) + : wrapper::vdup_n(static_cast(1.f), ExactTagType{}); + const auto beta_vec = (input_beta != nullptr) + ? 
wrapper::vloadq(input_beta + x) + : wrapper::vdup_n(static_cast(0.f), ExactTagType{}); + + // Calculate denominator + const auto denominator = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)); + + // Calculate x bar + const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec); + const auto x_bar = wrapper::vmul(numerator, denominator); + auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec); + + // Perform fused activation + if (act_info.enabled()) + { + activation_functor(res); + } + + // Store results + wrapper::vstore(output_ptr + x, res); } - // Store results - wrapper::vstore(output_ptr + x, res); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - // Conctruct vectors - const float gamma = (input_gamma != nullptr) ? input_gamma[x] : 1.f; - const float beta = (input_beta != nullptr) ? input_beta[x] : 0.f; - - const float denominator = sqrt(input_var[x] + epsilon); - const float numerator = input_ptr[x] - input_mean[x]; - const float x_bar = numerator / denominator; - float res = beta + x_bar * gamma; - - // Perform fused activation - if(act_info.enabled()) + // Compute left-over elements + for (; x < window_end_x; ++x) { - activation_functor(res); + // Conctruct vectors + const float gamma = (input_gamma != nullptr) ? input_gamma[x] : 1.f; + const float beta = (input_beta != nullptr) ? input_beta[x] : 0.f; + + const float denominator = sqrt(input_var[x] + epsilon); + const float numerator = input_ptr[x] - input_mean[x]; + const float x_bar = numerator / denominator; + float res = beta + x_bar * gamma; + + // Perform fused activation + if (act_info.enabled()) + { + activation_functor(res); + } + + // Store results + *reinterpret_cast(output_ptr + x) = res; } - - // Store results - *reinterpret_cast(output_ptr + x) = res; - } - }, - input, output); + }, + input, output); } // Fused Batched Normalization with activation functions -static std::map fused_map = -{ - { ActivationLayerInfo::ActivationFunction::RELU, &batch_normalization> }, - { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &batch_normalization> }, - { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &batch_normalization> } -}; -} +static std::map fused_map = { + {ActivationLayerInfo::ActivationFunction::RELU, &batch_normalization>}, + {ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &batch_normalization>}, + {ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &batch_normalization>}}; +} // namespace namespace cpu { -void fp32_neon_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, - float epsilon, ActivationLayerInfo &act_info, const Window &window) +void fp32_neon_batch_normalization(ITensor *src, + ITensor *dst, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo &act_info, + const Window &window) { - if(act_info.enabled()) + if (act_info.enabled()) { fused_map[act_info.activation()](src, dst, mean, var, beta, gamma, epsilon, act_info, window); } diff --git a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp index 98cd9aa7fe..48caaa3e63 100644 --- a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp +++ b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" #include 
"arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/NEON/SVEMath.h" #include @@ -37,8 +38,15 @@ namespace arm_compute { namespace cpu { -void fp16_sve_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, - float epsilon, ActivationLayerInfo &act_info, const Window &window) +void fp16_sve_batch_normalization(ITensor *src, + ITensor *dst, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo &act_info, + const Window &window) { const auto window_start_x = static_cast(window.x().start()); const auto window_end_x = static_cast(window.x().end()); @@ -49,69 +57,74 @@ void fp16_sve_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mea Iterator input(src, win_collapsed); Iterator output(dst, win_collapsed); - const auto input_mean = reinterpret_cast(mean->ptr_to_element(Coordinates(0, 0))); - const auto input_var = reinterpret_cast(var->ptr_to_element(Coordinates(0, 0))); - const auto input_gamma = (gamma != nullptr) ? reinterpret_cast(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; - const auto input_beta = (beta != nullptr) ? reinterpret_cast(beta->ptr_to_element(Coordinates(0, 0))) : nullptr; + const auto input_mean = reinterpret_cast(mean->ptr_to_element(Coordinates(0, 0))); + const auto input_var = reinterpret_cast(var->ptr_to_element(Coordinates(0, 0))); + const auto input_gamma = + (gamma != nullptr) ? reinterpret_cast(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; + const auto input_beta = + (beta != nullptr) ? reinterpret_cast(beta->ptr_to_element(Coordinates(0, 0))) : nullptr; const auto epsilon_vec = svdup_n_f16(epsilon); const auto const_1 = svdup_n_f16(1.f); const auto const_0 = svdup_n_f16(0.f); const auto va = svdup_n_f16(act_info.a()); const auto vb = svdup_n_f16(act_info.b()); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - svbool_t pg = svwhilelt_b16(x, window_end_x); - do + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - // Conctruct vectors - const auto mean_vec = svld1_f16(pg, input_mean + x); - const auto var_vec = svld1_f16(pg, input_var + x); - const auto gamma_vec = (input_gamma != nullptr) ? svld1_f16(pg, input_gamma + x) : const_1; - const auto beta_vec = (input_beta != nullptr) ? svld1_f16(pg, input_beta + x) : const_0; + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); - // Calculate denominator - const auto tmp = svadd_f16_z(pg, var_vec, epsilon_vec); - auto denominator = svrsqrte_f16(tmp); - denominator = svmul_f16_z(pg, svrsqrts_f16(svmul_f16_z(pg, tmp, denominator), denominator), denominator); - denominator = svmul_f16_z(pg, svrsqrts_f16(svmul_f16_z(pg, tmp, denominator), denominator), denominator); + // Compute S elements per iteration + int x = window_start_x; + svbool_t pg = svwhilelt_b16(x, window_end_x); + do + { + // Conctruct vectors + const auto mean_vec = svld1_f16(pg, input_mean + x); + const auto var_vec = svld1_f16(pg, input_var + x); + const auto gamma_vec = (input_gamma != nullptr) ? svld1_f16(pg, input_gamma + x) : const_1; + const auto beta_vec = (input_beta != nullptr) ? 
svld1_f16(pg, input_beta + x) : const_0; - // Calculate x bar - const auto numerator = svsub_f16_z(pg, svld1_f16(pg, input_ptr + x), mean_vec); - const auto x_bar = svmul_f16_z(pg, numerator, denominator); - auto res = svmla_f16_z(pg, beta_vec, x_bar, gamma_vec); + // Calculate denominator + const auto tmp = svadd_f16_z(pg, var_vec, epsilon_vec); + auto denominator = svrsqrte_f16(tmp); + denominator = + svmul_f16_z(pg, svrsqrts_f16(svmul_f16_z(pg, tmp, denominator), denominator), denominator); + denominator = + svmul_f16_z(pg, svrsqrts_f16(svmul_f16_z(pg, tmp, denominator), denominator), denominator); - // Perform fused activation - if(act_info.enabled()) - { - if(act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU) - { - res = svmax_f16_z(pg, const_0, res); - } - else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) - { - res = svmin_f16_z(pg, va, svmax_f16_z(pg, const_0, res)); - } - else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + // Calculate x bar + const auto numerator = svsub_f16_z(pg, svld1_f16(pg, input_ptr + x), mean_vec); + const auto x_bar = svmul_f16_z(pg, numerator, denominator); + auto res = svmla_f16_z(pg, beta_vec, x_bar, gamma_vec); + + // Perform fused activation + if (act_info.enabled()) { - res = svmin_f16_z(pg, va, svmax_f16_z(pg, vb, res)); + if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU) + { + res = svmax_f16_z(pg, const_0, res); + } + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + { + res = svmin_f16_z(pg, va, svmax_f16_z(pg, const_0, res)); + } + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + res = svmin_f16_z(pg, va, svmax_f16_z(pg, vb, res)); + } } - } - // Store results - svst1_f16(pg, output_ptr + x, res); + // Store results + svst1_f16(pg, output_ptr + x, res); - x += svcntw(); - pg = svwhilelt_b16(x, window_end_x); - } - while(svptest_any(svptrue_b16(), pg)); - }, - input, output); + x += svcntw(); + pg = svwhilelt_b16(x, window_end_x); + } while (svptest_any(svptrue_b16(), pg)); + }, + input, output); } } // namespace cpu } // namespace arm_compute diff --git a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp index 952ab320bf..df4fbfe607 100644 --- a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp +++ b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/NEON/SVEMath.h" #include @@ -37,8 +38,15 @@ namespace arm_compute { namespace cpu { -void fp32_sve_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, - float epsilon, ActivationLayerInfo &act_info, const Window &window) +void fp32_sve_batch_normalization(ITensor *src, + ITensor *dst, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, + ActivationLayerInfo &act_info, + const Window &window) { const auto window_start_x = static_cast(window.x().start()); const auto window_end_x = static_cast(window.x().end()); @@ -49,69 +57,74 @@ void fp32_sve_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mea Iterator input(src, win_collapsed); Iterator output(dst, win_collapsed); - const auto 
input_mean = reinterpret_cast(mean->ptr_to_element(Coordinates(0, 0))); - const auto input_var = reinterpret_cast(var->ptr_to_element(Coordinates(0, 0))); - const auto input_gamma = (gamma != nullptr) ? reinterpret_cast(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; - const auto input_beta = (beta != nullptr) ? reinterpret_cast(beta->ptr_to_element(Coordinates(0, 0))) : nullptr; + const auto input_mean = reinterpret_cast(mean->ptr_to_element(Coordinates(0, 0))); + const auto input_var = reinterpret_cast(var->ptr_to_element(Coordinates(0, 0))); + const auto input_gamma = + (gamma != nullptr) ? reinterpret_cast(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr; + const auto input_beta = + (beta != nullptr) ? reinterpret_cast(beta->ptr_to_element(Coordinates(0, 0))) : nullptr; const auto epsilon_vec = svdup_n_f32(epsilon); const auto const_1 = svdup_n_f32(1.f); const auto const_0 = svdup_n_f32(0.f); const auto va = svdup_n_f32(act_info.a()); const auto vb = svdup_n_f32(act_info.b()); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - // Compute S elements per iteration - int x = window_start_x; - svbool_t pg = svwhilelt_b32(x, window_end_x); - do + execute_window_loop( + win_collapsed, + [&](const Coordinates &) { - // Conctruct vectors - const auto mean_vec = svld1_f32(pg, input_mean + x); - const auto var_vec = svld1_f32(pg, input_var + x); - const auto gamma_vec = (input_gamma != nullptr) ? svld1_f32(pg, input_gamma + x) : const_1; - const auto beta_vec = (input_beta != nullptr) ? svld1_f32(pg, input_beta + x) : const_0; + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); - // Calculate denominator - const auto tmp = svadd_f32_z(pg, var_vec, epsilon_vec); - auto denominator = svrsqrte_f32(tmp); - denominator = svmul_f32_z(pg, svrsqrts_f32(svmul_f32_z(pg, tmp, denominator), denominator), denominator); - denominator = svmul_f32_z(pg, svrsqrts_f32(svmul_f32_z(pg, tmp, denominator), denominator), denominator); + // Compute S elements per iteration + int x = window_start_x; + svbool_t pg = svwhilelt_b32(x, window_end_x); + do + { + // Conctruct vectors + const auto mean_vec = svld1_f32(pg, input_mean + x); + const auto var_vec = svld1_f32(pg, input_var + x); + const auto gamma_vec = (input_gamma != nullptr) ? svld1_f32(pg, input_gamma + x) : const_1; + const auto beta_vec = (input_beta != nullptr) ? 
svld1_f32(pg, input_beta + x) : const_0; - // Calculate x bar - const auto numerator = svsub_f32_z(pg, svld1_f32(pg, input_ptr + x), mean_vec); - const auto x_bar = svmul_f32_z(pg, numerator, denominator); - auto res = svmla_f32_z(pg, beta_vec, x_bar, gamma_vec); + // Calculate denominator + const auto tmp = svadd_f32_z(pg, var_vec, epsilon_vec); + auto denominator = svrsqrte_f32(tmp); + denominator = + svmul_f32_z(pg, svrsqrts_f32(svmul_f32_z(pg, tmp, denominator), denominator), denominator); + denominator = + svmul_f32_z(pg, svrsqrts_f32(svmul_f32_z(pg, tmp, denominator), denominator), denominator); - // Perform fused activation - if(act_info.enabled()) - { - if(act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU) - { - res = svmax_f32_z(pg, const_0, res); - } - else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) - { - res = svmin_f32_z(pg, va, svmax_f32_z(pg, const_0, res)); - } - else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + // Calculate x bar + const auto numerator = svsub_f32_z(pg, svld1_f32(pg, input_ptr + x), mean_vec); + const auto x_bar = svmul_f32_z(pg, numerator, denominator); + auto res = svmla_f32_z(pg, beta_vec, x_bar, gamma_vec); + + // Perform fused activation + if (act_info.enabled()) { - res = svmin_f32_z(pg, va, svmax_f32_z(pg, vb, res)); + if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU) + { + res = svmax_f32_z(pg, const_0, res); + } + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + { + res = svmin_f32_z(pg, va, svmax_f32_z(pg, const_0, res)); + } + else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + res = svmin_f32_z(pg, va, svmax_f32_z(pg, vb, res)); + } } - } - // Store results - svst1_f32(pg, output_ptr + x, res); + // Store results + svst1_f32(pg, output_ptr + x, res); - x += svcntw(); - pg = svwhilelt_b32(x, window_end_x); - } - while(svptest_any(svptrue_b32(), pg)); - }, - input, output); + x += svcntw(); + pg = svwhilelt_b32(x, window_end_x); + } while (svptest_any(svptrue_b32(), pg)); + }, + input, output); } } // namespace cpu } // namespace arm_compute diff --git a/src/core/NEON/kernels/batchnormalization/impl/list.h b/src/core/NEON/kernels/batchnormalization/impl/list.h index 8e0ea36f5a..cbf540bd71 100644 --- a/src/core/NEON/kernels/batchnormalization/impl/list.h +++ b/src/core/NEON/kernels/batchnormalization/impl/list.h @@ -28,9 +28,9 @@ namespace arm_compute { namespace cpu { -#define DECLARE_BATCH_NORMALIZATION_KERNEL(func_name) \ - void func_name(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, \ - float epsilon, ActivationLayerInfo &act_info, const Window &window) +#define DECLARE_BATCH_NORMALIZATION_KERNEL(func_name) \ + void func_name(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, \ + const ITensor *gamma, float epsilon, ActivationLayerInfo &act_info, const Window &window) DECLARE_BATCH_NORMALIZATION_KERNEL(fp16_neon_batch_normalization); DECLARE_BATCH_NORMALIZATION_KERNEL(fp16_sve_batch_normalization); diff --git a/src/core/NEON/kernels/detail/NEActivationFunctionDetail.h b/src/core/NEON/kernels/detail/NEActivationFunctionDetail.h index 3900ea62cd..95cdc8f2f9 100644 --- a/src/core/NEON/kernels/detail/NEActivationFunctionDetail.h +++ b/src/core/NEON/kernels/detail/NEActivationFunctionDetail.h @@ -25,6 +25,7 @@ #define 
ARM_COMPUTE_DETAIL_NEACTIVATION_FUNCTION_DETAIL_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute @@ -158,8 +159,7 @@ struct logistic * * @param[in] act_info Activation layer information. */ - explicit logistic(ActivationLayerInfo act_info) - : vone(wrapper::vdup_n(static_cast(1), ExactTagType{})) + explicit logistic(ActivationLayerInfo act_info) : vone(wrapper::vdup_n(static_cast(1), ExactTagType{})) { ARM_COMPUTE_UNUSED(act_info); } @@ -198,8 +198,7 @@ struct relu * * @param[in] act_info Activation layer information. */ - explicit relu(ActivationLayerInfo act_info) - : vzero(wrapper::vdup_n(static_cast(0), ExactTagType{})) + explicit relu(ActivationLayerInfo act_info) : vzero(wrapper::vdup_n(static_cast(0), ExactTagType{})) { ARM_COMPUTE_UNUSED(act_info); } diff --git a/src/core/NEON/kernels/detail/NEColorConvertHelper.inl b/src/core/NEON/kernels/detail/NEColorConvertHelper.inl index ac196d9dbb..50fff04cad 100644 --- a/src/core/NEON/kernels/detail/NEColorConvertHelper.inl +++ b/src/core/NEON/kernels/detail/NEColorConvertHelper.inl @@ -25,6 +25,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IMultiImage.h" #include "arm_compute/core/Utils.h" + #include "src/core/NEON/NEMath.h" #include @@ -50,8 +51,12 @@ constexpr float rgb2u8_red_coef = 0.2126f; constexpr float rgb2u8_green_coef = 0.7152f; constexpr float rgb2u8_blue_coef = 0.0722f; -inline float32x4_t rgb_to_greyscale_calculation(const float32x4_t &rcolor, const float32x4_t &gcolor, const float32x4_t &bcolor, - const float rcoef, const float gcoef, const float bcoef) +inline float32x4_t rgb_to_greyscale_calculation(const float32x4_t &rcolor, + const float32x4_t &gcolor, + const float32x4_t &bcolor, + const float rcoef, + const float gcoef, + const float bcoef) { float32x4_t greyscale = vmulq_n_f32(rcolor, rcoef); greyscale = vmlaq_n_f32(greyscale, gcolor, gcoef); @@ -86,8 +91,12 @@ inline void rgb_to_u8_conversion(const uint8x16x3_t &in, uint8x16_t &out) arm_compute::convert_float32x4x4_to_uint8x16(out_float32, out); } -inline void rgb_to_yuv_calculation(const float32x4_t &rvec, const float32x4_t &gvec, const float32x4_t &bvec, - float32x4_t &yvec, float32x4_t &uvec, float32x4_t &vvec) +inline void rgb_to_yuv_calculation(const float32x4_t &rvec, + const float32x4_t &gvec, + const float32x4_t &bvec, + float32x4_t &yvec, + float32x4_t &uvec, + float32x4_t &vvec) { /* Y'= 0.2126*R' + 0.7152*G' + 0.0722*B' @@ -110,8 +119,12 @@ inline void rgb_to_yuv_calculation(const float32x4_t &rvec, const float32x4_t &g vvec = vmlaq_n_f32(c128, vvec, rgb2yuv_bt709_cv); } -inline void yuyv_to_rgb_calculation(const float32x4_t &yvec_val, float32x4_t uvec_val, const float32x4_t &yyvec_val, - float32x4_t vvec_val, unsigned char *output_ptr, const bool alpha) +inline void yuyv_to_rgb_calculation(const float32x4_t &yvec_val, + float32x4_t uvec_val, + const float32x4_t &yyvec_val, + float32x4_t vvec_val, + unsigned char *output_ptr, + const bool alpha) { float32x4x3_t rgb1, rgb2; @@ -126,8 +139,7 @@ inline void yuyv_to_rgb_calculation(const float32x4_t &yvec_val, float32x4_t uve // b = 1.8556f*f_u + 0.0000f*f_v; const auto red = vmulq_n_f32(vvec_val, red_coef_bt709); const auto blue = vmulq_n_f32(uvec_val, blue_coef_bt709); - const auto green = vaddq_f32(vmulq_n_f32(uvec_val, green_coef_bt709), - vmulq_n_f32(vvec_val, green_coef2_bt709)); + const auto green = vaddq_f32(vmulq_n_f32(uvec_val, green_coef_bt709), vmulq_n_f32(vvec_val, green_coef2_bt709)); // Compute 
the final r,g,b values using y1 for the first texel and y2 for the second one. // the result is stored in two float32x4x3_t which then are converted to one uint8x8x3_t @@ -144,7 +156,7 @@ inline void yuyv_to_rgb_calculation(const float32x4_t &yvec_val, float32x4_t uve uint8x8x3_t u8_rgb; arm_compute::convert_float32x4x3_to_uint8x8x3(rgb1, rgb2, u8_rgb); - if(!alpha) + if (!alpha) { vst3_lane_u8(&output_ptr[0], u8_rgb, 0); vst3_lane_u8(&output_ptr[3], u8_rgb, 4); @@ -177,7 +189,7 @@ inline uint8x16x3_t load_rgb(const unsigned char *const ptr, const bool alpha) { uint8x16x3_t rgb; - if(alpha) + if (alpha) { const auto tmp = vld4q_u8(ptr); rgb.val[0] = tmp.val[0]; @@ -206,12 +218,12 @@ inline void rgb_to_yuv_conversion(uint8x16x3_t &vec_top, uint8x16x3_t &vec_botto float32x4x4_t fyvec_top, fuvec_top, fvvec_top; float32x4x4_t fyvec_bottom, fuvec_bottom, fvvec_bottom; - for(auto i = 0; i < 4; ++i) + for (auto i = 0; i < 4; ++i) { - rgb_to_yuv_calculation(frvec_top.val[i], fgvec_top.val[i], fbvec_top.val[i], - fyvec_top.val[i], fuvec_top.val[i], fvvec_top.val[i]); - rgb_to_yuv_calculation(frvec_bottom.val[i], fgvec_bottom.val[i], fbvec_bottom.val[i], - fyvec_bottom.val[i], fuvec_bottom.val[i], fvvec_bottom.val[i]); + rgb_to_yuv_calculation(frvec_top.val[i], fgvec_top.val[i], fbvec_top.val[i], fyvec_top.val[i], fuvec_top.val[i], + fvvec_top.val[i]); + rgb_to_yuv_calculation(frvec_bottom.val[i], fgvec_bottom.val[i], fbvec_bottom.val[i], fyvec_bottom.val[i], + fuvec_bottom.val[i], fvvec_bottom.val[i]); } arm_compute::convert_float32x4x4_to_uint8x16(fyvec_top, vec_top.val[0]); @@ -222,9 +234,14 @@ inline void rgb_to_yuv_conversion(uint8x16x3_t &vec_top, uint8x16x3_t &vec_botto arm_compute::convert_float32x4x4_to_uint8x16(fvvec_bottom, vec_bottom.val[2]); } -inline void store_rgb_to_nv12(const uint8x16_t &rvec_top, const uint8x16_t &gvec_top, const uint8x16_t &bvec_top, - const uint8x16_t &rvec_bottom, const uint8x16_t &gvec_bottom, const uint8x16_t &bvec_bottom, - unsigned char *const __restrict out_y_top, unsigned char *const __restrict out_y_bottom, +inline void store_rgb_to_nv12(const uint8x16_t &rvec_top, + const uint8x16_t &gvec_top, + const uint8x16_t &bvec_top, + const uint8x16_t &rvec_bottom, + const uint8x16_t &gvec_bottom, + const uint8x16_t &bvec_bottom, + unsigned char *const __restrict out_y_top, + unsigned char *const __restrict out_y_bottom, unsigned char *const __restrict out_uv) { uint8x16x3_t vec_top, vec_bottom; @@ -252,9 +269,14 @@ inline void store_rgb_to_nv12(const uint8x16_t &rvec_top, const uint8x16_t &gvec vst2_u8(out_uv, uvvec); } -inline void store_rgb_to_iyuv(const uint8x16_t &rvec_top, const uint8x16_t &gvec_top, const uint8x16_t &bvec_top, - const uint8x16_t &rvec_bottom, const uint8x16_t &gvec_bottom, const uint8x16_t &bvec_bottom, - unsigned char *const __restrict out_y_top, unsigned char *const __restrict out_y_bottom, +inline void store_rgb_to_iyuv(const uint8x16_t &rvec_top, + const uint8x16_t &gvec_top, + const uint8x16_t &bvec_top, + const uint8x16_t &rvec_bottom, + const uint8x16_t &gvec_bottom, + const uint8x16_t &bvec_bottom, + unsigned char *const __restrict out_y_top, + unsigned char *const __restrict out_y_bottom, unsigned char *const __restrict out_u, unsigned char *const __restrict out_v) { @@ -273,14 +295,16 @@ inline void store_rgb_to_iyuv(const uint8x16_t &rvec_top, const uint8x16_t &gvec const auto uvvec_top = vuzpq_u8(vec_top.val[1], vec_top.val[2]); const auto uvvec_bottom = vuzpq_u8(vec_bottom.val[1], vec_bottom.val[2]); - const auto uvvec = 
vhaddq_u8(vrhaddq_u8(uvvec_top.val[0], uvvec_top.val[1]), - vrhaddq_u8(uvvec_bottom.val[0], uvvec_bottom.val[1])); + const auto uvvec = + vhaddq_u8(vrhaddq_u8(uvvec_top.val[0], uvvec_top.val[1]), vrhaddq_u8(uvvec_bottom.val[0], uvvec_bottom.val[1])); vst1_u8(out_u, vget_low_u8(uvvec)); vst1_u8(out_v, vget_high_u8(uvvec)); } -inline void store_rgb_to_yuv4(const uint8x16_t &rvec, const uint8x16_t &gvec, const uint8x16_t &bvec, +inline void store_rgb_to_yuv4(const uint8x16_t &rvec, + const uint8x16_t &gvec, + const uint8x16_t &bvec, unsigned char *const __restrict out_y, unsigned char *const __restrict out_u, unsigned char *const __restrict out_v) @@ -291,10 +315,9 @@ inline void store_rgb_to_yuv4(const uint8x16_t &rvec, const uint8x16_t &gvec, co const float32x4x4_t fbvec = arm_compute::convert_uint8x16_to_float32x4x4(bvec); float32x4x4_t fyvec, fuvec, fvvec; - for(auto i = 0; i < 4; ++i) + for (auto i = 0; i < 4; ++i) { - rgb_to_yuv_calculation(frvec.val[i], fgvec.val[i], fbvec.val[i], - fyvec.val[i], fuvec.val[i], fvvec.val[i]); + rgb_to_yuv_calculation(frvec.val[i], fgvec.val[i], fbvec.val[i], fyvec.val[i], fuvec.val[i], fvvec.val[i]); } uint8x16_t yvec, uvec, vvec; @@ -307,7 +330,7 @@ inline void store_rgb_to_yuv4(const uint8x16_t &rvec, const uint8x16_t &gvec, co vst1q_u8(out_v, vvec); } #endif /* DOXYGEN_SKIP_THIS */ -} +} // namespace namespace arm_compute { @@ -329,17 +352,19 @@ void colorconvert_rgb_to_rgbx(const void *__restrict input, void *__restrict out Iterator in(input_ptr, win); Iterator out(output_ptr, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta1 = vld3q_u8(in.ptr()); - uint8x16x4_t ta2; - ta2.val[0] = ta1.val[0]; - ta2.val[1] = ta1.val[1]; - ta2.val[2] = ta1.val[2]; - ta2.val[3] = vdupq_n_u8(255); - vst4q_u8(out.ptr(), ta2); - }, - in, out); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta1 = vld3q_u8(in.ptr()); + uint8x16x4_t ta2; + ta2.val[0] = ta1.val[0]; + ta2.val[1] = ta1.val[1]; + ta2.val[2] = ta1.val[2]; + ta2.val[3] = vdupq_n_u8(255); + vst4q_u8(out.ptr(), ta2); + }, + in, out); } /** Convert RGB to U8. @@ -360,14 +385,16 @@ void colorconvert_rgb_to_u8(const void *__restrict input, void *__restrict outpu Iterator in(input_ptr, win); Iterator out(output_ptr, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta1 = vld3q_u8(in.ptr()); - uint8x16_t ta2; - rgb_to_u8_conversion(ta1, ta2); - vst1q_u8(out.ptr(), ta2); - }, - in, out); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta1 = vld3q_u8(in.ptr()); + uint8x16_t ta2; + rgb_to_u8_conversion(ta1, ta2); + vst1q_u8(out.ptr(), ta2); + }, + in, out); } /** Convert RGBX to RGB. @@ -388,16 +415,18 @@ void colorconvert_rgbx_to_rgb(const void *input, void *output, const Window &win Iterator in(input_ptr, win); Iterator out(output_ptr, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta1 = vld4q_u8(in.ptr()); - uint8x16x3_t ta2; - ta2.val[0] = ta1.val[0]; - ta2.val[1] = ta1.val[1]; - ta2.val[2] = ta1.val[2]; - vst3q_u8(out.ptr(), ta2); - }, - in, out); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta1 = vld4q_u8(in.ptr()); + uint8x16x3_t ta2; + ta2.val[0] = ta1.val[0]; + ta2.val[1] = ta1.val[1]; + ta2.val[2] = ta1.val[2]; + vst3q_u8(out.ptr(), ta2); + }, + in, out); } /** Convert YUYV to RGB. 
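For reference, the per-pixel arithmetic behind yuyv_to_rgb_calculation above is the standard full-range BT.709 conversion; the kernel keeps the coefficients in the *_coef_bt709 constants defined earlier in this file, of which only the 1.8556 blue term is visible in this excerpt. A scalar sketch, not part of the patch, with illustrative names:

    #include <algorithm>
    #include <cstdint>

    void yuv_to_rgb_pixel(float y, float u, float v, uint8_t rgb[3])
    {
        const float fu = u - 128.0f; // chroma is stored with a +128 bias (the c128 term above)
        const float fv = v - 128.0f;
        const float r  = y + 1.5748f * fv;
        const float g  = y - 0.1873f * fu - 0.4681f * fv;
        const float b  = y + 1.8556f * fu;
        rgb[0] = static_cast<uint8_t>(std::clamp(r, 0.0f, 255.0f));
        rgb[1] = static_cast<uint8_t>(std::clamp(g, 0.0f, 255.0f));
        rgb[2] = static_cast<uint8_t>(std::clamp(b, 0.0f, 255.0f));
    }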
@@ -422,26 +451,32 @@ void colorconvert_yuyv_to_rgb(const void *__restrict input, void *__restrict out Iterator in(input_ptr, win); Iterator out(output_ptr, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta = vld4q_u8(in.ptr()); - //ta.val[0] = Y0 Y2 Y4 Y6 ... - //ta.val[1] = U0 U2 U4 U6 ... - //ta.val[2] = Y1 Y3 Y5 Y7 ... - //ta.val[3] = V0 V2 V4 V7 ... - - // Convert the uint8x16x4_t to float32x4x4_t - const float32x4x4_t yvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[0 + shift]); - const float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[1 - shift]); - const float32x4x4_t yyvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[2 + shift]); - const float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[3 - shift]); - - yuyv_to_rgb_calculation(yvec.val[0], uvec.val[0], yyvec.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha); - yuyv_to_rgb_calculation(yvec.val[1], uvec.val[1], yyvec.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha); - yuyv_to_rgb_calculation(yvec.val[2], uvec.val[2], yyvec.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha); - yuyv_to_rgb_calculation(yvec.val[3], uvec.val[3], yyvec.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha); - }, - in, out); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta = vld4q_u8(in.ptr()); + //ta.val[0] = Y0 Y2 Y4 Y6 ... + //ta.val[1] = U0 U2 U4 U6 ... + //ta.val[2] = Y1 Y3 Y5 Y7 ... + //ta.val[3] = V0 V2 V4 V7 ... + + // Convert the uint8x16x4_t to float32x4x4_t + const float32x4x4_t yvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[0 + shift]); + const float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[1 - shift]); + const float32x4x4_t yyvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[2 + shift]); + const float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[3 - shift]); + + yuyv_to_rgb_calculation(yvec.val[0], uvec.val[0], yyvec.val[0], vvec.val[0], out.ptr() + 0 * element_size, + alpha); + yuyv_to_rgb_calculation(yvec.val[1], uvec.val[1], yyvec.val[1], vvec.val[1], out.ptr() + 1 * element_size, + alpha); + yuyv_to_rgb_calculation(yvec.val[2], uvec.val[2], yyvec.val[2], vvec.val[2], out.ptr() + 2 * element_size, + alpha); + yuyv_to_rgb_calculation(yvec.val[3], uvec.val[3], yyvec.val[3], vvec.val[3], out.ptr() + 3 * element_size, + alpha); + }, + in, out); } /** Convert NV12 to RGB. @@ -475,35 +510,45 @@ void colorconvert_nv12_to_rgb(const void *__restrict input, void *__restrict out Iterator in_uv(input_ptr->plane(1), win_uv); Iterator out(output_ptr, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta_y_top = vld2q_u8(in_y.ptr()); - const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); - const auto ta_uv = vld2q_u8(in_uv.ptr()); - //ta_y.val[0] = Y0 Y2 Y4 Y6 ... - //ta_y.val[1] = Y1 Y3 Y5 Y7 ... - //ta_uv.val[0] = U0 U2 U4 U6 ... - //ta_uv.val[1] = V0 V2 V4 V6 ... 
- - // Convert the uint8x16x4_t to float32x4x4_t - float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[0]); - float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[1]); - float32x4x4_t yvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]); - float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]); - float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_uv.val[0 + shift]); - float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_uv.val[1 - shift]); - - yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha); - - yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0], out.ptr() + out_stride + 0 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], out.ptr() + out_stride + 1 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], out.ptr() + out_stride + 2 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], out.ptr() + out_stride + 3 * element_size, alpha); - }, - in_y, in_uv, out); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta_y_top = vld2q_u8(in_y.ptr()); + const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); + const auto ta_uv = vld2q_u8(in_uv.ptr()); + //ta_y.val[0] = Y0 Y2 Y4 Y6 ... + //ta_y.val[1] = Y1 Y3 Y5 Y7 ... + //ta_uv.val[0] = U0 U2 U4 U6 ... + //ta_uv.val[1] = V0 V2 V4 V6 ... 
+ + // Convert the uint8x16x4_t to float32x4x4_t + float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[0]); + float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[1]); + float32x4x4_t yvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]); + float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]); + float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_uv.val[0 + shift]); + float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_uv.val[1 - shift]); + + yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], + out.ptr() + 0 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], + out.ptr() + 1 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], + out.ptr() + 2 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], + out.ptr() + 3 * element_size, alpha); + + yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0], + out.ptr() + out_stride + 0 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], + out.ptr() + out_stride + 1 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], + out.ptr() + out_stride + 2 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], + out.ptr() + out_stride + 3 * element_size, alpha); + }, + in_y, in_uv, out); } /** Convert IYUV to RGB. @@ -537,59 +582,71 @@ void colorconvert_iyuv_to_rgb(const void *__restrict input, void *__restrict out Iterator in_v(input_ptr->plane(2), win_uv); Iterator out(output_ptr, win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto *y_top_ptr = in_y.ptr(); - const auto *y_bottom_ptr = in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y(); - const auto *u_ptr = in_u.ptr(); - const auto *v_ptr = in_v.ptr(); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto *y_top_ptr = in_y.ptr(); + const auto *y_bottom_ptr = in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y(); + const auto *u_ptr = in_u.ptr(); + const auto *v_ptr = in_v.ptr(); // Work-around issue in gcc 9(>=) where vld2q might cause issues with register allocation #if defined(__arch64__) - const auto ta0_y_top = vld1q_u8(y_top_ptr); - const auto ta1_y_top = vld1q_u8(y_top_ptr + 16); - const auto ta0_y_bottom = vld1q_u8(y_bottom_ptr); - const auto ta1_y_bottom = vld1q_u8(y_bottom_ptr + 16); - const auto ta_u = vld1q_u8(u_ptr); - const auto ta_v = vld1q_u8(v_ptr); - - // Convert the uint8x16x4_t to float32x4x4_t - float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vuzp1q_u8(ta0_y_top, ta1_y_top)); - float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vuzp2q_u8(ta0_y_top, ta1_y_top)); - float32x4x4_t yvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(vuzp1q_u8(ta0_y_bottom, ta1_y_bottom)); - float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(vuzp2q_u8(ta0_y_bottom, ta1_y_bottom)); - float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_u); - float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_v); + const auto ta0_y_top = vld1q_u8(y_top_ptr); + const auto 
ta1_y_top = vld1q_u8(y_top_ptr + 16); + const auto ta0_y_bottom = vld1q_u8(y_bottom_ptr); + const auto ta1_y_bottom = vld1q_u8(y_bottom_ptr + 16); + const auto ta_u = vld1q_u8(u_ptr); + const auto ta_v = vld1q_u8(v_ptr); + + // Convert the uint8x16x4_t to float32x4x4_t + float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vuzp1q_u8(ta0_y_top, ta1_y_top)); + float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vuzp2q_u8(ta0_y_top, ta1_y_top)); + float32x4x4_t yvec_bottom = + arm_compute::convert_uint8x16_to_float32x4x4(vuzp1q_u8(ta0_y_bottom, ta1_y_bottom)); + float32x4x4_t yyvec_bottom = + arm_compute::convert_uint8x16_to_float32x4x4(vuzp2q_u8(ta0_y_bottom, ta1_y_bottom)); + float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_u); + float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_v); #else /* defined(__arch64__) */ - const auto ta_y_top = vld2q_u8(y_top_ptr); - const auto ta_y_bottom = vld2q_u8(y_bottom_ptr); - const auto ta_u = vld1q_u8(u_ptr); - const auto ta_v = vld1q_u8(v_ptr); - //ta_y.val[0] = Y0 Y2 Y4 Y6 ... - //ta_y.val[1] = Y1 Y3 Y5 Y7 ... - //ta_u.val[0] = U0 U2 U4 U6 ... - //ta_v.val[0] = V0 V2 V4 V6 ... - - // Convert the uint8x16x4_t to float32x4x4_t - float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[0]); - float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[1]); - float32x4x4_t yvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]); - float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]); - float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_u); - float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_v); + const auto ta_y_top = vld2q_u8(y_top_ptr); + const auto ta_y_bottom = vld2q_u8(y_bottom_ptr); + const auto ta_u = vld1q_u8(u_ptr); + const auto ta_v = vld1q_u8(v_ptr); + //ta_y.val[0] = Y0 Y2 Y4 Y6 ... + //ta_y.val[1] = Y1 Y3 Y5 Y7 ... + //ta_u.val[0] = U0 U2 U4 U6 ... + //ta_v.val[0] = V0 V2 V4 V6 ... 
+ + // Convert the uint8x16x4_t to float32x4x4_t + float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[0]); + float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[1]); + float32x4x4_t yvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]); + float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]); + float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_u); + float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_v); #endif /* defined(__arch64__) */ - yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha); - - yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0], out.ptr() + out_stride + 0 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], out.ptr() + out_stride + 1 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], out.ptr() + out_stride + 2 * element_size, alpha); - yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], out.ptr() + out_stride + 3 * element_size, alpha); - }, - in_y, in_u, in_v, out); + yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], + out.ptr() + 0 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], + out.ptr() + 1 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], + out.ptr() + 2 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], + out.ptr() + 3 * element_size, alpha); + + yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0], + out.ptr() + out_stride + 0 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], + out.ptr() + out_stride + 1 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], + out.ptr() + out_stride + 2 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], + out.ptr() + out_stride + 3 * element_size, alpha); + }, + in_y, in_u, in_v, out); } /** Convert YUYV to NV12. @@ -621,31 +678,33 @@ void colorconvert_yuyv_to_nv12(const void *__restrict input, void *__restrict ou Iterator out_y(output_ptr->plane(0), win); Iterator out_uv(output_ptr->plane(1), win_uv); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta_top = vld4q_u8(in.ptr()); - const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y()); - //ta.val[0] = Y0 Y2 Y4 Y6 ... - //ta.val[1] = U0 U2 U4 U6 ... - //ta.val[2] = Y1 Y3 Y5 Y7 ... - //ta.val[3] = V0 V2 V4 V7 ... 
- - uint8x16x2_t yvec; - yvec.val[0] = ta_top.val[0 + shift]; - yvec.val[1] = ta_top.val[2 + shift]; - vst2q_u8(out_y.ptr(), yvec); - - uint8x16x2_t yyvec; - yyvec.val[0] = ta_bottom.val[0 + shift]; - yyvec.val[1] = ta_bottom.val[2 + shift]; - vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec); - - uint8x16x2_t uvvec; - uvvec.val[0] = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]); - uvvec.val[1] = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]); - vst2q_u8(out_uv.ptr(), uvvec); - }, - in, out_y, out_uv); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta_top = vld4q_u8(in.ptr()); + const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y()); + //ta.val[0] = Y0 Y2 Y4 Y6 ... + //ta.val[1] = U0 U2 U4 U6 ... + //ta.val[2] = Y1 Y3 Y5 Y7 ... + //ta.val[3] = V0 V2 V4 V7 ... + + uint8x16x2_t yvec; + yvec.val[0] = ta_top.val[0 + shift]; + yvec.val[1] = ta_top.val[2 + shift]; + vst2q_u8(out_y.ptr(), yvec); + + uint8x16x2_t yyvec; + yyvec.val[0] = ta_bottom.val[0 + shift]; + yyvec.val[1] = ta_bottom.val[2 + shift]; + vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec); + + uint8x16x2_t uvvec; + uvvec.val[0] = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]); + uvvec.val[1] = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]); + vst2q_u8(out_uv.ptr(), uvvec); + }, + in, out_y, out_uv); } /** Convert IYUV to NV12. @@ -676,23 +735,25 @@ void colorconvert_iyuv_to_nv12(const void *__restrict input, void *__restrict ou Iterator out_y(output_ptr->plane(0), win); Iterator out_uv(output_ptr->plane(1), win_uv); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta_y_top = vld2q_u8(in_y.ptr()); - const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); - uint8x16x2_t ta_uv; - ta_uv.val[0] = vld1q_u8(in_u.ptr()); - ta_uv.val[1] = vld1q_u8(in_v.ptr()); - //ta_y.val[0] = Y0 Y2 Y4 Y6 ... - //ta_y.val[1] = Y1 Y3 Y5 Y7 ... - //ta_uv.val[0] = U0 U2 U4 U6 ... - //ta_uv.val[1] = V0 V2 V4 V6 ... - - vst2q_u8(out_y.ptr(), ta_y_top); - vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom); - vst2q_u8(out_uv.ptr(), ta_uv); - }, - in_y, in_u, in_v, out_y, out_uv); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta_y_top = vld2q_u8(in_y.ptr()); + const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); + uint8x16x2_t ta_uv; + ta_uv.val[0] = vld1q_u8(in_u.ptr()); + ta_uv.val[1] = vld1q_u8(in_v.ptr()); + //ta_y.val[0] = Y0 Y2 Y4 Y6 ... + //ta_y.val[1] = Y1 Y3 Y5 Y7 ... + //ta_uv.val[0] = U0 U2 U4 U6 ... + //ta_uv.val[1] = V0 V2 V4 V6 ... + + vst2q_u8(out_y.ptr(), ta_y_top); + vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom); + vst2q_u8(out_uv.ptr(), ta_uv); + }, + in_y, in_u, in_v, out_y, out_uv); } /** Convert NV12 to IYUV. @@ -726,22 +787,24 @@ void colorconvert_nv12_to_iyuv(const void *__restrict input, void *__restrict ou Iterator out_u(output_ptr->plane(1), win_uv); Iterator out_v(output_ptr->plane(2), win_uv); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta_y_top = vld2q_u8(in_y.ptr()); - const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); - const auto ta_uv = vld2q_u8(in_uv.ptr()); - //ta_y.val[0] = Y0 Y2 Y4 Y6 ... - //ta_y.val[1] = Y1 Y3 Y5 Y7 ... - //ta_uv.val[0] = U0 U2 U4 U6 ... 
- //ta_uv.val[1] = V0 V2 V4 V6 ... - - vst2q_u8(out_y.ptr(), ta_y_top); - vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom); - vst1q_u8(out_u.ptr(), ta_uv.val[0 + shift]); - vst1q_u8(out_v.ptr(), ta_uv.val[1 - shift]); - }, - in_y, in_uv, out_y, out_u, out_v); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta_y_top = vld2q_u8(in_y.ptr()); + const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); + const auto ta_uv = vld2q_u8(in_uv.ptr()); + //ta_y.val[0] = Y0 Y2 Y4 Y6 ... + //ta_y.val[1] = Y1 Y3 Y5 Y7 ... + //ta_uv.val[0] = U0 U2 U4 U6 ... + //ta_uv.val[1] = V0 V2 V4 V6 ... + + vst2q_u8(out_y.ptr(), ta_y_top); + vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom); + vst1q_u8(out_u.ptr(), ta_uv.val[0 + shift]); + vst1q_u8(out_v.ptr(), ta_uv.val[1 - shift]); + }, + in_y, in_uv, out_y, out_u, out_v); } /** Convert YUYV to IYUV. @@ -774,34 +837,36 @@ void colorconvert_yuyv_to_iyuv(const void *__restrict input, void *__restrict ou Iterator out_u(output_ptr->plane(1), win_uv); Iterator out_v(output_ptr->plane(2), win_uv); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta_top = vld4q_u8(in.ptr()); - const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y()); - //ta.val[0] = Y0 Y2 Y4 Y6 ... - //ta.val[1] = U0 U2 U4 U6 ... - //ta.val[2] = Y1 Y3 Y5 Y7 ... - //ta.val[3] = V0 V2 V4 V7 ... - - uint8x16x2_t yvec; - yvec.val[0] = ta_top.val[0 + shift]; - yvec.val[1] = ta_top.val[2 + shift]; - vst2q_u8(out_y.ptr(), yvec); - - uint8x16x2_t yyvec; - yyvec.val[0] = ta_bottom.val[0 + shift]; - yyvec.val[1] = ta_bottom.val[2 + shift]; - vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec); - - uint8x16_t uvec; - uvec = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]); - vst1q_u8(out_u.ptr(), uvec); - - uint8x16_t vvec; - vvec = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]); - vst1q_u8(out_v.ptr(), vvec); - }, - in, out_y, out_u, out_v); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta_top = vld4q_u8(in.ptr()); + const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y()); + //ta.val[0] = Y0 Y2 Y4 Y6 ... + //ta.val[1] = U0 U2 U4 U6 ... + //ta.val[2] = Y1 Y3 Y5 Y7 ... + //ta.val[3] = V0 V2 V4 V7 ... + + uint8x16x2_t yvec; + yvec.val[0] = ta_top.val[0 + shift]; + yvec.val[1] = ta_top.val[2 + shift]; + vst2q_u8(out_y.ptr(), yvec); + + uint8x16x2_t yyvec; + yyvec.val[0] = ta_bottom.val[0 + shift]; + yyvec.val[1] = ta_bottom.val[2 + shift]; + vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec); + + uint8x16_t uvec; + uvec = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]); + vst1q_u8(out_u.ptr(), uvec); + + uint8x16_t vvec; + vvec = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]); + vst1q_u8(out_v.ptr(), vvec); + }, + in, out_y, out_u, out_v); } /** Convert NV12 to YUV4. @@ -835,32 +900,34 @@ void colorconvert_nv12_to_yuv4(const void *__restrict input, void *__restrict ou Iterator out_u(output_ptr->plane(1), win); Iterator out_v(output_ptr->plane(2), win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta_y_top = vld2q_u8(in_y.ptr()); - const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); - const auto ta_uv = vld2q_u8(in_uv.ptr()); - //ta_y.val[0] = Y0 Y2 Y4 Y6 ... 
- //ta_y.val[1] = Y1 Y3 Y5 Y7 ... - //ta_uv.val[0] = U0 U2 U4 U6 ... - //ta_uv.val[1] = V0 V2 V4 V6 ... - - vst2q_u8(out_y.ptr(), ta_y_top); - vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom); - - uint8x16x2_t uvec; - uvec.val[0] = ta_uv.val[0 + shift]; - uvec.val[1] = ta_uv.val[0 + shift]; - vst2q_u8(out_u.ptr(), uvec); - vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec); - - uint8x16x2_t vvec; - vvec.val[0] = ta_uv.val[1 - shift]; - vvec.val[1] = ta_uv.val[1 - shift]; - vst2q_u8(out_v.ptr(), vvec); - vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec); - }, - in_y, in_uv, out_y, out_u, out_v); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta_y_top = vld2q_u8(in_y.ptr()); + const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); + const auto ta_uv = vld2q_u8(in_uv.ptr()); + //ta_y.val[0] = Y0 Y2 Y4 Y6 ... + //ta_y.val[1] = Y1 Y3 Y5 Y7 ... + //ta_uv.val[0] = U0 U2 U4 U6 ... + //ta_uv.val[1] = V0 V2 V4 V6 ... + + vst2q_u8(out_y.ptr(), ta_y_top); + vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom); + + uint8x16x2_t uvec; + uvec.val[0] = ta_uv.val[0 + shift]; + uvec.val[1] = ta_uv.val[0 + shift]; + vst2q_u8(out_u.ptr(), uvec); + vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec); + + uint8x16x2_t vvec; + vvec.val[0] = ta_uv.val[1 - shift]; + vvec.val[1] = ta_uv.val[1 - shift]; + vst2q_u8(out_v.ptr(), vvec); + vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec); + }, + in_y, in_uv, out_y, out_u, out_v); } /** Convert IYUV to YUV4. @@ -892,33 +959,35 @@ void colorconvert_iyuv_to_yuv4(const void *__restrict input, void *__restrict ou Iterator out_u(output_ptr->plane(1), win); Iterator out_v(output_ptr->plane(2), win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta_y_top = vld2q_u8(in_y.ptr()); - const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); - const auto ta_u = vld1q_u8(in_u.ptr()); - const auto ta_v = vld1q_u8(in_v.ptr()); - //ta_y.val[0] = Y0 Y2 Y4 Y6 ... - //ta_y.val[1] = Y1 Y3 Y5 Y7 ... - //ta_u = U0 U2 U4 U6 ... - //ta_v = V0 V2 V4 V6 ... - - vst2q_u8(out_y.ptr(), ta_y_top); - vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom); - - uint8x16x2_t uvec; - uvec.val[0] = ta_u; - uvec.val[1] = ta_u; - vst2q_u8(out_u.ptr(), uvec); - vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec); - - uint8x16x2_t vvec; - vvec.val[0] = ta_v; - vvec.val[1] = ta_v; - vst2q_u8(out_v.ptr(), vvec); - vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec); - }, - in_y, in_u, in_v, out_y, out_u, out_v); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta_y_top = vld2q_u8(in_y.ptr()); + const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); + const auto ta_u = vld1q_u8(in_u.ptr()); + const auto ta_v = vld1q_u8(in_v.ptr()); + //ta_y.val[0] = Y0 Y2 Y4 Y6 ... + //ta_y.val[1] = Y1 Y3 Y5 Y7 ... + //ta_u = U0 U2 U4 U6 ... + //ta_v = V0 V2 V4 V6 ... 
+ + vst2q_u8(out_y.ptr(), ta_y_top); + vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom); + + uint8x16x2_t uvec; + uvec.val[0] = ta_u; + uvec.val[1] = ta_u; + vst2q_u8(out_u.ptr(), uvec); + vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec); + + uint8x16x2_t vvec; + vvec.val[0] = ta_v; + vvec.val[1] = ta_v; + vst2q_u8(out_v.ptr(), vvec); + vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec); + }, + in_y, in_u, in_v, out_y, out_u, out_v); } /** Convert RGB to NV12. @@ -948,20 +1017,21 @@ void colorconvert_rgb_to_nv12(const void *__restrict input, void *__restrict out Iterator out_y(output_ptr->plane(0), win); Iterator out_uv(output_ptr->plane(1), win_uv); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta_rgb_top = load_rgb(in.ptr(), alpha); - const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha); - //ta_rgb.val[0] = R0 R1 R2 R3 ... - //ta_rgb.val[1] = G0 G1 G2 G3 ... - //ta_rgb.val[2] = B0 B1 B2 B3 ... - - store_rgb_to_nv12(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2], - ta_rgb_bottom.val[0], ta_rgb_bottom.val[1], ta_rgb_bottom.val[2], - out_y.ptr(), out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), - out_uv.ptr()); - }, - in, out_y, out_uv); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta_rgb_top = load_rgb(in.ptr(), alpha); + const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha); + //ta_rgb.val[0] = R0 R1 R2 R3 ... + //ta_rgb.val[1] = G0 G1 G2 G3 ... + //ta_rgb.val[2] = B0 B1 B2 B3 ... + + store_rgb_to_nv12(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2], ta_rgb_bottom.val[0], + ta_rgb_bottom.val[1], ta_rgb_bottom.val[2], out_y.ptr(), + out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), out_uv.ptr()); + }, + in, out_y, out_uv); } /** Convert RGB to IYUV. @@ -992,20 +1062,22 @@ void colorconvert_rgb_to_iyuv(const void *__restrict input, void *__restrict out Iterator out_u(output_ptr->plane(1), win_uv); Iterator out_v(output_ptr->plane(2), win_uv); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta_rgb_top = load_rgb(in.ptr(), alpha); - const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha); - //ta_rgb.val[0] = R0 R1 R2 R3 ... - //ta_rgb.val[1] = G0 G1 G2 G3 ... - //ta_rgb.val[2] = B0 B1 B2 B3 ... - - store_rgb_to_iyuv(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2], - ta_rgb_bottom.val[0], ta_rgb_bottom.val[1], ta_rgb_bottom.val[2], - out_y.ptr(), out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), - out_u.ptr(), out_v.ptr()); - }, - in, out_y, out_u, out_v); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta_rgb_top = load_rgb(in.ptr(), alpha); + const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha); + //ta_rgb.val[0] = R0 R1 R2 R3 ... + //ta_rgb.val[1] = G0 G1 G2 G3 ... + //ta_rgb.val[2] = B0 B1 B2 B3 ... + + store_rgb_to_iyuv(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2], ta_rgb_bottom.val[0], + ta_rgb_bottom.val[1], ta_rgb_bottom.val[2], out_y.ptr(), + out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), out_u.ptr(), + out_v.ptr()); + }, + in, out_y, out_u, out_v); } /** Convert RGB to YUV4. 
@@ -1030,16 +1102,17 @@ void colorconvert_rgb_to_yuv4(const void *__restrict input, void *__restrict out Iterator out_u(output_ptr->plane(1), win); Iterator out_v(output_ptr->plane(2), win); - execute_window_loop(win, [&](const Coordinates &) - { - const auto ta_rgb = load_rgb(in.ptr(), alpha); - //ta_rgb.val[0] = R0 R1 R2 R3 ... - //ta_rgb.val[1] = G0 G1 G2 G3 ... - //ta_rgb.val[2] = B0 B1 B2 B3 ... - - store_rgb_to_yuv4(ta_rgb.val[0], ta_rgb.val[1], ta_rgb.val[2], - out_y.ptr(), out_u.ptr(), out_v.ptr()); - }, - in, out_y, out_u, out_v); + execute_window_loop( + win, + [&](const Coordinates &) + { + const auto ta_rgb = load_rgb(in.ptr(), alpha); + //ta_rgb.val[0] = R0 R1 R2 R3 ... + //ta_rgb.val[1] = G0 G1 G2 G3 ... + //ta_rgb.val[2] = B0 B1 B2 B3 ... + + store_rgb_to_yuv4(ta_rgb.val[0], ta_rgb.val[1], ta_rgb.val[2], out_y.ptr(), out_u.ptr(), out_v.ptr()); + }, + in, out_y, out_u, out_v); } } // namespace arm_compute diff --git a/src/core/NEON/kernels/detail/NEDirectConvolution3x3.h b/src/core/NEON/kernels/detail/NEDirectConvolution3x3.h index 96defbc9c9..4b1eb079b2 100644 --- a/src/core/NEON/kernels/detail/NEDirectConvolution3x3.h +++ b/src/core/NEON/kernels/detail/NEDirectConvolution3x3.h @@ -33,56 +33,32 @@ namespace detail { inline float32x4x3_t load_matrix_row(const float *ptr) { - const float32x4x3_t r = - { - { - vld1q_dup_f32(ptr), - vld1q_dup_f32(1 + ptr), - vld1q_dup_f32(2 + ptr) - } - }; + const float32x4x3_t r = {{vld1q_dup_f32(ptr), vld1q_dup_f32(1 + ptr), vld1q_dup_f32(2 + ptr)}}; return r; } template -float32x4x2_t convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2); +float32x4x2_t convolve_3x3(const float *in_top, + const float *in_mid, + const float *in_low, + const float32x4x3_t &m0, + const float32x4x3_t &m1, + const float32x4x3_t &m2); template <> -inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2) +inline float32x4x2_t convolve_3x3<1>(const float *in_top, + const float *in_mid, + const float *in_low, + const float32x4x3_t &m0, + const float32x4x3_t &m1, + const float32x4x3_t &m2) { - const float32x4x3_t vtop = - { - { - vld1q_f32(in_top), - vld1q_f32(in_top + 4), - vld1q_f32(in_top + 8) - } - }; - const float32x4x3_t vmid = - { - { - vld1q_f32(in_mid), - vld1q_f32(in_mid + 4), - vld1q_f32(in_mid + 8) - } - }; - const float32x4x3_t vlow = - { - { - vld1q_f32(in_low), - vld1q_f32(in_low + 4), - vld1q_f32(in_low + 8) - } - }; - float32x4x2_t out = - { - { - vmulq_f32(vtop.val[0], m0.val[0]), - vmulq_f32(vtop.val[1], m0.val[0]) - } - }; - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]); - out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]); + const float32x4x3_t vtop = {{vld1q_f32(in_top), vld1q_f32(in_top + 4), vld1q_f32(in_top + 8)}}; + const float32x4x3_t vmid = {{vld1q_f32(in_mid), vld1q_f32(in_mid + 4), vld1q_f32(in_mid + 8)}}; + const float32x4x3_t vlow = {{vld1q_f32(in_low), vld1q_f32(in_low + 4), vld1q_f32(in_low + 8)}}; + float32x4x2_t out = {{vmulq_f32(vtop.val[0], m0.val[0]), vmulq_f32(vtop.val[1], m0.val[0])}}; + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]); out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]); out.val[0] = 
vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 1), m1.val[1]); @@ -106,7 +82,12 @@ inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, c } template <> -inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2) +inline float32x4x2_t convolve_3x3<2>(const float *in_top, + const float *in_mid, + const float *in_low, + const float32x4x3_t &m0, + const float32x4x3_t &m1, + const float32x4x3_t &m2) { float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2); out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1); @@ -116,7 +97,12 @@ inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, c } template <> -inline float32x4x2_t convolve_3x3<3>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2) +inline float32x4x2_t convolve_3x3<3>(const float *in_top, + const float *in_mid, + const float *in_low, + const float32x4x3_t &m0, + const float32x4x3_t &m1, + const float32x4x3_t &m2) { float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2); out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1); @@ -165,6 +151,6 @@ int get_input_num_elems_processed<3>(unsigned int num_elems_written_per_iteratio { return num_elems_written_per_iteration * 3; } -} +} // namespace detail } // namespace arm_compute -#endif /* ARM_COMPUTE_NECONVOLUTIONKERNEL3x3_H */ \ No newline at end of file +#endif /* ARM_COMPUTE_NECONVOLUTIONKERNEL3x3_H */ diff --git a/src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h b/src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h index 7ba52a16b7..fd1ee54597 100644 --- a/src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h +++ b/src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h @@ -45,14 +45,7 @@ namespace detail inline float32x4x3_t load_matrix_row(const float *ptr, int weights_offset = 0) { ARM_COMPUTE_UNUSED(weights_offset); - const float32x4x3_t r = - { - { - vld1q_dup_f32(ptr), - vld1q_dup_f32(1 + ptr), - vld1q_dup_f32(2 + ptr) - } - }; + const float32x4x3_t r = {{vld1q_dup_f32(ptr), vld1q_dup_f32(1 + ptr), vld1q_dup_f32(2 + ptr)}}; return r; } @@ -63,21 +56,16 @@ inline float32x4x3_t load_matrix_row(const float *ptr, int weights_offset = 0) * * @return The loaded matrix. */ -template < typename T, ARM_COMPUTE_REQUIRES_TA(std::is_same::value || std::is_same::value) > +template ::value || std::is_same::value)> inline int32x4x3_t load_matrix_row(const T *ptr, int weights_offset = 0) { const int32x4_t v_weights_offset = vdupq_n_s32(weights_offset); /* ptr is a pointer to a row in a 3x3 matrix, the function returns 3 vectors holding exactly the same value in all lanes: r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */ - int32x4x3_t r = - { - { - vaddq_s32(v_weights_offset, vdupq_n_s32(*ptr)), - vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 1))), - vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 2))) - } - }; + int32x4x3_t r = {{vaddq_s32(v_weights_offset, vdupq_n_s32(*ptr)), + vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 1))), + vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 2)))}}; return r; } @@ -245,36 +233,23 @@ inline void accumulate_results<3>(float16_t *buffer, const float16x8x2_t &values * @param[in] input_offset (Optional) Input quantization offset. 
* */ -inline float32x4_t single_convolve_3x3_dilation(const float *in_top, const float *in_mid, const float *in_low, - const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, - const size_t dilation_x, int input_offset) +inline float32x4_t single_convolve_3x3_dilation(const float *in_top, + const float *in_mid, + const float *in_low, + const float32x4x3_t &m0, + const float32x4x3_t &m1, + const float32x4x3_t &m2, + const size_t dilation_x, + int input_offset) { ARM_COMPUTE_UNUSED(input_offset); - const float32x4x3_t vtop = - { - { - vld1q_f32(in_top), - vld1q_f32(in_top + dilation_x), - vld1q_f32(in_top + 2 * dilation_x) - } - }; - const float32x4x3_t vmid = - { - { - vld1q_f32(in_mid), - vld1q_f32(in_mid + dilation_x), - vld1q_f32(in_mid + 2 * dilation_x) - } - }; - const float32x4x3_t vlow = - { - { - vld1q_f32(in_low), - vld1q_f32(in_low + dilation_x), - vld1q_f32(in_low + 2 * dilation_x) - } - }; + const float32x4x3_t vtop = { + {vld1q_f32(in_top), vld1q_f32(in_top + dilation_x), vld1q_f32(in_top + 2 * dilation_x)}}; + const float32x4x3_t vmid = { + {vld1q_f32(in_mid), vld1q_f32(in_mid + dilation_x), vld1q_f32(in_mid + 2 * dilation_x)}}; + const float32x4x3_t vlow = { + {vld1q_f32(in_low), vld1q_f32(in_low + dilation_x), vld1q_f32(in_low + 2 * dilation_x)}}; float32x4_t out = vmulq_f32(vtop.val[0], m0.val[0]); out = vmlaq_f32(out, vtop.val[1], m0.val[1]); out = vmlaq_f32(out, vtop.val[2], m0.val[2]); @@ -303,26 +278,28 @@ inline float32x4_t single_convolve_3x3_dilation(const float *in_top, const float * @param[in] input_offset (Optional) Input quantization offset. * */ -inline float32x4x2_t convolve_3x3_dilation(const float *in_top, const float *in_mid, const float *in_low, - const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, - const size_t dilation_x, unsigned int stridex, int input_offset = 0) +inline float32x4x2_t convolve_3x3_dilation(const float *in_top, + const float *in_mid, + const float *in_low, + const float32x4x3_t &m0, + const float32x4x3_t &m1, + const float32x4x3_t &m2, + const size_t dilation_x, + unsigned int stridex, + int input_offset = 0) { ARM_COMPUTE_ERROR_ON(stridex > 3); - float32x4x2_t out = - { - { - single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset), - single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset) - } - }; + float32x4x2_t out = { + {single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset), + single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset)}}; - if(stridex == 2) + if (stridex == 2) { out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1); out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2); out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3); } - else if(stridex == 3) + else if (stridex == 3) { out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1); } @@ -344,26 +321,32 @@ inline float32x4x2_t convolve_3x3_dilation(const float *in_top, const float *in_ * */ template -void convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, float *out_ptr, - const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, - unsigned int stridex, int input_offset = 0); +void convolve_3x3(const float *in_top, + const float *in_mid, + const float *in_low, + float *out_ptr, + const float32x4x3_t &m0, + const float32x4x3_t &m1, + const 
float32x4x3_t &m2, + unsigned int stridex, + int input_offset = 0); template -inline void convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, float *out_ptr, - const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, - unsigned int stridex, int input_offset) +inline void convolve_3x3(const float *in_top, + const float *in_mid, + const float *in_low, + float *out_ptr, + const float32x4x3_t &m0, + const float32x4x3_t &m1, + const float32x4x3_t &m2, + unsigned int stridex, + int input_offset) { ARM_COMPUTE_UNUSED(input_offset); ARM_COMPUTE_ERROR_ON(stridex > 3); - float32x4x2_t out = - { - { - vdupq_n_f32(0.f), - vdupq_n_f32(0.f) - } - }; - if(stridex == 2) + float32x4x2_t out = {{vdupq_n_f32(0.f), vdupq_n_f32(0.f)}}; + if (stridex == 2) { const float32x4x2_t vtop = vld2q_f32(in_top); const float32x4x2_t vmid = vld2q_f32(in_mid); @@ -389,32 +372,11 @@ inline void convolve_3x3(const float *in_top, const float *in_mid, const float * } else { - const float32x4x3_t vtop = - { - { - vld1q_f32(in_top), - vld1q_f32(in_top + 4), - vld1q_f32(in_top + 8) - } - }; - const float32x4x3_t vmid = - { - { - vld1q_f32(in_mid), - vld1q_f32(in_mid + 4), - vld1q_f32(in_mid + 8) - } - }; - const float32x4x3_t vlow = - { - { - vld1q_f32(in_low), - vld1q_f32(in_low + 4), - vld1q_f32(in_low + 8) - } - }; - out.val[0] = vmulq_f32(vtop.val[0], m0.val[0]); - out.val[1] = vmulq_f32(vtop.val[1], m0.val[0]); + const float32x4x3_t vtop = {{vld1q_f32(in_top), vld1q_f32(in_top + 4), vld1q_f32(in_top + 8)}}; + const float32x4x3_t vmid = {{vld1q_f32(in_mid), vld1q_f32(in_mid + 4), vld1q_f32(in_mid + 8)}}; + const float32x4x3_t vlow = {{vld1q_f32(in_low), vld1q_f32(in_low + 4), vld1q_f32(in_low + 8)}}; + out.val[0] = vmulq_f32(vtop.val[0], m0.val[0]); + out.val[1] = vmulq_f32(vtop.val[1], m0.val[0]); out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]); out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]); @@ -438,7 +400,7 @@ inline void convolve_3x3(const float *in_top, const float *in_mid, const float * out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 1), m2.val[1]); out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 2), m2.val[2]); - if(stridex == 3) + if (stridex == 3) { out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1); accumulate ? accumulate_results<3>(out_ptr, out) : store_results<3>(out_ptr, out); @@ -462,65 +424,43 @@ inline void convolve_3x3(const float *in_top, const float *in_mid, const float * * @param[in] input_offset Input quantization offset. 
* */ -template < typename T, ARM_COMPUTE_REQUIRES_TA(std::is_same::value || std::is_same::value) > -inline int32x4_t single_convolve_3x3_dilation(const T *in_top, const T *in_mid, const T *in_low, - const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, - size_t dilation_x, int32_t input_offset) +template ::value || std::is_same::value)> +inline int32x4_t single_convolve_3x3_dilation(const T *in_top, + const T *in_mid, + const T *in_low, + const int32x4x3_t &m0, + const int32x4x3_t &m1, + const int32x4x3_t &m2, + size_t dilation_x, + int32_t input_offset) { using VectorType = typename std::conditional::value, uint8x8x3_t, int8x8x3_t>::type; using OutputTagType = typename wrapper::traits::neon_bitvector_tag_t; const int32x4_t v_input_offset = wrapper::vdup_n(input_offset, OutputTagType{}); - const VectorType vtop = - { - { - wrapper::vload(in_top), - wrapper::vload(in_top + dilation_x), - wrapper::vload(in_top + 2 * dilation_x) - } - }; - const VectorType vmid = - { - { - wrapper::vload(in_mid), - wrapper::vload(in_mid + dilation_x), - wrapper::vload(in_mid + 2 * dilation_x) - } - }; - const VectorType vlow = - { - { - wrapper::vload(in_low), - wrapper::vload(in_low + dilation_x), - wrapper::vload(in_low + 2 * dilation_x) - } - }; - - const int32x4x3_t vtop_s32 = - { - { - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[2])))), - } - }; - const int32x4x3_t vmid_s32 = - { - { - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[2])))), - } - }; - const int32x4x3_t vlow_s32 = - { - { - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[2])))), - } - }; + const VectorType vtop = { + {wrapper::vload(in_top), wrapper::vload(in_top + dilation_x), wrapper::vload(in_top + 2 * dilation_x)}}; + const VectorType vmid = { + {wrapper::vload(in_mid), wrapper::vload(in_mid + dilation_x), wrapper::vload(in_mid + 2 * dilation_x)}}; + const VectorType vlow = { + {wrapper::vload(in_low), wrapper::vload(in_low + dilation_x), wrapper::vload(in_low + 2 * dilation_x)}}; + + const int32x4x3_t vtop_s32 = {{ + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[2])))), + }}; + const int32x4x3_t vmid_s32 = {{ + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[2])))), + }}; + const int32x4x3_t vlow_s32 = {{ + wrapper::vaddw(v_input_offset, 
wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[2])))), + }}; int32x4_t out = wrapper::vmul(vtop_s32.val[0], m0.val[0]); out = wrapper::vmla(out, vtop_s32.val[1], m0.val[1]); @@ -550,26 +490,29 @@ inline int32x4_t single_convolve_3x3_dilation(const T *in_top, const T *in_mid, * @param[in] input_offset Input quantization offset. * */ -template < typename T, ARM_COMPUTE_REQUIRES_TA(std::is_same::value || std::is_same::value) > -inline int32x4x2_t convolve_3x3_dilation(const T *in_top, const T *in_mid, const T *in_low, const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, - const size_t dilation_x, unsigned int stridex, int input_offset) +template ::value || std::is_same::value)> +inline int32x4x2_t convolve_3x3_dilation(const T *in_top, + const T *in_mid, + const T *in_low, + const int32x4x3_t &m0, + const int32x4x3_t &m1, + const int32x4x3_t &m2, + const size_t dilation_x, + unsigned int stridex, + int input_offset) { ARM_COMPUTE_ERROR_ON(stridex > 3); - int32x4x2_t out = - { - { - single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset), - single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset) - } - }; + int32x4x2_t out = { + {single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset), + single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset)}}; - if(stridex == 2) + if (stridex == 2) { out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 2), out.val[0], 1); out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 0), out.val[0], 2); out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 2), out.val[0], 3); } - else if(stridex == 3) + else if (stridex == 3) { out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 3), out.val[0], 1); } @@ -589,10 +532,19 @@ inline int32x4x2_t convolve_3x3_dilation(const T *in_top, const T *in_mid, const * @param[in] input_offset Input quantization offset. 
* */ -template < bool accumulate, typename T1, typename T2, ARM_COMPUTE_REQUIRES_TA(std::is_same::value || std::is_same::value) > -void convolve_3x3(const T1 *in_top, const T1 *in_mid, const T1 *in_low, T2 *out_ptr, - const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, - unsigned int stridex, int32_t input_offset) +template ::value || std::is_same::value)> +void convolve_3x3(const T1 *in_top, + const T1 *in_mid, + const T1 *in_low, + T2 *out_ptr, + const int32x4x3_t &m0, + const int32x4x3_t &m1, + const int32x4x3_t &m2, + unsigned int stridex, + int32_t input_offset) { ARM_COMPUTE_ERROR_ON(stridex > 3); using VectorType = typename std::conditional::value, uint8x8x2_t, int8x8x2_t>::type; @@ -600,60 +552,30 @@ void convolve_3x3(const T1 *in_top, const T1 *in_mid, const T1 *in_low, T2 *out_ const int32x4_t v_input_offset = wrapper::vdup_n(input_offset, OutputTagType{}); - const VectorType vtop = - { - { - wrapper::vload(in_top), - wrapper::vload(in_top + 8) - } - }; - const VectorType vmid = - { - { - wrapper::vload(in_mid), - wrapper::vload(in_mid + 8) - } - }; - const VectorType vlow = - { - { - wrapper::vload(in_low), - wrapper::vload(in_low + 8) - } - }; - - const int32x4x3_t vtop_s32 = - { - { - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vtop.val[0])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))), - } - }; - const int32x4x3_t vmid_s32 = - { - { - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vmid.val[0])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))), - } - }; - const int32x4x3_t vlow_s32 = - { - { - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vlow.val[0])))), - wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))), - } - }; - - int32x4x2_t out - { - { - wrapper::vdup_n(static_cast(0), OutputTagType{}), - wrapper::vdup_n(static_cast(0), OutputTagType{}), - } - }; + const VectorType vtop = {{wrapper::vload(in_top), wrapper::vload(in_top + 8)}}; + const VectorType vmid = {{wrapper::vload(in_mid), wrapper::vload(in_mid + 8)}}; + const VectorType vlow = {{wrapper::vload(in_low), wrapper::vload(in_low + 8)}}; + + const int32x4x3_t vtop_s32 = {{ + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vtop.val[0])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))), + }}; + const int32x4x3_t vmid_s32 = {{ + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vmid.val[0])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))), + }}; + const int32x4x3_t vlow_s32 = {{ + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))), + wrapper::vaddw(v_input_offset, 
wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vlow.val[0])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))), + }}; + + int32x4x2_t out{{ + wrapper::vdup_n(static_cast(0), OutputTagType{}), + wrapper::vdup_n(static_cast(0), OutputTagType{}), + }}; // 0 out.val[0] = wrapper::vmla(out.val[0], vtop_s32.val[0], m0.val[0]); @@ -681,11 +603,11 @@ void convolve_3x3(const T1 *in_top, const T1 *in_mid, const T1 *in_low, T2 *out_ out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_1(vlow_s32.val[1], vlow_s32.val[2]), m2.val[1]); out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_2(vlow_s32.val[1], vlow_s32.val[2]), m2.val[2]); - if(stridex == 1) + if (stridex == 1) { accumulate ? accumulate_results<1>(out_ptr, out) : store_results<1>(out_ptr, out); } - else if(stridex == 2) + else if (stridex == 2) { out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 2), out.val[0], 1); out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 0), out.val[0], 2); @@ -693,7 +615,7 @@ void convolve_3x3(const T1 *in_top, const T1 *in_mid, const T1 *in_low, T2 *out_ accumulate ? accumulate_results<2>(out_ptr, out) : store_results<2>(out_ptr, out); } - else if(stridex == 3) + else if (stridex == 3) { out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 3), out.val[0], 1); accumulate ? accumulate_results<3>(out_ptr, out) : store_results<3>(out_ptr, out); @@ -712,14 +634,7 @@ inline float16x8x3_t load_matrix_row(const float16_t *ptr, int weights_offset = ARM_COMPUTE_UNUSED(weights_offset); /* ptr is a pointer to a row in a 3x3 matrix, the function returns 3 vectors holding exactly the same value in all lanes: r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */ - const float16x8x3_t r = - { - { - vld1q_dup_f16(ptr), - vld1q_dup_f16(1 + ptr), - vld1q_dup_f16(2 + ptr) - } - }; + const float16x8x3_t r = {{vld1q_dup_f16(ptr), vld1q_dup_f16(1 + ptr), vld1q_dup_f16(2 + ptr)}}; return r; } @@ -735,35 +650,22 @@ inline float16x8x3_t load_matrix_row(const float16_t *ptr, int weights_offset = * @param[in] input_offset (Optional)Input quantization offset. 
* */ -inline float16x8_t single_convolve_3x3_dilation(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, - const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2, - const size_t dilation_x, int input_offset = 0) +inline float16x8_t single_convolve_3x3_dilation(const float16_t *in_top, + const float16_t *in_mid, + const float16_t *in_low, + const float16x8x3_t &m0, + const float16x8x3_t &m1, + const float16x8x3_t &m2, + const size_t dilation_x, + int input_offset = 0) { ARM_COMPUTE_UNUSED(input_offset); - const float16x8x3_t vtop = - { - { - vld1q_f16(in_top), - vld1q_f16(in_top + dilation_x), - vld1q_f16(in_top + 2 * dilation_x) - } - }; - const float16x8x3_t vmid = - { - { - vld1q_f16(in_mid), - vld1q_f16(in_mid + dilation_x), - vld1q_f16(in_mid + 2 * dilation_x) - } - }; - const float16x8x3_t vlow = - { - { - vld1q_f16(in_low), - vld1q_f16(in_low + dilation_x), - vld1q_f16(in_low + 2 * dilation_x) - } - }; + const float16x8x3_t vtop = { + {vld1q_f16(in_top), vld1q_f16(in_top + dilation_x), vld1q_f16(in_top + 2 * dilation_x)}}; + const float16x8x3_t vmid = { + {vld1q_f16(in_mid), vld1q_f16(in_mid + dilation_x), vld1q_f16(in_mid + 2 * dilation_x)}}; + const float16x8x3_t vlow = { + {vld1q_f16(in_low), vld1q_f16(in_low + dilation_x), vld1q_f16(in_low + 2 * dilation_x)}}; float16x8_t out = vmulq_f16(vtop.val[0], m0.val[0]); out = vaddq_f16(out, vmulq_f16(vtop.val[1], m0.val[1])); out = vaddq_f16(out, vmulq_f16(vtop.val[2], m0.val[2])); @@ -792,19 +694,21 @@ inline float16x8_t single_convolve_3x3_dilation(const float16_t *in_top, const f * @param[in] input_offset (Optional) Input quantization offset. * */ -inline float16x8x2_t convolve_3x3_dilation(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, - const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2, - const size_t dilation_x, unsigned int stridex, int input_offset = 0) -{ - float16x8x2_t out = - { - { - single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset), - single_convolve_3x3_dilation(in_top + 8, in_mid + 8, in_low + 8, m0, m1, m2, dilation_x, input_offset) - } - }; - - if(stridex == 2) +inline float16x8x2_t convolve_3x3_dilation(const float16_t *in_top, + const float16_t *in_mid, + const float16_t *in_low, + const float16x8x3_t &m0, + const float16x8x3_t &m1, + const float16x8x3_t &m2, + const size_t dilation_x, + unsigned int stridex, + int input_offset = 0) +{ + float16x8x2_t out = { + {single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset), + single_convolve_3x3_dilation(in_top + 8, in_mid + 8, in_low + 8, m0, m1, m2, dilation_x, input_offset)}}; + + if (stridex == 2) { out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 2), out.val[0], 1); out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 4), out.val[0], 2); @@ -814,7 +718,7 @@ inline float16x8x2_t convolve_3x3_dilation(const float16_t *in_top, const float1 out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 4), out.val[0], 6); out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 6), out.val[0], 7); } - else if(stridex == 3) + else if (stridex == 3) { out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 3), out.val[0], 1); out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 6), out.val[0], 2); @@ -838,20 +742,20 @@ inline float16x8x2_t convolve_3x3_dilation(const float16_t *in_top, const float1 * */ template -inline void convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const float16_t 
*in_low, float16_t *out_ptr, - const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2, - unsigned int stridex, int input_offset = 0) +inline void convolve_3x3(const float16_t *in_top, + const float16_t *in_mid, + const float16_t *in_low, + float16_t *out_ptr, + const float16x8x3_t &m0, + const float16x8x3_t &m1, + const float16x8x3_t &m2, + unsigned int stridex, + int input_offset = 0) { ARM_COMPUTE_UNUSED(input_offset); - float16x8x2_t out = - { - { - vdupq_n_f16(0), - vdupq_n_f16(0) - } - }; - if(stridex == 2) + float16x8x2_t out = {{vdupq_n_f16(0), vdupq_n_f16(0)}}; + if (stridex == 2) { const float16x8x2_t vtop = vld2q_f16(in_top); const float16x8x2_t vmid = vld2q_f16(in_mid); @@ -877,32 +781,11 @@ inline void convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const } else { - const float16x8x3_t vtop = - { - { - vld1q_f16(in_top), - vld1q_f16(in_top + 8), - vld1q_f16(in_top + 16) - } - }; - const float16x8x3_t vmid = - { - { - vld1q_f16(in_mid), - vld1q_f16(in_mid + 8), - vld1q_f16(in_mid + 16) - } - }; - const float16x8x3_t vlow = - { - { - vld1q_f16(in_low), - vld1q_f16(in_low + 8), - vld1q_f16(in_low + 16) - } - }; - out.val[0] = vmulq_f16(vtop.val[0], m0.val[0]); - out.val[1] = vmulq_f16(vtop.val[1], m0.val[0]); + const float16x8x3_t vtop = {{vld1q_f16(in_top), vld1q_f16(in_top + 8), vld1q_f16(in_top + 16)}}; + const float16x8x3_t vmid = {{vld1q_f16(in_mid), vld1q_f16(in_mid + 8), vld1q_f16(in_mid + 16)}}; + const float16x8x3_t vlow = {{vld1q_f16(in_low), vld1q_f16(in_low + 8), vld1q_f16(in_low + 16)}}; + out.val[0] = vmulq_f16(vtop.val[0], m0.val[0]); + out.val[1] = vmulq_f16(vtop.val[1], m0.val[0]); out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vtop.val[0], vtop.val[1], 1), m0.val[1])); out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vtop.val[0], vtop.val[1], 2), m0.val[2])); @@ -921,7 +804,7 @@ inline void convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vlow.val[1], vlow.val[2], 1), m2.val[1])); out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vlow.val[1], vlow.val[2], 2), m2.val[2])); - if(stridex == 3) + if (stridex == 3) { out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 3), out.val[0], 1); out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 6), out.val[0], 2); @@ -946,7 +829,7 @@ inline void convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const */ inline int get_input_num_elems_processed(unsigned int num_elems_written_per_iteration, unsigned int stridex) { - switch(stridex) + switch (stridex) { case 1: return num_elems_written_per_iteration; @@ -959,6 +842,6 @@ inline int get_input_num_elems_processed(unsigned int num_elems_written_per_iter return 0; } } -} +} // namespace detail } // namespace arm_compute #endif /* ARM_COMPUTE_NEDIRECTCONVOLUTIONDETAIL_H */ -- cgit v1.2.1
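
Note (illustrative aside, not part of the patch above): the reformatted hunks apply two conventions consistently — signatures that exceed the column limit are broken with one parameter per line aligned to the opening parenthesis, and NEON aggregate initialisers are collapsed onto a single line where they fit. A minimal sketch of that style on a hypothetical helper follows; accumulate_rows is invented for illustration and is not a function from the library or from the (undelivered) clang-format configuration.

    #include <arm_neon.h>

    // Hypothetical helper showing the formatting style seen in the hunks above:
    // one parameter per line for long signatures, single-line brace initialisation.
    inline float32x4_t accumulate_rows(const float *in_top,
                                       const float *in_mid,
                                       const float *in_low,
                                       const float32x4x3_t &weights,
                                       float32x4_t acc)
    {
        // Load one 4-float vector per row and accumulate acc += row * weight.
        const float32x4x3_t v = {{vld1q_f32(in_top), vld1q_f32(in_mid), vld1q_f32(in_low)}};
        acc = vmlaq_f32(acc, v.val[0], weights.val[0]);
        acc = vmlaq_f32(acc, v.val[1], weights.val[1]);
        acc = vmlaq_f32(acc, v.val[2], weights.val[2]);
        return acc;
    }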