author     Felix Thomasmathibalan <felixjohnny.thomasmathibalan@arm.com>      2023-09-27 17:46:17 +0100
committer  felixjohnny.thomasmathibalan <felixjohnny.thomasmathibalan@arm.com>  2023-09-28 12:08:05 +0000
commit     afd38f0c617d6f89b2b4532c6c44f116617e2b6f
tree       03bc7d5a762099989b16a656fa8d397b490ed70e
parent     bdcb4c148ee2fdeaaddf4cf1e57bbb0de02bb894
download   ComputeLibrary-afd38f0c617d6f89b2b4532c6c44f116617e2b6f.tar.gz
Apply clang-format on repository
Code is formatted as per a revised clang-format configuration file (not part of this delivery). Version 14.0.6 is used.

Exclusion List:
- files with .cl extension
- files that are not strictly C/C++ (e.g. Android.bp, SConscript ...)

And the following directories:
- compute_kernel_writer/validation/
- tests/
- include/
- src/core/NEON/kernels/convolution/
- src/core/NEON/kernels/arm_gemm/
- src/core/NEON/kernels/arm_conv/
- data/

There will be a follow-up for formatting of the .cl files and the files under tests/ and compute_kernel_writer/validation/.

Signed-off-by: Felix Thomasmathibalan <felixjohnny.thomasmathibalan@arm.com>
Change-Id: Ib7eb1fcf4e7537b9feaefcfc15098a804a3fde0a
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10391
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
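The revised .clang-format file is explicitly not part of this delivery, so the sketch below is an inference, not the shipped configuration: a clang-format 14 style whose options are consistent with what the hunks show (braces on their own line, including lambda bodies; a space after control-flow keywords; right-aligned pointers; a column limit of roughly 120; one argument or parameter per line when a call or signature wraps; includes sorted case-insensitively and regrouped, which is why src/core/common now sorts before src/core/CPP and src/core/NEON). The option keys are genuine clang-format 14 options; every value is an assumption read off the formatted output.

    # Speculative reconstruction; the real revised configuration is not part of this commit.
    Language:          Cpp
    Standard:          c++14
    ColumnLimit:       120
    IndentWidth:       4
    PointerAlignment:  Right             # matches "const ITensorInfo *input"
    SpaceBeforeParens: ControlStatements # produces "if (...)", "for (...)", "switch (...)"
    AlignConsecutiveAssignments: Consecutive
    BinPackArguments:  false             # wrapped calls: one argument per line
    BinPackParameters: false             # wrapped signatures: one parameter per line
    AlignAfterOpenBracket: Align
    SortIncludes:      CaseInsensitive
    IncludeBlocks:     Regroup
    BreakBeforeBraces: Custom
    BraceWrapping:
      AfterClass:            true
      AfterControlStatement: Always
      AfterEnum:             true
      AfterFunction:         true
      AfterNamespace:        true
      AfterStruct:           true
      BeforeCatch:           true
      BeforeElse:            true
      BeforeLambdaBody:      true        # lambda bodies open with a brace on its own line

Given such a file at the repository root, the commit would have been produced by something equivalent to running clang-format 14.0.6 with -i over the tracked C/C++ sources, skipping the exclusion list above; the exact invocation is likewise not recorded here.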
Diffstat (limited to 'src/core/NEON/kernels')
-rw-r--r--  src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp  302
-rw-r--r--  src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h  21
-rw-r--r--  src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp  127
-rw-r--r--  src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h  13
-rw-r--r--  src/core/NEON/kernels/NEBitwiseAndKernel.cpp  17
-rw-r--r--  src/core/NEON/kernels/NEBitwiseNotKernel.cpp  14
-rw-r--r--  src/core/NEON/kernels/NEBitwiseOrKernel.cpp  18
-rw-r--r--  src/core/NEON/kernels/NEBitwiseXorKernel.cpp  18
-rw-r--r--  src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp  68
-rw-r--r--  src/core/NEON/kernels/NEBoundingBoxTransformKernel.h  8
-rw-r--r--  src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp  97
-rw-r--r--  src/core/NEON/kernels/NECol2ImKernel.h  4
-rw-r--r--  src/core/NEON/kernels/NECropKernel.cpp  238
-rw-r--r--  src/core/NEON/kernels/NECropKernel.h  19
-rw-r--r--  src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp  76
-rw-r--r--  src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp  149
-rw-r--r--  src/core/NEON/kernels/NEFFTDigitReverseKernel.h  6
-rw-r--r--  src/core/NEON/kernels/NEFFTRadixStageKernel.cpp  594
-rw-r--r--  src/core/NEON/kernels/NEFFTRadixStageKernel.h  14
-rw-r--r--  src/core/NEON/kernels/NEFFTScaleKernel.cpp  21
-rw-r--r--  src/core/NEON/kernels/NEFFTScaleKernel.h  4
-rw-r--r--  src/core/NEON/kernels/NEFillBorderKernel.cpp  225
-rw-r--r--  src/core/NEON/kernels/NEFillBorderKernel.h  11
-rw-r--r--  src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp  244
-rw-r--r--  src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h  39
-rw-r--r--  src/core/NEON/kernels/NEGatherKernel.cpp  80
-rw-r--r--  src/core/NEON/kernels/NEGatherKernel.h  5
-rw-r--r--  src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp  48
-rw-r--r--  src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h  2
-rw-r--r--  src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp  57
-rw-r--r--  src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h  8
-rw-r--r--  src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp  59
-rw-r--r--  src/core/NEON/kernels/NEL2NormalizeLayerKernel.h  3
-rw-r--r--  src/core/NEON/kernels/NELogicalKernel.cpp  91
-rw-r--r--  src/core/NEON/kernels/NELogicalKernel.h  5
-rw-r--r--  src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp  54
-rw-r--r--  src/core/NEON/kernels/NENormalizationLayerKernel.cpp  144
-rw-r--r--  src/core/NEON/kernels/NENormalizationLayerKernel.h  8
-rw-r--r--  src/core/NEON/kernels/NEPadLayerKernel.cpp  106
-rw-r--r--  src/core/NEON/kernels/NEPadLayerKernel.h  13
-rw-r--r--  src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp  166
-rw-r--r--  src/core/NEON/kernels/NEPriorBoxLayerKernel.h  14
-rw-r--r--  src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp  118
-rw-r--r--  src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h  33
-rw-r--r--  src/core/NEON/kernels/NEROIAlignLayerKernel.cpp  79
-rw-r--r--  src/core/NEON/kernels/NEROIAlignLayerKernel.h  5
-rw-r--r--  src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp  85
-rw-r--r--  src/core/NEON/kernels/NEROIPoolingLayerKernel.h  8
-rw-r--r--  src/core/NEON/kernels/NERangeKernel.cpp  90
-rw-r--r--  src/core/NEON/kernels/NERangeKernel.h  1
-rw-r--r--  src/core/NEON/kernels/NEReductionOperationKernel.cpp  1955
-rw-r--r--  src/core/NEON/kernels/NEReductionOperationKernel.h  3
-rw-r--r--  src/core/NEON/kernels/NEReorderKernel.cpp  70
-rw-r--r--  src/core/NEON/kernels/NEReorderKernel.h  33
-rw-r--r--  src/core/NEON/kernels/NEReorgLayerKernel.cpp  56
-rw-r--r--  src/core/NEON/kernels/NEReverseKernel.cpp  98
-rw-r--r--  src/core/NEON/kernels/NEReverseKernel.h  3
-rw-r--r--  src/core/NEON/kernels/NESelectKernel.cpp  156
-rw-r--r--  src/core/NEON/kernels/NESelectKernel.h  2
-rw-r--r--  src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp  161
-rw-r--r--  src/core/NEON/kernels/NESpaceToBatchLayerKernel.h  20
-rw-r--r--  src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp  59
-rw-r--r--  src/core/NEON/kernels/NESpaceToDepthLayerKernel.h  1
-rw-r--r--  src/core/NEON/kernels/NEStackLayerKernel.cpp  55
-rw-r--r--  src/core/NEON/kernels/NEStackLayerKernel.h  10
-rw-r--r--  src/core/NEON/kernels/NEStridedSliceKernel.cpp  115
-rw-r--r--  src/core/NEON/kernels/NEStridedSliceKernel.h  23
-rw-r--r--  src/core/NEON/kernels/NETileKernel.cpp  47
-rw-r--r--  src/core/NEON/kernels/assembly/depthwise.hpp  270
-rw-r--r--  src/core/NEON/kernels/assembly/depthwise_common.hpp  106
-rw-r--r--  src/core/NEON/kernels/assembly/pool_common.hpp  71
-rw-r--r--  src/core/NEON/kernels/assembly/pooling.hpp  210
-rw-r--r--  src/core/NEON/kernels/assembly/premultiply.hpp  17
-rw-r--r--  src/core/NEON/kernels/assembly/winograd.hpp  181
-rw-r--r--  src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp  166
-rw-r--r--  src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp  166
-rw-r--r--  src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp  115
-rw-r--r--  src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp  115
-rw-r--r--  src/core/NEON/kernels/batchnormalization/impl/list.h  6
-rw-r--r--  src/core/NEON/kernels/detail/NEActivationFunctionDetail.h  7
-rw-r--r--  src/core/NEON/kernels/detail/NEColorConvertHelper.inl  735
-rw-r--r--  src/core/NEON/kernels/detail/NEDirectConvolution3x3.h  80
-rw-r--r--  src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h  507
83 files changed, 4997 insertions, 4251 deletions
diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
index 108b199df7..deb89996a9 100644
--- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
@@ -28,18 +28,17 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+
+#include "src/core/common/Registrars.h"
#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/NEFixedPoint.h"
-#include "src/core/NEON/NEMath.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
+#include "src/core/NEON/kernels/batchnormalization/impl/list.h"
#include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h"
+#include "src/core/NEON/NEFixedPoint.h"
+#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/NEON/kernels/batchnormalization/impl/list.h"
-#include "src/core/common/Registrars.h"
-
#include <map>
namespace arm_compute
@@ -52,8 +51,15 @@ struct BatchNormalizationSelectorData
const CPUInfo &ci;
};
using BatchNormalizationSelectorPtr = std::add_pointer<bool(const BatchNormalizationSelectorData &data)>::type;
-using BatchNormalizationKernelPtr = std::add_pointer<void(ITensor *, ITensor *, const ITensor *, const ITensor *, const ITensor *, const ITensor *,
- float, ActivationLayerInfo &, const Window &)>::type;
+using BatchNormalizationKernelPtr = std::add_pointer<void(ITensor *,
+ ITensor *,
+ const ITensor *,
+ const ITensor *,
+ const ITensor *,
+ const ITensor *,
+ float,
+ ActivationLayerInfo &,
+ const Window &)>::type;
struct BatchNormalizationKernel
{
@@ -62,41 +68,32 @@ struct BatchNormalizationKernel
BatchNormalizationKernelPtr ukernel;
};
-static const BatchNormalizationKernel available_kernels[] =
-{
+static const BatchNormalizationKernel available_kernels[] = {
#if defined(ARM_COMPUTE_ENABLE_SVE)
- {
- "sve_fp16_batch_normalization",
- [](const BatchNormalizationSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_sve(); },
- REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_batch_normalization)
- },
- {
- "sve_fp32_batch_normalization",
- [](const BatchNormalizationSelectorData & data) { return data.dt == DataType::F32 && data.ci.has_sve(); },
- REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_batch_normalization)
- },
+ {"sve_fp16_batch_normalization",
+ [](const BatchNormalizationSelectorData &data) { return data.dt == DataType::F16 && data.ci.has_sve(); },
+ REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_batch_normalization)},
+ {"sve_fp32_batch_normalization",
+ [](const BatchNormalizationSelectorData &data) { return data.dt == DataType::F32 && data.ci.has_sve(); },
+ REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_batch_normalization)},
#endif /* !defined(ARM_COMPUTE_ENABLE_SVE) */
#if defined(ARM_COMPUTE_ENABLE_NEON)
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
- {
- "neon_fp16_batch_normalization",
- [](const BatchNormalizationSelectorData & data) { return data.dt == DataType::F16; },
- REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_batch_normalization)
- },
+ {"neon_fp16_batch_normalization",
+ [](const BatchNormalizationSelectorData &data) { return data.dt == DataType::F16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_batch_normalization)},
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- {
- "neon_fp32_batch_normalization",
- [](const BatchNormalizationSelectorData & data) { return data.dt == DataType::F32; },
- REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_batch_normalization)
- },
+ {"neon_fp32_batch_normalization",
+ [](const BatchNormalizationSelectorData &data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_batch_normalization)},
#endif /* !defined(ARM_COMPUTE_ENABLE_NEON) */
};
const BatchNormalizationKernel *get_implementation(const BatchNormalizationSelectorData &data)
{
- for(const auto &uk : available_kernels)
+ for (const auto &uk : available_kernels)
{
- if(uk.is_selected(data))
+ if (uk.is_selected(data))
{
return &uk;
}
@@ -104,25 +101,31 @@ const BatchNormalizationKernel *get_implementation(const BatchNormalizationSelec
return nullptr;
}
-Status
-validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *var,
- const ITensorInfo *beta, const ITensorInfo *gamma, float epsilon, ActivationLayerInfo act_info)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *mean,
+ const ITensorInfo *var,
+ const ITensorInfo *beta,
+ const ITensorInfo *gamma,
+ float epsilon,
+ ActivationLayerInfo act_info)
{
ARM_COMPUTE_UNUSED(epsilon);
- const auto *uk = get_implementation(BatchNormalizationSelectorData{ input->data_type(), CPUInfo::get() });
+ const auto *uk = get_implementation(BatchNormalizationSelectorData{input->data_type(), CPUInfo::get()});
ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
- if(act_info.enabled())
+ if (act_info.enabled())
{
ActivationLayerInfo::ActivationFunction act = act_info.activation();
- ARM_COMPUTE_RETURN_ERROR_ON(act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::RELU
- && act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU
- && act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
+ ARM_COMPUTE_RETURN_ERROR_ON(act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::RELU &&
+ act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU &&
+ act !=
+ ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
ARM_COMPUTE_RETURN_ERROR_ON(act_info.b() > act_info.a());
}
- if(nullptr != output)
+ if (nullptr != output)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
@@ -131,17 +134,18 @@ validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const IT
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, var);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, var);
- if(beta != nullptr)
+ if (beta != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, beta);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, beta);
}
- if(gamma != nullptr)
+ if (gamma != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, gamma);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, gamma);
}
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)) != mean->dimension(0));
+ ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(get_data_layout_dimension_index(
+ input->data_layout(), DataLayoutDimension::CHANNEL)) != mean->dimension(0));
return Status{};
}
@@ -169,10 +173,12 @@ void NEBatchNormalizationLayerKernel::batch_normalization_nchw(const Window &win
// Only compute denominator and constants once per feature map.
int slice = -1;
- const auto input_mean = reinterpret_cast<const T *>(_mean->ptr_to_element(Coordinates(0, 0)));
- const auto input_var = reinterpret_cast<const T *>(_var->ptr_to_element(Coordinates(0, 0)));
- const auto input_gamma = (_gamma != nullptr) ? reinterpret_cast<const T *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
- const auto input_beta = (_beta != nullptr) ? reinterpret_cast<const T *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_mean = reinterpret_cast<const T *>(_mean->ptr_to_element(Coordinates(0, 0)));
+ const auto input_var = reinterpret_cast<const T *>(_var->ptr_to_element(Coordinates(0, 0)));
+ const auto input_gamma =
+ (_gamma != nullptr) ? reinterpret_cast<const T *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_beta =
+ (_beta != nullptr) ? reinterpret_cast<const T *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
T mean = static_cast<T>(0);
T var = static_cast<T>(0);
@@ -186,80 +192,83 @@ void NEBatchNormalizationLayerKernel::batch_normalization_nchw(const Window &win
auto beta_vec = wrapper::vdup_n(beta, ExactTagType{});
auto denominator_vec = wrapper::vdup_n(denominator, ExactTagType{});
const auto epsilon_vec = wrapper::vdup_n(static_cast<T>(_epsilon), ExactTagType{});
- execute_window_loop(win_to_use, [&](const Coordinates & id)
- {
- const auto input_ptr = reinterpret_cast<const T *>(input.ptr());
- const auto output_ptr = reinterpret_cast<T *>(output.ptr());
-
- if(slice != id.z())
+ execute_window_loop(
+ win_to_use,
+ [&](const Coordinates &id)
{
- mean = input_mean[id.z()];
- var = input_var[id.z()];
- mean_vec = wrapper::vdup_n(mean, ExactTagType{});
- var_vec = wrapper::vdup_n(var, ExactTagType{});
- if(input_gamma != nullptr)
- {
- gamma = input_gamma[id.z()];
- gamma_vec = wrapper::vdup_n(gamma, ExactTagType{});
- }
- if(input_beta != nullptr)
+ const auto input_ptr = reinterpret_cast<const T *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<T *>(output.ptr());
+
+ if (slice != id.z())
{
- beta = input_beta[id.z()];
- beta_vec = wrapper::vdup_n(beta, ExactTagType{});
+ mean = input_mean[id.z()];
+ var = input_var[id.z()];
+ mean_vec = wrapper::vdup_n(mean, ExactTagType{});
+ var_vec = wrapper::vdup_n(var, ExactTagType{});
+ if (input_gamma != nullptr)
+ {
+ gamma = input_gamma[id.z()];
+ gamma_vec = wrapper::vdup_n(gamma, ExactTagType{});
+ }
+ if (input_beta != nullptr)
+ {
+ beta = input_beta[id.z()];
+ beta_vec = wrapper::vdup_n(beta, ExactTagType{});
+ }
+
+ // Calculate denominator
+ denominator_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
+ denominator = wrapper::vgetlane(denominator_vec, 0);
+ slice = id.z();
}
- // Calculate denominator
- denominator_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
- denominator = wrapper::vgetlane(denominator_vec, 0);
- slice = id.z();
- }
-
- // Perform core calculations using vector operations
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- // Calculate x bar
- const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec);
- const auto x_bar = wrapper::vmul(numerator, denominator_vec);
- auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec);
-
- // Perform fused activation
- if(fused_activation)
+ // Perform core calculations using vector operations
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- activation_functor(res);
+ // Calculate x bar
+ const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec);
+ const auto x_bar = wrapper::vmul(numerator, denominator_vec);
+ auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec);
+
+ // Perform fused activation
+ if (fused_activation)
+ {
+ activation_functor(res);
+ }
+
+ // Store results
+ wrapper::vstore(output_ptr + x, res);
}
- // Store results
- wrapper::vstore(output_ptr + x, res);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const T numerator = input_ptr[x] - mean;
- const T x_bar = numerator * denominator;
- T res = beta + x_bar * gamma;
-
- // Perform fused activation
- if(fused_activation)
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- activation_functor(res);
+ const T numerator = input_ptr[x] - mean;
+ const T x_bar = numerator * denominator;
+ T res = beta + x_bar * gamma;
+
+ // Perform fused activation
+ if (fused_activation)
+ {
+ activation_functor(res);
+ }
+
+ // Store results
+ *(output_ptr + x) = res;
}
-
- // Store results
- *(output_ptr + x) = res;
- }
- },
- input, output);
+ },
+ input, output);
}
void NEBatchNormalizationLayerKernel::configure_non_fused()
{
- switch(_input->info()->data_type())
+ switch (_input->info()->data_type())
{
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
- _func = &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, false, detail::dummy<float16_t, 8>>;
+ _func = &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, false,
+ detail::dummy<float16_t, 8>>;
break;
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F32:
@@ -274,23 +283,25 @@ void NEBatchNormalizationLayerKernel::configure_non_fused()
void NEBatchNormalizationLayerKernel::configure_fused()
{
// NCHW Fused Batched Normalization with activation functions : FP32
- static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f32_nchw =
- {
- { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, true, detail::relu<float, 4>> },
- { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, true, detail::brelu<float, 4>> },
- { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, true, detail::lubrelu<float, 4>> }
- };
+ static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f32_nchw = {
+ {ActivationLayerInfo::ActivationFunction::RELU,
+ &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, true, detail::relu<float, 4>>},
+ {ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+ &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, true, detail::brelu<float, 4>>},
+ {ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, true, detail::lubrelu<float, 4>>}};
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// NCHW Fused Batched Normalization with activation functions : FP16
- static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f16_nchw =
- {
- { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, true, detail::relu<float16_t, 8>> },
- { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, true, detail::brelu<float16_t, 8>> },
- { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, true, detail::lubrelu<float16_t, 8>> }
- };
+ static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f16_nchw = {
+ {ActivationLayerInfo::ActivationFunction::RELU,
+ &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, true, detail::relu<float16_t, 8>>},
+ {ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+ &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, true, detail::brelu<float16_t, 8>>},
+ {ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, true, detail::lubrelu<float16_t, 8>>}};
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- switch(_input->info()->data_type())
+ switch (_input->info()->data_type())
{
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
@@ -307,22 +318,32 @@ void NEBatchNormalizationLayerKernel::configure_fused()
}
NEBatchNormalizationLayerKernel::NEBatchNormalizationLayerKernel()
- : _func(nullptr), _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _gamma(nullptr), _beta(nullptr), _epsilon(), _act_info()
+ : _func(nullptr),
+ _input(nullptr),
+ _output(nullptr),
+ _mean(nullptr),
+ _var(nullptr),
+ _gamma(nullptr),
+ _beta(nullptr),
+ _epsilon(),
+ _act_info()
{
}
-void NEBatchNormalizationLayerKernel::configure(ITensor *input, ITensor *output,
- const ITensor *mean, const ITensor *var,
- const ITensor *beta, const ITensor *gamma,
- float epsilon, ActivationLayerInfo act_info)
+void NEBatchNormalizationLayerKernel::configure(ITensor *input,
+ ITensor *output,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta,
+ const ITensor *gamma,
+ float epsilon,
+ ActivationLayerInfo act_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, mean, var);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr,
- mean->info(), var->info(),
- (beta != nullptr) ? beta->info() : nullptr,
- (gamma != nullptr) ? gamma->info() : nullptr,
- epsilon, act_info));
+ mean->info(), var->info(), (beta != nullptr) ? beta->info() : nullptr,
+ (gamma != nullptr) ? gamma->info() : nullptr, epsilon, act_info));
_input = input;
_output = input;
@@ -334,16 +355,16 @@ void NEBatchNormalizationLayerKernel::configure(ITensor *input, ITensor *output,
_act_info = act_info;
const bool run_in_place = (output == nullptr) || (output == input);
- if(!run_in_place)
+ if (!run_in_place)
{
_output = output;
}
// Configure activation function to run
const bool is_nchw = _input->info()->data_layout() == DataLayout::NCHW;
- if(is_nchw)
+ if (is_nchw)
{
- if(_act_info.enabled())
+ if (_act_info.enabled())
{
configure_fused();
}
@@ -357,17 +378,21 @@ void NEBatchNormalizationLayerKernel::configure(ITensor *input, ITensor *output,
Window win = calculate_max_window(*input->info(), Steps());
INEKernel::configure(win);
- if(output != nullptr)
+ if (output != nullptr)
{
// Output auto initialization if not yet initialized
auto_init_if_empty(*output->info(), *input->info()->clone());
}
}
-Status NEBatchNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
- const ITensorInfo *mean, const ITensorInfo *var,
- const ITensorInfo *beta, const ITensorInfo *gamma,
- float epsilon, ActivationLayerInfo act_info)
+Status NEBatchNormalizationLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *mean,
+ const ITensorInfo *var,
+ const ITensorInfo *beta,
+ const ITensorInfo *gamma,
+ float epsilon,
+ ActivationLayerInfo act_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, var, beta, gamma, epsilon, act_info));
@@ -382,13 +407,14 @@ void NEBatchNormalizationLayerKernel::run(const Window &window, const ThreadInfo
ARM_COMPUTE_ERROR_ON(_func == nullptr && _input->info()->data_layout() == DataLayout::NCHW);
const bool is_nchw = _input->info()->data_layout() == DataLayout::NCHW;
- if(is_nchw)
+ if (is_nchw)
{
(this->*_func)(window);
}
else
{
- const auto *uk = get_implementation(BatchNormalizationSelectorData{ _input->info()->data_type(), CPUInfo::get() });
+ const auto *uk =
+ get_implementation(BatchNormalizationSelectorData{_input->info()->data_type(), CPUInfo::get()});
uk->ukernel(_input, _output, _mean, _var, _beta, _gamma, _epsilon, _act_info, window);
}
}
diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
index 0551ace30c..2e8ff0dc9a 100644
--- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
+++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NEBATCHNORMALIZATIONLAYERKERNEL_H
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/core/NEON/INEKernel.h"
namespace arm_compute
@@ -68,7 +69,13 @@ public:
* @param[in] epsilon (Optional) Small value to avoid division with zero. Default value is 0.001f.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
*/
- void configure(ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta = nullptr, const ITensor *gamma = nullptr, float epsilon = 0.001f,
+ void configure(ITensor *input,
+ ITensor *output,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta = nullptr,
+ const ITensor *gamma = nullptr,
+ float epsilon = 0.001f,
ActivationLayerInfo act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref NEBatchNormalizationLayerKernel
*
@@ -85,10 +92,14 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- const ITensorInfo *mean, const ITensorInfo *var,
- const ITensorInfo *beta = nullptr, const ITensorInfo *gamma = nullptr,
- float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *mean,
+ const ITensorInfo *var,
+ const ITensorInfo *beta = nullptr,
+ const ITensorInfo *gamma = nullptr,
+ float epsilon = 0.001f,
+ ActivationLayerInfo act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
diff --git a/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp
index 83fb5f6f51..f299bb94a4 100644
--- a/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp
@@ -27,8 +27,9 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -46,7 +47,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -54,7 +55,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf
return Status{};
}
-Status validate_arguments_static(const ITensorInfo *input, int block_shape_x, int block_shape_y, const ITensorInfo *output, const CropInfo &crop_info)
+Status validate_arguments_static(const ITensorInfo *input,
+ int block_shape_x,
+ int block_shape_y,
+ const ITensorInfo *output,
+ const CropInfo &crop_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
@@ -65,13 +70,14 @@ Status validate_arguments_static(const ITensorInfo *input, int block_shape_x, in
const int idx_batch = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_batch] % (block_shape_x * block_shape_y) != 0);
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- const TensorShape expected_output_shape = compute_batch_to_space_shape(input->data_layout(), input->tensor_shape(), block_shape_x, block_shape_y, crop_info);
- const TensorInfo expected_output = output->clone()->set_tensor_shape(expected_output_shape);
+ const TensorShape expected_output_shape = compute_batch_to_space_shape(
+ input->data_layout(), input->tensor_shape(), block_shape_x, block_shape_y, crop_info);
+ const TensorInfo expected_output = output->clone()->set_tensor_shape(expected_output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &expected_output);
}
@@ -80,7 +86,13 @@ Status validate_arguments_static(const ITensorInfo *input, int block_shape_x, in
} // namespace
NEBatchToSpaceLayerKernel::NEBatchToSpaceLayerKernel()
- : _input(nullptr), _block_shape(nullptr), _output(nullptr), _data_layout(DataLayout::UNKNOWN), _block_shape_x(), _block_shape_y(), _crop_info()
+ : _input(nullptr),
+ _block_shape(nullptr),
+ _output(nullptr),
+ _data_layout(DataLayout::UNKNOWN),
+ _block_shape_x(),
+ _block_shape_y(),
+ _crop_info()
{
}
@@ -99,15 +111,18 @@ void NEBatchToSpaceLayerKernel::configure(const ITensor *input, const ITensor *b
ICPPKernel::configure(win);
}
-void NEBatchToSpaceLayerKernel::configure(const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output, const CropInfo &crop_info)
+void NEBatchToSpaceLayerKernel::configure(
+ const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output, const CropInfo &crop_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- const TensorShape output_shape = compute_batch_to_space_shape(input->info()->data_layout(), input->info()->tensor_shape(), block_shape_x, block_shape_y);
+ const TensorShape output_shape = compute_batch_to_space_shape(
+ input->info()->data_layout(), input->info()->tensor_shape(), block_shape_x, block_shape_y);
// Output auto initialization if not yet initialized
auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
// Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, output->info(), crop_info));
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments_static(input->info(), block_shape_x, block_shape_y, output->info(), crop_info));
_input = input;
_output = output;
@@ -121,14 +136,19 @@ void NEBatchToSpaceLayerKernel::configure(const ITensor *input, int32_t block_sh
ICPPKernel::configure(win);
}
-Status NEBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
+Status
+NEBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_shape, output);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, block_shape, output));
return Status{};
}
-Status NEBatchToSpaceLayerKernel::validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output, const CropInfo &crop_info)
+Status NEBatchToSpaceLayerKernel::validate(const ITensorInfo *input,
+ int32_t block_shape_x,
+ int32_t block_shape_y,
+ const ITensorInfo *output,
+ const CropInfo &crop_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_static(input, block_shape_x, block_shape_y, output, crop_info));
@@ -141,7 +161,7 @@ void NEBatchToSpaceLayerKernel::run(const Window &window, const ThreadInfo &info
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
- if(_block_shape != nullptr)
+ if (_block_shape != nullptr)
{
// Retrieve the block shapes dynamically
_block_shape_x = *(reinterpret_cast<const int *>(_block_shape->ptr_to_element(0)));
@@ -155,31 +175,32 @@ void NEBatchToSpaceLayerKernel::run(const Window &window, const ThreadInfo &info
int batch_id = 0;
// Main loop for NCHW and NHWC
- if(_data_layout == DataLayout::NCHW)
+ if (_data_layout == DataLayout::NCHW)
{
do
{
Iterator out(_output, slice_out);
- execute_window_loop(slice_out, [&](const Coordinates & id)
- {
-
- const int x = id.x();
- const int y = id.y();
- const int z = id.z();
- // Translate x, y to uncropped version
- const int x_c = x + _crop_info.left;
- const int y_c = y + _crop_info.top;
-
- const int in_batch = batch_id + ((x_c % _block_shape_x) + (y_c % _block_shape_y) * _block_shape_x) * batch_size;
- const int in_x = x_c / _block_shape_x;
- const int in_y = y_c / _block_shape_y;
- Coordinates input_coords{ in_x, in_y, z, in_batch };
- memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
- },
- out);
+ execute_window_loop(
+ slice_out,
+ [&](const Coordinates &id)
+ {
+ const int x = id.x();
+ const int y = id.y();
+ const int z = id.z();
+ // Translate x, y to uncropped version
+ const int x_c = x + _crop_info.left;
+ const int y_c = y + _crop_info.top;
+
+ const int in_batch =
+ batch_id + ((x_c % _block_shape_x) + (y_c % _block_shape_y) * _block_shape_x) * batch_size;
+ const int in_x = x_c / _block_shape_x;
+ const int in_y = y_c / _block_shape_y;
+ Coordinates input_coords{in_x, in_y, z, in_batch};
+ memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
+ },
+ out);
++batch_id;
- }
- while(window.slide_window_slice_3D(slice_out));
+ } while (window.slide_window_slice_3D(slice_out));
}
else
{
@@ -188,26 +209,28 @@ void NEBatchToSpaceLayerKernel::run(const Window &window, const ThreadInfo &info
do
{
Iterator out(_output, slice_out);
- execute_window_loop(slice_out, [&](const Coordinates & id)
- {
-
- const int x = id.y();
- const int y = id.z();
-
- // Translate x, y to uncropped version
- const int x_c = x + _crop_info.left;
- const int y_c = y + _crop_info.top;
-
- const int in_batch = batch_id + ((x_c % _block_shape_x) + (y_c % _block_shape_y) * _block_shape_x) * batch_size;
- const int in_x = x_c / _block_shape_x;
- const int in_y = y_c / _block_shape_y;
- Coordinates input_coords{ 0, in_x, in_y, in_batch };
- memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size * _input->info()->dimension(0));
- },
- out);
+ execute_window_loop(
+ slice_out,
+ [&](const Coordinates &id)
+ {
+ const int x = id.y();
+ const int y = id.z();
+
+ // Translate x, y to uncropped version
+ const int x_c = x + _crop_info.left;
+ const int y_c = y + _crop_info.top;
+
+ const int in_batch =
+ batch_id + ((x_c % _block_shape_x) + (y_c % _block_shape_y) * _block_shape_x) * batch_size;
+ const int in_x = x_c / _block_shape_x;
+ const int in_y = y_c / _block_shape_y;
+ Coordinates input_coords{0, in_x, in_y, in_batch};
+ memcpy(out.ptr(), _input->ptr_to_element(input_coords),
+ element_size * _input->info()->dimension(0));
+ },
+ out);
++batch_id;
- }
- while(window.slide_window_slice_3D(slice_out));
+ } while (window.slide_window_slice_3D(slice_out));
}
}
} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h
index 5eceee0904..d98ac621b0 100644
--- a/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h
+++ b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NEBATCHTOSPACELAYERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/NEON/INEKernel.h"
namespace arm_compute
@@ -68,7 +69,11 @@ public:
* @param[out] output Tensor output. Data types supported: same as @p input
* @param[in] crop_info Specifies how the output shape is cropped after batch to space is performed
*/
- void configure(const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output, const CropInfo &crop_info = CropInfo{});
+ void configure(const ITensor *input,
+ int32_t block_shape_x,
+ int32_t block_shape_y,
+ ITensor *output,
+ const CropInfo &crop_info = CropInfo{});
/** Static function to check if given info will lead to a valid configuration of @ref NEBatchToSpaceLayerKernel
*
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
@@ -90,7 +95,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output, const CropInfo &crop_info = CropInfo{});
+ static Status validate(const ITensorInfo *input,
+ int32_t block_shape_x,
+ int32_t block_shape_y,
+ const ITensorInfo *output,
+ const CropInfo &crop_info = CropInfo{});
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
diff --git a/src/core/NEON/kernels/NEBitwiseAndKernel.cpp b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
index 677c5cddcc..a59bbd233b 100644
--- a/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
@@ -27,9 +27,10 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include <arm_neon.h>
#include <cstdint>
@@ -55,8 +56,7 @@ inline void bitwise_and(const T *__restrict input1, const T *__restrict input2,
}
} // namespace
-NEBitwiseAndKernel::NEBitwiseAndKernel()
- : _input1(nullptr), _input2(nullptr), _output(nullptr)
+NEBitwiseAndKernel::NEBitwiseAndKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr)
{
}
@@ -86,8 +86,7 @@ void NEBitwiseAndKernel::configure(const ITensor *input1, const ITensor *input2,
Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
- update_window_and_padding(win,
- AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration),
+ update_window_and_padding(win, AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration),
AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration),
output_access);
@@ -103,9 +102,7 @@ void NEBitwiseAndKernel::run(const Window &window, const ThreadInfo &info)
Iterator input2(_input2, window);
Iterator output(_output, window);
- execute_window_loop(window, [&](const Coordinates &)
- {
- bitwise_and<uint8_t>(input1.ptr(), input2.ptr(), output.ptr());
- },
- input1, input2, output);
+ execute_window_loop(
+ window, [&](const Coordinates &) { bitwise_and<uint8_t>(input1.ptr(), input2.ptr(), output.ptr()); }, input1,
+ input2, output);
}
diff --git a/src/core/NEON/kernels/NEBitwiseNotKernel.cpp b/src/core/NEON/kernels/NEBitwiseNotKernel.cpp
index 19b1af690a..ecd181a7af 100644
--- a/src/core/NEON/kernels/NEBitwiseNotKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseNotKernel.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -50,8 +51,7 @@ inline void bitwise_not_U8_U8(const uint8_t *__restrict input, uint8_t *__restri
}
} // namespace
-NEBitwiseNotKernel::NEBitwiseNotKernel()
- : _input(nullptr), _output(nullptr)
+NEBitwiseNotKernel::NEBitwiseNotKernel() : _input(nullptr), _output(nullptr)
{
}
@@ -77,7 +77,8 @@ void NEBitwiseNotKernel::configure(const ITensor *input, ITensor *output)
// Configure kernel window
Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
- update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration), output_access);
+ update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
+ output_access);
INEKernel::configure(win);
}
@@ -90,9 +91,6 @@ void NEBitwiseNotKernel::run(const Window &window, const ThreadInfo &info)
Iterator input(_input, window);
Iterator output(_output, window);
- execute_window_loop(window, [&](const Coordinates &)
- {
- bitwise_not_U8_U8(input.ptr(), output.ptr());
- },
- input, output);
+ execute_window_loop(
+ window, [&](const Coordinates &) { bitwise_not_U8_U8(input.ptr(), output.ptr()); }, input, output);
}
diff --git a/src/core/NEON/kernels/NEBitwiseOrKernel.cpp b/src/core/NEON/kernels/NEBitwiseOrKernel.cpp
index 08094fbfcf..4c906134aa 100644
--- a/src/core/NEON/kernels/NEBitwiseOrKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseOrKernel.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -42,7 +43,8 @@ class Coordinates;
namespace
{
-inline void bitwise_or_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t *__restrict input2, uint8_t *__restrict output)
+inline void
+bitwise_or_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t *__restrict input2, uint8_t *__restrict output)
{
const uint8x16_t val1 = vld1q_u8(input1);
const uint8x16_t val2 = vld1q_u8(input2);
@@ -51,8 +53,7 @@ inline void bitwise_or_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t
}
} // namespace
-NEBitwiseOrKernel::NEBitwiseOrKernel()
- : _input1(nullptr), _input2(nullptr), _output(nullptr)
+NEBitwiseOrKernel::NEBitwiseOrKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr)
{
}
@@ -82,8 +83,7 @@ void NEBitwiseOrKernel::configure(const ITensor *input1, const ITensor *input2,
Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
- update_window_and_padding(win,
- AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration),
+ update_window_and_padding(win, AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration),
AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration),
output_access);
@@ -99,9 +99,7 @@ void NEBitwiseOrKernel::run(const Window &window, const ThreadInfo &info)
Iterator input2(_input2, window);
Iterator output(_output, window);
- execute_window_loop(window, [&](const Coordinates &)
- {
- bitwise_or_U8_U8_U8(input1.ptr(), input2.ptr(), output.ptr());
- },
- input1, input2, output);
+ execute_window_loop(
+ window, [&](const Coordinates &) { bitwise_or_U8_U8_U8(input1.ptr(), input2.ptr(), output.ptr()); }, input1,
+ input2, output);
}
diff --git a/src/core/NEON/kernels/NEBitwiseXorKernel.cpp b/src/core/NEON/kernels/NEBitwiseXorKernel.cpp
index fc5b38b64f..dbbed2483c 100644
--- a/src/core/NEON/kernels/NEBitwiseXorKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseXorKernel.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -42,7 +43,8 @@ class Coordinates;
namespace
{
-inline void bitwise_xor_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t *__restrict input2, uint8_t *__restrict output)
+inline void
+bitwise_xor_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t *__restrict input2, uint8_t *__restrict output)
{
const uint8x16_t val1 = vld1q_u8(input1);
const uint8x16_t val2 = vld1q_u8(input2);
@@ -51,8 +53,7 @@ inline void bitwise_xor_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t
}
} // namespace
-NEBitwiseXorKernel::NEBitwiseXorKernel()
- : _input1(nullptr), _input2(nullptr), _output(nullptr)
+NEBitwiseXorKernel::NEBitwiseXorKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr)
{
}
@@ -82,7 +83,8 @@ void NEBitwiseXorKernel::configure(const ITensor *input1, const ITensor *input2,
AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
update_window_and_padding(win, AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration),
- AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration), output_access);
+ AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration),
+ output_access);
INEKernel::configure(win);
}
@@ -96,9 +98,7 @@ void NEBitwiseXorKernel::run(const Window &window, const ThreadInfo &info)
Iterator input2(_input2, window);
Iterator output(_output, window);
- execute_window_loop(window, [&](const Coordinates &)
- {
- bitwise_xor_U8_U8_U8(input1.ptr(), input2.ptr(), output.ptr());
- },
- input1, input2, output);
+ execute_window_loop(
+ window, [&](const Coordinates &) { bitwise_xor_U8_U8_U8(input1.ptr(), input2.ptr(), output.ptr()); }, input1,
+ input2, output);
}
diff --git a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp
index 69bfd56ce0..cb869838e2 100644
--- a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp
+++ b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp
@@ -27,8 +27,9 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Window.h"
-#include "src/core/CPP/Validate.h"
+
#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/cpu/kernels/boundingboxtransform/list.h"
@@ -45,7 +46,11 @@ struct BoundingBoxTransformSelectorData
};
using BoundingBoxTransformSelctorPtr = std::add_pointer<bool(const BoundingBoxTransformSelectorData &data)>::type;
-using BoundingBoxTransformUKernelPtr = std::add_pointer<void(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window)>::type;
+using BoundingBoxTransformUKernelPtr = std::add_pointer<void(const ITensor *boxes,
+ ITensor *pred_boxes,
+ const ITensor *deltas,
+ BoundingBoxTransformInfo bbinfo,
+ const Window &window)>::type;
struct BoundingBoxTransformKernel
{
@@ -54,26 +59,19 @@ struct BoundingBoxTransformKernel
BoundingBoxTransformUKernelPtr ukernel;
};
-static const BoundingBoxTransformKernel available_kernels[] =
-{
- {
- "fp32_neon_boundingboxtransform",
- [](const BoundingBoxTransformSelectorData & data) { return data.dt == DataType::F32; },
- REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_boundingboxtransform)
- },
+static const BoundingBoxTransformKernel available_kernels[] = {
+ {"fp32_neon_boundingboxtransform",
+ [](const BoundingBoxTransformSelectorData &data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_boundingboxtransform)},
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- {
- "fp16_neon_boundingboxtransform",
- [](const BoundingBoxTransformSelectorData & data) { return data.dt == DataType::F16; },
- REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_boundingboxtransform)
- },
+ {"fp16_neon_boundingboxtransform",
+ [](const BoundingBoxTransformSelectorData &data) { return data.dt == DataType::F16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_boundingboxtransform)},
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#if defined(ARM_COMPUTE_ENABLE_NEON)
- {
- "qu16_neon_boundingboxtransform",
- [](const BoundingBoxTransformSelectorData & data) { return data.dt == DataType::QASYMM16; },
- REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qu16_boundingboxtransform)
- },
+ {"qu16_neon_boundingboxtransform",
+ [](const BoundingBoxTransformSelectorData &data) { return data.dt == DataType::QASYMM16; },
+ REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qu16_boundingboxtransform)},
#endif //defined(ARM_COMPUTE_ENABLE_NEON)
};
@@ -85,9 +83,9 @@ static const BoundingBoxTransformKernel available_kernels[] =
*/
const BoundingBoxTransformKernel *get_implementation(const BoundingBoxTransformSelectorData &data)
{
- for(const auto &uk : available_kernels)
+ for (const auto &uk : available_kernels)
{
- if(uk.is_selected(data))
+ if (uk.is_selected(data))
{
return &uk;
}
@@ -95,7 +93,10 @@ const BoundingBoxTransformKernel *get_implementation(const BoundingBoxTransformS
return nullptr;
}
-Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info)
+Status validate_arguments(const ITensorInfo *boxes,
+ const ITensorInfo *pred_boxes,
+ const ITensorInfo *deltas,
+ const BoundingBoxTransformInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(boxes, pred_boxes, deltas);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(boxes);
@@ -108,7 +109,7 @@ Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxe
ARM_COMPUTE_RETURN_ERROR_ON(boxes->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON(info.scale() <= 0);
- if(boxes->data_type() == DataType::QASYMM16)
+ if (boxes->data_type() == DataType::QASYMM16)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(deltas, 1, DataType::QASYMM8);
const UniformQuantizationInfo deltas_qinfo = deltas->quantization_info().uniform();
@@ -120,12 +121,12 @@ Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxe
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(boxes, deltas);
}
- if(pred_boxes->total_size() > 0)
+ if (pred_boxes->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(pred_boxes->tensor_shape(), deltas->tensor_shape());
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(pred_boxes, deltas);
ARM_COMPUTE_RETURN_ERROR_ON(pred_boxes->num_dimensions() > 2);
- if(pred_boxes->data_type() == DataType::QASYMM16)
+ if (pred_boxes->data_type() == DataType::QASYMM16)
{
const UniformQuantizationInfo pred_qinfo = pred_boxes->quantization_info().uniform();
ARM_COMPUTE_RETURN_ERROR_ON(pred_qinfo.scale != 0.125f);
@@ -142,13 +143,19 @@ NEBoundingBoxTransformKernel::NEBoundingBoxTransformKernel()
{
}
-void NEBoundingBoxTransformKernel::configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info)
+void NEBoundingBoxTransformKernel::configure(const ITensor *boxes,
+ ITensor *pred_boxes,
+ const ITensor *deltas,
+ const BoundingBoxTransformInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(boxes, pred_boxes, deltas);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(boxes->info(), pred_boxes->info(), deltas->info(), info));
// Configure kernel window
- auto_init_if_empty(*pred_boxes->info(), deltas->info()->clone()->set_data_type(boxes->info()->data_type()).set_quantization_info(boxes->info()->quantization_info()));
+ auto_init_if_empty(*pred_boxes->info(), deltas->info()
+ ->clone()
+ ->set_data_type(boxes->info()->data_type())
+ .set_quantization_info(boxes->info()->quantization_info()));
// Set instance variables
_boxes = boxes;
@@ -164,7 +171,10 @@ void NEBoundingBoxTransformKernel::configure(const ITensor *boxes, ITensor *pred
INEKernel::configure(win);
}
-Status NEBoundingBoxTransformKernel::validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info)
+Status NEBoundingBoxTransformKernel::validate(const ITensorInfo *boxes,
+ const ITensorInfo *pred_boxes,
+ const ITensorInfo *deltas,
+ const BoundingBoxTransformInfo &info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(boxes, pred_boxes, deltas, info));
return Status{};
@@ -176,7 +186,7 @@ void NEBoundingBoxTransformKernel::run(const Window &window, const ThreadInfo &i
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- const auto *uk = get_implementation(BoundingBoxTransformSelectorData{ _boxes->info()->data_type() });
+ const auto *uk = get_implementation(BoundingBoxTransformSelectorData{_boxes->info()->data_type()});
ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
uk->ukernel(_boxes, _pred_boxes, _deltas, _bbinfo, window);
diff --git a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h
index def827836c..3915994feb 100644
--- a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h
+++ b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h
@@ -63,7 +63,8 @@ public:
* @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct.
*
*/
- void configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info);
+ void
+ configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info);
/** Static function to check if given info will lead to a valid configuration of @ref CLBoundingBoxTransform
*
@@ -77,7 +78,10 @@ public:
*
* @return a Status
*/
- static Status validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info);
+ static Status validate(const ITensorInfo *boxes,
+ const ITensorInfo *pred_boxes,
+ const ITensorInfo *deltas,
+ const BoundingBoxTransformInfo &info);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
diff --git a/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp b/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp
index 64da1f2262..3b53b7055f 100644
--- a/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp
@@ -30,6 +30,7 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+
#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -44,15 +45,19 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NCHW, DataLayout::NHWC);
- const unsigned int channels = input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL));
+ const unsigned int channels =
+ input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL));
ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups < 2, "Channel shuffling with less than 2 groups would be inefficient");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups == channels, "Channel shuffling with same number of groups as number of channels would be inefficient");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ num_groups == channels,
+ "Channel shuffling with same number of groups as number of channels would be inefficient");
ARM_COMPUTE_RETURN_ERROR_ON(num_groups > channels); // There cannot be more groups than channels
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((channels % num_groups) != 0, "The number of channels must be a multiple of the number of groups");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((channels % num_groups) != 0,
+ "The number of channels must be a multiple of the number of groups");
// Checks performed when output is configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -72,20 +77,22 @@ void channel_shuffle_nhwc(const ITensor *input, ITensor *output, unsigned int nu
Iterator in(input, window);
- execute_window_loop(window, [&](const Coordinates & id)
- {
- // Shuffle channel
- const unsigned int curr_channel = id.x();
- const unsigned int group_id = curr_channel * rK;
- const unsigned int r = group_id * K;
- const unsigned int channel_id = curr_channel - r;
-
- // Calculate output coordinates
- Coordinates out_coords = id;
- out_coords.set(Window::DimX, channel_id * num_groups + group_id);
- std::copy_n(in.ptr(), element_size, output->ptr_to_element(out_coords));
- },
- in);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ // Shuffle channel
+ const unsigned int curr_channel = id.x();
+ const unsigned int group_id = curr_channel * rK;
+ const unsigned int r = group_id * K;
+ const unsigned int channel_id = curr_channel - r;
+
+ // Calculate output coordinates
+ Coordinates out_coords = id;
+ out_coords.set(Window::DimX, channel_id * num_groups + group_id);
+ std::copy_n(in.ptr(), element_size, output->ptr_to_element(out_coords));
+ },
+ in);
}
void channel_shuffle_nchw(const ITensor *input, ITensor *output, unsigned int num_groups, const Window &window)
{
@@ -107,34 +114,35 @@ void channel_shuffle_nchw(const ITensor *input, ITensor *output, unsigned int nu
Iterator in(input, win);
- execute_window_loop(win, [&](const Coordinates & id)
- {
- // Shuffle channel
- const unsigned int curr_channel = id.z();
- const unsigned int group_id = curr_channel * rK;
- const unsigned int r = group_id * K;
- const unsigned int channel_id = curr_channel - r;
-
- // Calculate output coordinates
- Coordinates out_coords = id;
- out_coords.set(Window::DimZ, channel_id * num_groups + group_id);
- const uint8_t *input_ptr = in.ptr();
- uint8_t *output_ptr = output->ptr_to_element(out_coords);
-
- // Copy plane
- for(unsigned int y = 0; y < height; ++y)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
{
- std::copy_n(input_ptr, row_size, output_ptr);
- input_ptr += input_stride_y;
- output_ptr += output_stride_y;
- }
- },
- in);
+ // Shuffle channel
+ const unsigned int curr_channel = id.z();
+ const unsigned int group_id = curr_channel * rK;
+ const unsigned int r = group_id * K;
+ const unsigned int channel_id = curr_channel - r;
+
+ // Calculate output coordinates
+ Coordinates out_coords = id;
+ out_coords.set(Window::DimZ, channel_id * num_groups + group_id);
+ const uint8_t *input_ptr = in.ptr();
+ uint8_t *output_ptr = output->ptr_to_element(out_coords);
+
+ // Copy plane
+ for (unsigned int y = 0; y < height; ++y)
+ {
+ std::copy_n(input_ptr, row_size, output_ptr);
+ input_ptr += input_stride_y;
+ output_ptr += output_stride_y;
+ }
+ },
+ in);
}
} // namespace
-NEChannelShuffleLayerKernel::NEChannelShuffleLayerKernel()
- : _input(nullptr), _output(nullptr), _num_groups()
+NEChannelShuffleLayerKernel::NEChannelShuffleLayerKernel() : _input(nullptr), _output(nullptr), _num_groups()
{
}
@@ -158,7 +166,8 @@ void NEChannelShuffleLayerKernel::configure(const ITensor *input, ITensor *outpu
INEKernel::configure(win);
}
-Status NEChannelShuffleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups)
+Status
+NEChannelShuffleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, num_groups));
return Status{};
@@ -170,7 +179,7 @@ void NEChannelShuffleLayerKernel::run(const Window &window, const ThreadInfo &in
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
- switch(_input->info()->data_layout())
+ switch (_input->info()->data_layout())
{
case DataLayout::NHWC:
channel_shuffle_nhwc(_input, _output, _num_groups, window);
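The shuffle index math in channel_shuffle_nhwc/channel_shuffle_nchw above splits C channels into G groups of K = C / G and maps input channel c = g * K + k to output channel k * G + g (the kernel derives g by multiplying by a precomputed reciprocal rK rather than dividing). A standalone sketch, assuming plain C++ outside the library:

#include <cstdio>

int main()
{
    const unsigned int G = 2, K = 3, C = G * K; // 6 channels in 2 groups
    for (unsigned int c = 0; c < C; ++c)
    {
        const unsigned int g = c / K;     // group_id
        const unsigned int k = c - g * K; // channel_id within the group
        std::printf("in %u -> out %u\n", c, k * G + g); // 0->0, 1->2, 2->4, 3->1, 4->3, 5->5
    }
    return 0;
}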
diff --git a/src/core/NEON/kernels/NECol2ImKernel.h b/src/core/NEON/kernels/NECol2ImKernel.h
index 1976302036..bc6652fd30 100644
--- a/src/core/NEON/kernels/NECol2ImKernel.h
+++ b/src/core/NEON/kernels/NECol2ImKernel.h
@@ -24,10 +24,10 @@
#ifndef ARM_COMPUTE_NECOL2IMKERNEL_H
#define ARM_COMPUTE_NECOL2IMKERNEL_H
-#include "src/core/NEON/INEKernel.h"
-
#include "arm_compute/core/Size2D.h"
+#include "src/core/NEON/INEKernel.h"
+
namespace arm_compute
{
class ITensor;
diff --git a/src/core/NEON/kernels/NECropKernel.cpp b/src/core/NEON/kernels/NECropKernel.cpp
index 94c455305c..60271fbc74 100644
--- a/src/core/NEON/kernels/NECropKernel.cpp
+++ b/src/core/NEON/kernels/NECropKernel.cpp
@@ -26,14 +26,15 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Window.h"
#include "arm_compute/core/utils/helpers/tensor_transform.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/Window.h"
+
#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include "src/core/utils/helpers/bit_ops.h"
#include "src/cpu/kernels/crop/list.h"
@@ -47,7 +48,8 @@ struct CropSelectorData
};
using CropSelectorPtr = std::add_pointer<bool(const CropSelectorData &data)>::type;
-using CropUKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, float *, Coordinates, int32_t, int32_t, int32_t, bool, bool)>::type;
+using CropUKernelPtr = std::add_pointer<void(
+ const ITensor *, const ITensor *, float *, Coordinates, int32_t, int32_t, int32_t, bool, bool)>::type;
struct CropUKernel
{
@@ -56,48 +58,23 @@ struct CropUKernel
CropUKernelPtr ukernel;
};
-static const CropUKernel available_kernels[] =
-{
- {
- "fp16_neon_crop",
- [](const CropSelectorData & data) { return data.dt == DataType::F16; },
- REGISTER_FP16_NEON(arm_compute::cpu::fp16_in_bounds_crop_window)
- },
- {
- "f32_neon_crop",
- [](const CropSelectorData & data) { return data.dt == DataType::F32; },
- REGISTER_FP32_NEON(arm_compute::cpu::fp32_in_bounds_crop_window)
- },
- {
- "u8_neon_crop",
- [](const CropSelectorData & data) { return data.dt == DataType::U8; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::u8_in_bounds_crop_window)
- },
- {
- "u16_neon_crop",
- [](const CropSelectorData & data) { return data.dt == DataType::U16; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::u16_in_bounds_crop_window)
- },
- {
- "u32_neon_crop",
- [](const CropSelectorData & data) { return data.dt == DataType::U32; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::u32_in_bounds_crop_window)
- },
- {
- "s8_neon_crop",
- [](const CropSelectorData & data) { return data.dt == DataType::S8; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::s8_in_bounds_crop_window)
- },
- {
- "s16_neon_crop",
- [](const CropSelectorData & data) { return data.dt == DataType::S16; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::s16_in_bounds_crop_window)
- },
- {
- "s32_neon_crop",
- [](const CropSelectorData & data) { return data.dt == DataType::S32; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::s32_in_bounds_crop_window)
- },
+static const CropUKernel available_kernels[] = {
+ {"fp16_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::F16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::fp16_in_bounds_crop_window)},
+ {"f32_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::fp32_in_bounds_crop_window)},
+ {"u8_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::U8; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::u8_in_bounds_crop_window)},
+ {"u16_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::U16; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::u16_in_bounds_crop_window)},
+ {"u32_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::U32; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::u32_in_bounds_crop_window)},
+ {"s8_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::S8; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::s8_in_bounds_crop_window)},
+ {"s16_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::S16; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::s16_in_bounds_crop_window)},
+ {"s32_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::S32; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::s32_in_bounds_crop_window)},
};
/** Micro-kernel selector
@@ -108,9 +85,9 @@ static const CropUKernel available_kernels[] =
*/
const CropUKernel *get_implementation(const CropSelectorData &data)
{
- for(const auto &uk : available_kernels)
+ for (const auto &uk : available_kernels)
{
- if(uk.is_selected(data))
+ if (uk.is_selected(data))
{
return &uk;
}
@@ -119,26 +96,40 @@ const CropUKernel *get_implementation(const CropSelectorData &data)
return nullptr;
}
-inline void out_of_bounds_crop_window(const ITensor *output, float *output_ptr, float extrapolation_value,
- int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit)
+inline void out_of_bounds_crop_window(const ITensor *output,
+ float *output_ptr,
+ float extrapolation_value,
+ int32_t window_step_x,
+ int32_t output_width_start,
+ int32_t output_width_limit)
{
- auto in = wrapper::vdup_n(extrapolation_value, wrapper::traits::vector_128_tag());
- int32_t x = 0;
- int32_t limit = (output_width_limit - output_width_start) * static_cast<int32_t>(output->info()->dimension(0));
- float *output_start_ptr = output_ptr + output_width_start * output->info()->dimension(0);
- for(; x <= limit - window_step_x; x += window_step_x)
+ auto in = wrapper::vdup_n(extrapolation_value, wrapper::traits::vector_128_tag());
+ int32_t x = 0;
+ int32_t limit = (output_width_limit - output_width_start) * static_cast<int32_t>(output->info()->dimension(0));
+ float *output_start_ptr = output_ptr + output_width_start * output->info()->dimension(0);
+ for (; x <= limit - window_step_x; x += window_step_x)
{
wrapper::vstore(output_start_ptr + x, in);
}
- for(; x < limit; ++x)
+ for (; x < limit; ++x)
{
*(output_start_ptr + x) = extrapolation_value;
}
}
-inline void execute_window(const ITensor *input, const ITensor *output, Coordinates input_offset, float extrapolation_value,
- const std::array<uint32_t, 2> &rows_out_of_bounds, const std::array<uint32_t, 2> &cols_out_of_bounds, NECropKernel::InBoundsCropFunction *in_bounds_crop_function,
- bool is_height_flipped, bool has_cols_in_bounds, bool has_cols_out_of_bounds_before, bool has_cols_out_of_bounds_after, bool input_has_single_channel, bool is_width_flipped)
+inline void execute_window(const ITensor *input,
+ const ITensor *output,
+ Coordinates input_offset,
+ float extrapolation_value,
+ const std::array<uint32_t, 2> &rows_out_of_bounds,
+ const std::array<uint32_t, 2> &cols_out_of_bounds,
+ NECropKernel::InBoundsCropFunction *in_bounds_crop_function,
+ bool is_height_flipped,
+ bool has_cols_in_bounds,
+ bool has_cols_out_of_bounds_before,
+ bool has_cols_out_of_bounds_after,
+ bool input_has_single_channel,
+ bool is_width_flipped)
{
// Output is always float.
const int window_step_x = 16 / sizeof(float);
@@ -159,45 +150,66 @@ inline void execute_window(const ITensor *input, const ITensor *output, Coordina
// |------------------------------|
// Fill all output rows that have no elements that are within the input bounds with the extrapolation value.
// First for the rows before the in bounds rows.
- out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0, rows_out_of_bounds[0] * output->info()->dimension(1));
+ out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0,
+ rows_out_of_bounds[0] * output->info()->dimension(1));
output_ptr += rows_out_of_bounds[0] * output->info()->dimension(1) * output->info()->dimension(0);
// Iterate through each row that has any elements within the input bounds.
- for(uint32_t row = rows_out_of_bounds[0]; static_cast<int32_t>(row) < static_cast<int32_t>(output->info()->dimension(2) - rows_out_of_bounds[1]);
- ++row, is_height_flipped ? --input_offset[2] : ++input_offset[2])
+ for (uint32_t row = rows_out_of_bounds[0];
+ static_cast<int32_t>(row) < static_cast<int32_t>(output->info()->dimension(2) - rows_out_of_bounds[1]);
+ ++row, is_height_flipped ? --input_offset[2] : ++input_offset[2])
{
// Fill all elements in the row that are out of bounds with the extrapolation value.
// First for the elements before the in bounds elements.
- if(has_cols_out_of_bounds_before)
+ if (has_cols_out_of_bounds_before)
{
out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0, cols_out_of_bounds[0]);
}
// Copy all elements within the input bounds from the input tensor.
- if(has_cols_in_bounds)
+ if (has_cols_in_bounds)
{
(*in_bounds_crop_function)(input, output, output_ptr, input_offset, window_step_x, cols_out_of_bounds[0],
- output->info()->dimension(1) - cols_out_of_bounds[1], input_has_single_channel, is_width_flipped);
+ output->info()->dimension(1) - cols_out_of_bounds[1], input_has_single_channel,
+ is_width_flipped);
}
// Fill all elements after the in bounds elements with the extrapolation value.
- if(has_cols_out_of_bounds_after)
+ if (has_cols_out_of_bounds_after)
{
- out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, output->info()->dimension(1) - cols_out_of_bounds[1], output->info()->dimension(1));
+ out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x,
+ output->info()->dimension(1) - cols_out_of_bounds[1],
+ output->info()->dimension(1));
}
output_ptr += output->info()->dimension(1) * output->info()->dimension(0);
}
// Fill all rows after the in bounds elements with the extrapolation value.
- out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0, rows_out_of_bounds[1] * output->info()->dimension(1));
+ out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0,
+ rows_out_of_bounds[1] * output->info()->dimension(1));
}
} // namespace
NECropKernel::NECropKernel()
- : _input(nullptr), _crop_boxes(nullptr), _box_ind(nullptr), _output(nullptr), _start(), _end(), _crop_box_ind(0), _extrapolation_value(0), _rows_out_of_bounds(), _cols_out_of_bounds()
+ : _input(nullptr),
+ _crop_boxes(nullptr),
+ _box_ind(nullptr),
+ _output(nullptr),
+ _start(),
+ _end(),
+ _crop_box_ind(0),
+ _extrapolation_value(0),
+ _rows_out_of_bounds(),
+ _cols_out_of_bounds()
{
}
-void NECropKernel::configure(const ITensor *input, const ITensor *crop_boxes, const ITensor *box_ind, ITensor *output, uint32_t crop_box_ind, float extrapolation_value)
+void NECropKernel::configure(const ITensor *input,
+ const ITensor *crop_boxes,
+ const ITensor *box_ind,
+ ITensor *output,
+ uint32_t crop_box_ind,
+ float extrapolation_value)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), crop_boxes->info(), box_ind->info(), output->info(), crop_box_ind, extrapolation_value));
+ ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), crop_boxes->info(), box_ind->info(), output->info(),
+ crop_box_ind, extrapolation_value));
_input = input;
_crop_boxes = crop_boxes;
@@ -207,21 +219,27 @@ void NECropKernel::configure(const ITensor *input, const ITensor *crop_boxes, co
_extrapolation_value = extrapolation_value;
}
-Status NECropKernel::validate(const ITensorInfo *input, const ITensorInfo *crop_boxes, const ITensorInfo *box_ind, const ITensorInfo *output, uint32_t crop_box_ind, float extrapolation_value)
+Status NECropKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *crop_boxes,
+ const ITensorInfo *box_ind,
+ const ITensorInfo *output,
+ uint32_t crop_box_ind,
+ float extrapolation_value)
{
ARM_COMPUTE_UNUSED(extrapolation_value);
- const auto *uk = get_implementation(CropSelectorData{ input->data_type() });
+ const auto *uk = get_implementation(CropSelectorData{input->data_type()});
ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::U16, DataType::S16, DataType::F16, DataType::U32, DataType::S32, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::U16, DataType::S16,
+ DataType::F16, DataType::U32, DataType::S32, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC);
ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().num_dimensions() > 4);
ARM_COMPUTE_RETURN_ERROR_ON(crop_boxes->tensor_shape()[0] != 4);
ARM_COMPUTE_RETURN_ERROR_ON(crop_boxes->tensor_shape()[1] != box_ind->tensor_shape()[0]);
ARM_COMPUTE_RETURN_ERROR_ON(crop_boxes->tensor_shape()[1] <= crop_box_ind);
ARM_COMPUTE_RETURN_ERROR_ON(box_ind->tensor_shape()[0] <= crop_box_ind);
- if(output->total_size() > 0)
+ if (output->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
@@ -242,48 +260,53 @@ void NECropKernel::configure_output_shape()
    // The normalized coordinates are scaled to retrieve the floating point image coordinates which are rounded to integers.
_start = Coordinates(std::floor(x0 * (_input->info()->tensor_shape()[1] - 1) + 0.5f),
std::floor(y0 * (_input->info()->tensor_shape()[2] - 1) + 0.5f));
- _end = Coordinates(std::floor(x1 * (_input->info()->tensor_shape()[1] - 1) + 0.5f),
- std::floor(y1 * (_input->info()->tensor_shape()[2] - 1) + 0.5f));
- const TensorShape out_shape(_input->info()->tensor_shape()[0], abs(_end[0] - _start[0]) + 1, abs(_end[1] - _start[1]) + 1);
+ _end = Coordinates(std::floor(x1 * (_input->info()->tensor_shape()[1] - 1) + 0.5f),
+ std::floor(y1 * (_input->info()->tensor_shape()[2] - 1) + 0.5f));
+ const TensorShape out_shape(_input->info()->tensor_shape()[0], abs(_end[0] - _start[0]) + 1,
+ abs(_end[1] - _start[1]) + 1);
_output->info()->set_tensor_shape(out_shape);
bool is_width_flipped = _end[0] < _start[0];
bool is_height_flipped = _end[1] < _start[1];
- if(is_height_flipped)
+ if (is_height_flipped)
{
- _rows_out_of_bounds[0] = _start[1] >= static_cast<int32_t>(_input->info()->dimension(2)) ? std::min(static_cast<uint32_t>(_start[1] - _input->info()->dimension(2) + 1),
- static_cast<uint32_t>(_output->info()->dimension(2))) :
- 0;
+ _rows_out_of_bounds[0] = _start[1] >= static_cast<int32_t>(_input->info()->dimension(2))
+ ? std::min(static_cast<uint32_t>(_start[1] - _input->info()->dimension(2) + 1),
+ static_cast<uint32_t>(_output->info()->dimension(2)))
+ : 0;
_rows_out_of_bounds[1] = _end[1] < 0 ? std::min(static_cast<uint32_t>(-_end[1]),
- static_cast<uint32_t>(_output->info()->dimension(2))) :
- 0;
+ static_cast<uint32_t>(_output->info()->dimension(2)))
+ : 0;
}
else
{
_rows_out_of_bounds[0] = _start[1] < 0 ? std::min(static_cast<uint32_t>(-_start[1]),
- static_cast<uint32_t>(_output->info()->dimension(2))) :
- 0;
- _rows_out_of_bounds[1] = _end[1] >= static_cast<int32_t>(_input->info()->dimension(2)) ? std::min(static_cast<uint32_t>(_end[1] - _input->info()->dimension(2) + 1),
- static_cast<uint32_t>(_output->info()->dimension(2))) :
- 0;
+ static_cast<uint32_t>(_output->info()->dimension(2)))
+ : 0;
+ _rows_out_of_bounds[1] = _end[1] >= static_cast<int32_t>(_input->info()->dimension(2))
+ ? std::min(static_cast<uint32_t>(_end[1] - _input->info()->dimension(2) + 1),
+ static_cast<uint32_t>(_output->info()->dimension(2)))
+ : 0;
}
- if(is_width_flipped)
+ if (is_width_flipped)
{
- _cols_out_of_bounds[0] = _start[0] >= static_cast<int32_t>(_input->info()->dimension(1)) ? std::min(static_cast<uint32_t>(_start[0] - _input->info()->dimension(1) + 1),
- static_cast<uint32_t>(_output->info()->dimension(1))) :
- 0;
+ _cols_out_of_bounds[0] = _start[0] >= static_cast<int32_t>(_input->info()->dimension(1))
+ ? std::min(static_cast<uint32_t>(_start[0] - _input->info()->dimension(1) + 1),
+ static_cast<uint32_t>(_output->info()->dimension(1)))
+ : 0;
_cols_out_of_bounds[1] = _end[0] < 0 ? std::min(static_cast<uint32_t>(-_end[0]),
- static_cast<uint32_t>(_output->info()->dimension(1))) :
- 0;
+ static_cast<uint32_t>(_output->info()->dimension(1)))
+ : 0;
}
else
{
_cols_out_of_bounds[0] = _start[0] < 0 ? std::min(static_cast<uint32_t>(-_start[0]),
- static_cast<uint32_t>(_output->info()->dimension(1))) :
- 0;
- _cols_out_of_bounds[1] = _end[0] >= static_cast<int32_t>(_input->info()->dimension(1)) ? std::min(static_cast<uint32_t>(_end[0] - _input->info()->dimension(1) + 1),
- static_cast<uint32_t>(_output->info()->dimension(1))) :
- 0;
+ static_cast<uint32_t>(_output->info()->dimension(1)))
+ : 0;
+ _cols_out_of_bounds[1] = _end[0] >= static_cast<int32_t>(_input->info()->dimension(1))
+ ? std::min(static_cast<uint32_t>(_end[0] - _input->info()->dimension(1) + 1),
+ static_cast<uint32_t>(_output->info()->dimension(1)))
+ : 0;
}
INEKernel::configure(calculate_max_window(*_output->info()));
@@ -298,13 +321,18 @@ void NECropKernel::run(const Window &window, const ThreadInfo &info)
ARM_COMPUTE_ERROR_ON(_input->info()->has_padding());
ARM_COMPUTE_ERROR_ON(_output->info()->has_padding());
- const auto *uk = get_implementation(CropSelectorData{ _input->info()->data_type() });
+ const auto *uk = get_implementation(CropSelectorData{_input->info()->data_type()});
uint32_t batch_index = *(reinterpret_cast<int32_t *>(_box_ind->ptr_to_element(Coordinates(_crop_box_ind))));
- Coordinates input_offset(0, _end[0] < _start[0] ? _start[0] - _cols_out_of_bounds[0] : _start[0] + _cols_out_of_bounds[0],
- _end[1] < _start[1] ? _start[1] - _rows_out_of_bounds[0] : _start[1] + _rows_out_of_bounds[0], batch_index);
- execute_window(_input, _output, input_offset, _extrapolation_value, _rows_out_of_bounds, _cols_out_of_bounds, uk->ukernel, _end[1] < _start[1],
- _cols_out_of_bounds[0] + _cols_out_of_bounds[1] < _output->info()->dimension(1), _cols_out_of_bounds[0] > 0, _cols_out_of_bounds[1] > 0,
+ Coordinates input_offset(
+ 0, _end[0] < _start[0] ? _start[0] - _cols_out_of_bounds[0] : _start[0] + _cols_out_of_bounds[0],
+ _end[1] < _start[1] ? _start[1] - _rows_out_of_bounds[0] : _start[1] + _rows_out_of_bounds[0], batch_index);
+    execute_window(_input, _output, input_offset, _extrapolation_value, _rows_out_of_bounds, _cols_out_of_bounds,
+                   uk->ukernel, _end[1] < _start[1],
+                   _cols_out_of_bounds[0] + _cols_out_of_bounds[1] < _output->info()->dimension(1),
+                   _cols_out_of_bounds[0] > 0, _cols_out_of_bounds[1] > 0,
+                   _start[0] <= _end[0], _end[0] < _start[0]);
}
} // namespace arm_compute
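The bounds bookkeeping in configure_output_shape() reduces to one pattern per edge: count how many output rows (or columns) fall before the first, or after the last, valid input row, clamped to the output size. A hedged standalone illustration with hypothetical values for the non-flipped top edge:

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main()
{
    const int32_t start_y = -2; // crop window starts two rows above the input
    const int32_t end_y   = 4;  // and ends inside it (not flipped)
    const uint32_t out_height = static_cast<uint32_t>(end_y - start_y) + 1; // 7 rows
    const uint32_t lead = start_y < 0 ? std::min(static_cast<uint32_t>(-start_y), out_height) : 0;
    std::printf("%u of %u output rows are pure extrapolation value\n", lead, out_height);
    return 0;
}

execute_window() then fills those leading rows (plus the matching trailing rows and out-of-bounds columns) with the extrapolation value and hands only the in-bounds span to the selected micro-kernel.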
diff --git a/src/core/NEON/kernels/NECropKernel.h b/src/core/NEON/kernels/NECropKernel.h
index 6c989c1d2c..da4a1b26e5 100644
--- a/src/core/NEON/kernels/NECropKernel.h
+++ b/src/core/NEON/kernels/NECropKernel.h
@@ -25,7 +25,7 @@
#define ARM_COMPUTE_NEON_CROP_KERNEL_H
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Types.h"
+
#include "src/core/NEON/INEKernel.h"
namespace arm_compute
@@ -67,7 +67,12 @@ public:
* @param[in] crop_box_ind Index of the crop box to be used from @p crop_boxes. Default is 0.
* @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0.
*/
- void configure(const ITensor *input, const ITensor *crop_boxes, const ITensor *box_ind, ITensor *output, uint32_t crop_box_ind = 0, float extrapolation_value = 0);
+ void configure(const ITensor *input,
+ const ITensor *crop_boxes,
+ const ITensor *box_ind,
+ ITensor *output,
+ uint32_t crop_box_ind = 0,
+ float extrapolation_value = 0);
/** Static function to check if given info will lead to a valid configuration of @ref NECropKernel
*
@@ -82,7 +87,12 @@ public:
* @param[in] crop_box_ind Index of the crop box to be used from @p crop_boxes. Default is 0.
* @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0.
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *crop_boxes, const ITensorInfo *box_ind, const ITensorInfo *output, uint32_t crop_box_ind = 0, float extrapolation_value = 0);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *crop_boxes,
+ const ITensorInfo *box_ind,
+ const ITensorInfo *output,
+ uint32_t crop_box_ind = 0,
+ float extrapolation_value = 0);
/** Configure output tensor's shape as this can only be determined at runtime. */
void configure_output_shape();
@@ -91,7 +101,8 @@ public:
void run(const Window &window, const ThreadInfo &info) override;
/** Function to use for in bounds crop for the particular tensor types passed to configure() */
- using InBoundsCropFunction = void(const ITensor *, const ITensor *, float *, Coordinates, int32_t, int32_t, int32_t, bool, bool);
+ using InBoundsCropFunction =
+ void(const ITensor *, const ITensor *, float *, Coordinates, int32_t, int32_t, int32_t, bool, bool);
private:
const ITensor *_input;
diff --git a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp
index 6dcc85ec2e..de0079ee60 100644
--- a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp
@@ -26,11 +26,12 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include <arm_neon.h>
#include <cstdint>
@@ -52,12 +53,14 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channel] % (block_shape * block_shape) != 0);
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] != (block_shape * input->tensor_shape()[idx_width]));
- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] != (block_shape * input->tensor_shape()[idx_height]));
+ ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] !=
+ (block_shape * input->tensor_shape()[idx_width]));
+ ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] !=
+ (block_shape * input->tensor_shape()[idx_height]));
ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
@@ -74,7 +77,8 @@ NEDepthToSpaceLayerKernel::NEDepthToSpaceLayerKernel()
void NEDepthToSpaceLayerKernel::configure(const ITensor *input, ITensor *output, int32_t block_shape)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- TensorShape output_shape = compute_depth_to_space_shape(input->info()->tensor_shape(), input->info()->data_layout(), block_shape);
+ TensorShape output_shape =
+ compute_depth_to_space_shape(input->info()->tensor_shape(), input->info()->data_layout(), block_shape);
    // Output auto initialization if not yet initialized
auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
@@ -117,26 +121,27 @@ void NEDepthToSpaceLayerKernel::run(const Window &window, const ThreadInfo &info
slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
// Main loop for NCHW and NHWC
- if(_data_layout == DataLayout::NCHW)
+ if (_data_layout == DataLayout::NCHW)
{
Window slice_in = window.first_slice_window_2D();
do
{
Iterator in(_input, slice_in);
- execute_window_loop(slice_in, [&](const Coordinates & id)
- {
- const int x = id.x();
- const int y = id.y();
-
- const int z = id.z() % r;
- const int out_x = x * _block_shape + (id.z() / r) % _block_shape;
- const int out_y = y * _block_shape + (id.z() / r) / _block_shape;
- Coordinates output_coords{ out_x, out_y, z, id[3] };
- memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size);
- },
- in);
- }
- while(window.slide_window_slice_2D(slice_in));
+ execute_window_loop(
+ slice_in,
+ [&](const Coordinates &id)
+ {
+ const int x = id.x();
+ const int y = id.y();
+
+ const int z = id.z() % r;
+ const int out_x = x * _block_shape + (id.z() / r) % _block_shape;
+ const int out_y = y * _block_shape + (id.z() / r) / _block_shape;
+ Coordinates output_coords{out_x, out_y, z, id[3]};
+ memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size);
+ },
+ in);
+ } while (window.slide_window_slice_2D(slice_in));
}
else
{
@@ -144,20 +149,21 @@ void NEDepthToSpaceLayerKernel::run(const Window &window, const ThreadInfo &info
do
{
Iterator in(_input, slice_in);
- execute_window_loop(slice_in, [&](const Coordinates & id)
- {
- const int x = id.y();
- const int y = id.z();
-
- const int z = id.x() % r;
- const int out_x = x * _block_shape + (id.x() / r) % _block_shape;
- const int out_y = y * _block_shape + (id.x() / r) / _block_shape;
- Coordinates output_coords{ z, out_x, out_y, id[3] };
- memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size);
- },
- in);
- }
- while(window.slide_window_slice_3D(slice_in));
+ execute_window_loop(
+ slice_in,
+ [&](const Coordinates &id)
+ {
+ const int x = id.y();
+ const int y = id.z();
+
+ const int z = id.x() % r;
+ const int out_x = x * _block_shape + (id.x() / r) % _block_shape;
+ const int out_y = y * _block_shape + (id.x() / r) / _block_shape;
+ Coordinates output_coords{z, out_x, out_y, id[3]};
+ memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size);
+ },
+ in);
+ } while (window.slide_window_slice_3D(slice_in));
}
}
} // namespace arm_compute
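The coordinate mapping in both branches is the usual depth-to-space rearrangement: with r output channels (presumably r = C_in / block_shape^2, computed earlier in the kernel and not shown in this hunk), input channel z contributes to output channel z % r at a sub-pixel offset derived from z / r. A minimal sketch under that assumption:

#include <cstdio>

int main()
{
    const int block = 2, r = 1; // C_in = 4, C_out = 1
    for (int z_in = 0; z_in < block * block * r; ++z_in)
    {
        const int z_out = z_in % r;        // output channel
        const int bx = (z_in / r) % block; // sub-pixel column offset
        const int by = (z_in / r) / block; // sub-pixel row offset
        std::printf("z_in %d -> z_out %d, offset (%d, %d)\n", z_in, z_out, bx, by);
    }
    return 0;
}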
diff --git a/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp b/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp
index 261437f07d..a5969cd497 100644
--- a/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp
+++ b/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp
@@ -28,6 +28,7 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -37,16 +38,19 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *idx,
+ const FFTDigitReverseKernelInfo &config)
{
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() > 2);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(idx, 1, DataType::U32);
- ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({0, 1}).count(config.axis) == 0);
ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[config.axis] != idx->tensor_shape().x());
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 2);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
@@ -56,7 +60,10 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *idx, const FFTDigitReverseKernelInfo &config)
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input,
+ ITensorInfo *output,
+ ITensorInfo *idx,
+ const FFTDigitReverseKernelInfo &config)
{
ARM_COMPUTE_UNUSED(idx, config);
@@ -68,12 +75,14 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
}
} // namespace
-NEFFTDigitReverseKernel::NEFFTDigitReverseKernel()
- : _func(nullptr), _input(nullptr), _output(nullptr), _idx(nullptr)
+NEFFTDigitReverseKernel::NEFFTDigitReverseKernel() : _func(nullptr), _input(nullptr), _output(nullptr), _idx(nullptr)
{
}
-void NEFFTDigitReverseKernel::configure(const ITensor *input, ITensor *output, const ITensor *idx, const FFTDigitReverseKernelInfo &config)
+void NEFFTDigitReverseKernel::configure(const ITensor *input,
+ ITensor *output,
+ const ITensor *idx,
+ const FFTDigitReverseKernelInfo &config)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, idx);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), idx->info(), config));
@@ -91,11 +100,11 @@ void NEFFTDigitReverseKernel::configure(const ITensor *input, ITensor *output, c
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
INEKernel::configure(win_config.second);
- if(axis == 0)
+ if (axis == 0)
{
- if(is_input_complex)
+ if (is_input_complex)
{
- if(is_conj)
+ if (is_conj)
{
_func = &NEFFTDigitReverseKernel::digit_reverse_kernel_axis_0<true, true>;
}
@@ -109,11 +118,11 @@ void NEFFTDigitReverseKernel::configure(const ITensor *input, ITensor *output, c
_func = &NEFFTDigitReverseKernel::digit_reverse_kernel_axis_0<false, false>;
}
}
- else if(axis == 1)
+ else if (axis == 1)
{
- if(is_input_complex)
+ if (is_input_complex)
{
- if(is_conj)
+ if (is_conj)
{
_func = &NEFFTDigitReverseKernel::digit_reverse_kernel_axis_1<true, true>;
}
@@ -133,10 +142,14 @@ void NEFFTDigitReverseKernel::configure(const ITensor *input, ITensor *output, c
}
}
-Status NEFFTDigitReverseKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config)
+Status NEFFTDigitReverseKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *idx,
+ const FFTDigitReverseKernelInfo &config)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, idx, config));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), idx->clone().get(), config).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(input->clone().get(), output->clone().get(), idx->clone().get(), config).first);
return Status{};
}
@@ -159,38 +172,40 @@ void NEFFTDigitReverseKernel::digit_reverse_kernel_axis_0(const Window &window)
std::vector<float> buffer_row_out(2 * N);
std::vector<float> buffer_row_in(2 * N);
- execute_window_loop(slice, [&](const Coordinates &)
- {
- if(is_input_complex)
+ execute_window_loop(
+ slice,
+ [&](const Coordinates &)
{
- // Load
- memcpy(buffer_row_in.data(), reinterpret_cast<float *>(in.ptr()), 2 * N * sizeof(float));
-
- // Shuffle
- for(size_t x = 0; x < 2 * N; x += 2)
+ if (is_input_complex)
{
- size_t idx = buffer_idx[x / 2];
- buffer_row_out[x] = buffer_row_in[2 * idx];
- buffer_row_out[x + 1] = (is_conj ? -buffer_row_in[2 * idx + 1] : buffer_row_in[2 * idx + 1]);
- }
- }
- else
- {
- // Load
- memcpy(buffer_row_in.data(), reinterpret_cast<float *>(in.ptr()), N * sizeof(float));
+ // Load
+ memcpy(buffer_row_in.data(), reinterpret_cast<float *>(in.ptr()), 2 * N * sizeof(float));
- // Shuffle
- for(size_t x = 0; x < N; ++x)
+ // Shuffle
+ for (size_t x = 0; x < 2 * N; x += 2)
+ {
+ size_t idx = buffer_idx[x / 2];
+ buffer_row_out[x] = buffer_row_in[2 * idx];
+ buffer_row_out[x + 1] = (is_conj ? -buffer_row_in[2 * idx + 1] : buffer_row_in[2 * idx + 1]);
+ }
+ }
+ else
{
- size_t idx = buffer_idx[x];
- buffer_row_out[2 * x] = buffer_row_in[idx];
+ // Load
+ memcpy(buffer_row_in.data(), reinterpret_cast<float *>(in.ptr()), N * sizeof(float));
+
+ // Shuffle
+ for (size_t x = 0; x < N; ++x)
+ {
+ size_t idx = buffer_idx[x];
+ buffer_row_out[2 * x] = buffer_row_in[idx];
+ }
}
- }
- // Copy back
- memcpy(reinterpret_cast<float *>(out.ptr()), buffer_row_out.data(), 2 * N * sizeof(float));
- },
- in, out);
+ // Copy back
+ memcpy(reinterpret_cast<float *>(out.ptr()), buffer_row_out.data(), 2 * N * sizeof(float));
+ },
+ in, out);
}
template <bool is_input_complex, bool is_conj>
@@ -215,39 +230,41 @@ void NEFFTDigitReverseKernel::digit_reverse_kernel_axis_1(const Window &window)
const size_t stride_z = _input->info()->strides_in_bytes()[2];
const size_t stride_w = _input->info()->strides_in_bytes()[3];
- execute_window_loop(slice, [&](const Coordinates & id)
- {
- auto *out_ptr = reinterpret_cast<float *>(out.ptr());
- auto *in_ptr = reinterpret_cast<float *>(_input->buffer() + id.z() * stride_z + id[3] * stride_w);
- const size_t y_shuffled = buffer_idx[id.y()];
-
- if(is_input_complex)
+ execute_window_loop(
+ slice,
+ [&](const Coordinates &id)
{
- // Shuffle the entire row into the output
- memcpy(out_ptr, in_ptr + 2 * Nx * y_shuffled, 2 * Nx * sizeof(float));
+ auto *out_ptr = reinterpret_cast<float *>(out.ptr());
+ auto *in_ptr = reinterpret_cast<float *>(_input->buffer() + id.z() * stride_z + id[3] * stride_w);
+ const size_t y_shuffled = buffer_idx[id.y()];
- // Conjugate if necessary
- if(is_conj)
+ if (is_input_complex)
{
- for(size_t x = 0; x < 2 * Nx; x += 2)
+ // Shuffle the entire row into the output
+ memcpy(out_ptr, in_ptr + 2 * Nx * y_shuffled, 2 * Nx * sizeof(float));
+
+ // Conjugate if necessary
+ if (is_conj)
{
- out_ptr[x + 1] = -out_ptr[x + 1];
+ for (size_t x = 0; x < 2 * Nx; x += 2)
+ {
+ out_ptr[x + 1] = -out_ptr[x + 1];
+ }
}
}
- }
- else
- {
- // Shuffle the entire row into the buffer
- memcpy(buffer_row.data(), in_ptr + Nx * y_shuffled, Nx * sizeof(float));
-
- // Copy the buffer to the output, with a zero imaginary part
- for(size_t x = 0; x < 2 * Nx; x += 2)
+ else
{
- out_ptr[x] = buffer_row[x / 2];
+ // Shuffle the entire row into the buffer
+ memcpy(buffer_row.data(), in_ptr + Nx * y_shuffled, Nx * sizeof(float));
+
+ // Copy the buffer to the output, with a zero imaginary part
+ for (size_t x = 0; x < 2 * Nx; x += 2)
+ {
+ out_ptr[x] = buffer_row[x / 2];
+ }
}
- }
- },
- out);
+ },
+ out);
}
void NEFFTDigitReverseKernel::run(const Window &window, const ThreadInfo &info)
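Stripped of the iterator machinery, the axis-0 shuffle above is a gather through the digit-reversed index buffer over interleaved re/im pairs, negating the imaginary part when the conjugate flag is set. A hedged scalar model with made-up data:

#include <cstdio>
#include <vector>

int main()
{
    const std::vector<unsigned> idx = {0, 2, 1, 3};             // digit-reversed order
    const std::vector<float> in = {1, 10, 2, 20, 3, 30, 4, 40}; // (re, im) pairs
    const bool is_conj = true;
    std::vector<float> out(in.size());
    for (size_t x = 0; x < in.size(); x += 2)
    {
        const size_t i = idx[x / 2];
        out[x]     = in[2 * i];
        out[x + 1] = is_conj ? -in[2 * i + 1] : in[2 * i + 1];
    }
    for (size_t x = 0; x < out.size(); x += 2)
        std::printf("(%g, %g) ", out[x], out[x + 1]); // (1,-10) (3,-30) (2,-20) (4,-40)
    std::printf("\n");
    return 0;
}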
diff --git a/src/core/NEON/kernels/NEFFTDigitReverseKernel.h b/src/core/NEON/kernels/NEFFTDigitReverseKernel.h
index f436c364b2..ecf85ebc98 100644
--- a/src/core/NEON/kernels/NEFFTDigitReverseKernel.h
+++ b/src/core/NEON/kernels/NEFFTDigitReverseKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NEFFTDIGITREVERSEKERNEL_H
#include "arm_compute/core/KernelDescriptors.h"
+
#include "src/core/NEON/INEKernel.h"
namespace arm_compute
@@ -70,7 +71,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *idx,
+ const FFTDigitReverseKernelInfo &config);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
diff --git a/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp b/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp
index 44c841f626..4b58a7b9ac 100644
--- a/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp
+++ b/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp
@@ -28,10 +28,11 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Window.h"
-#include "src/core/NEON/wrapper/traits.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/traits.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include "support/ToolchainSupport.h"
#include <arm_neon.h>
@@ -70,7 +71,7 @@ float32x2_t c_mul_neon(float32x2_t a, float32x2_t b)
{
using ExactTagType = typename wrapper::traits::neon_vector<float, 2>::tag_type;
- const float32x2_t mask = { -1.0, 1.0 };
+ const float32x2_t mask = {-1.0, 1.0};
const float32x2_t tmp0 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{});
const float32x2_t tmp1 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{});
@@ -88,7 +89,7 @@ float32x2_t c_mul_neon_img(float32x2_t a, float img_constant)
const float a_r = wrapper::vgetlane(a, 0);
const float a_i = wrapper::vgetlane(a, 1);
- const auto out = wrapper::vmul(float32x2_t{ -a_i, a_r }, float32x2_t{ img_constant, img_constant });
+ const auto out = wrapper::vmul(float32x2_t{-a_i, a_r}, float32x2_t{img_constant, img_constant});
return out;
}
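In scalar terms (a reference model, not library code), c_mul_neon is the full complex product and c_mul_neon_img is the product with a purely imaginary constant j*k, i.e. (a_r + j*a_i) * j*k = (-a_i*k) + j*(a_r*k):

#include <cstdio>

struct C { float r, i; };

C c_mul(C a, C b)         { return {a.r * b.r - a.i * b.i, a.r * b.i + a.i * b.r}; }
C c_mul_img(C a, float k) { return {-a.i * k, a.r * k}; }

int main()
{
    const C a{1.f, 2.f}, b{3.f, -1.f};
    const C p = c_mul(a, b);        // (1+2j)(3-1j) = 5+5j
    const C q = c_mul_img(a, 0.5f); // (1+2j)(0.5j) = -1+0.5j
    std::printf("(%g, %g) (%g, %g)\n", p.r, p.i, q.r, q.i);
    return 0;
}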
@@ -100,7 +101,8 @@ float32x2_t reduce_sum_5(float32x2_t a, float32x2_t b, float32x2_t c, float32x2_
return wrapper::vadd(t2, e);
}
-float32x2_t reduce_sum_7(float32x2_t x1, float32x2_t x2, float32x2_t x3, float32x2_t x4, float32x2_t x5, float32x2_t x6, float32x2_t x7)
+float32x2_t reduce_sum_7(
+ float32x2_t x1, float32x2_t x2, float32x2_t x3, float32x2_t x4, float32x2_t x5, float32x2_t x6, float32x2_t x7)
{
const auto t0 = wrapper::vadd(x1, x2);
const auto t1 = wrapper::vadd(x3, x4);
@@ -111,7 +113,14 @@ float32x2_t reduce_sum_7(float32x2_t x1, float32x2_t x2, float32x2_t x3, float32
return wrapper::vadd(t00, t01);
}
-float32x2_t reduce_sum_8(float32x2_t x1, float32x2_t x2, float32x2_t x3, float32x2_t x4, float32x2_t x5, float32x2_t x6, float32x2_t x7, float32x2_t x8)
+float32x2_t reduce_sum_8(float32x2_t x1,
+ float32x2_t x2,
+ float32x2_t x3,
+ float32x2_t x4,
+ float32x2_t x5,
+ float32x2_t x6,
+ float32x2_t x7,
+ float32x2_t x8)
{
const auto t0 = wrapper::vadd(x1, x2);
const auto t1 = wrapper::vadd(x3, x4);
@@ -141,15 +150,21 @@ void fft_3(float32x2_t &x, float32x2_t &y, float32x2_t &z, const float32x2_t &w,
x = wrapper::vadd(a, b);
x = wrapper::vadd(x, c);
- const auto v1 = wrapper::vmul(float32x2_t{ 0.5f, 0.5 }, wrapper::vadd(b, c));
- const auto v2 = c_mul_neon(float32x2_t{ 0.f, -kSqrt3Div2 }, wrapper::vsub(b, c));
+ const auto v1 = wrapper::vmul(float32x2_t{0.5f, 0.5}, wrapper::vadd(b, c));
+ const auto v2 = c_mul_neon(float32x2_t{0.f, -kSqrt3Div2}, wrapper::vsub(b, c));
y = z = wrapper::vsub(a, v1);
y = wrapper::vadd(y, v2);
z = wrapper::vsub(z, v2);
}
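fft_3 above is the 3-point DFT of the already-twiddled inputs: X0 = a + b + c, X1 = a - (b + c)/2 - j*(sqrt(3)/2)*(b - c), and X2 the same with + j. A scalar check of that identity with std::complex (an independent model, not the kernel's wrapper types):

#include <complex>
#include <cstdio>

int main()
{
    using Cx = std::complex<float>;
    const float kSqrt3Div2 = 0.866025403784f;
    const Cx a{1.f, 0.f}, b{0.f, 1.f}, c{2.f, -1.f}; // inputs after twiddling
    const Cx s  = a - 0.5f * (b + c);
    const Cx v2 = Cx{0.f, -kSqrt3Div2} * (b - c);
    const Cx x0 = a + b + c, x1 = s + v2, x2 = s - v2;
    std::printf("X0=(%g,%g) X1=(%g,%g) X2=(%g,%g)\n",
                x0.real(), x0.imag(), x1.real(), x1.imag(), x2.real(), x2.imag());
    return 0;
}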
-void fft_4(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, const float32x2_t &w, const float32x2_t &w2, const float32x2_t &w3)
+void fft_4(float32x2_t &x1,
+ float32x2_t &x2,
+ float32x2_t &x3,
+ float32x2_t &x4,
+ const float32x2_t &w,
+ const float32x2_t &w2,
+ const float32x2_t &w3)
{
float32x2_t a = x1;
float32x2_t b = c_mul_neon(w, x2);
@@ -173,7 +188,15 @@ void fft_4(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, c
x4 = wrapper::vadd(x41, x42);
}
-void fft_5(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, float32x2_t &x5, const float32x2_t &w, const float32x2_t &w2, const float32x2_t &w3, const float32x2_t &w4)
+void fft_5(float32x2_t &x1,
+ float32x2_t &x2,
+ float32x2_t &x3,
+ float32x2_t &x4,
+ float32x2_t &x5,
+ const float32x2_t &w,
+ const float32x2_t &w2,
+ const float32x2_t &w3,
+ const float32x2_t &w4)
{
const auto a = x1;
const auto b = c_mul_neon(w, x2);
@@ -181,25 +204,25 @@ void fft_5(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f
const auto d = c_mul_neon(w3, x4);
const auto e = c_mul_neon(w4, x5);
- const auto b0 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, b);
- const auto b1 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, b);
- const auto b2 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, b);
- const auto b3 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, b);
+ const auto b0 = c_mul_neon(float32x2_t{kW5_0, -kW5_1}, b);
+ const auto b1 = c_mul_neon(float32x2_t{-kW5_2, -kW5_3}, b);
+ const auto b2 = c_mul_neon(float32x2_t{-kW5_2, kW5_3}, b);
+ const auto b3 = c_mul_neon(float32x2_t{kW5_0, kW5_1}, b);
- const auto c0 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, c);
- const auto c1 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, c);
- const auto c2 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, c);
- const auto c3 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, c);
+ const auto c0 = c_mul_neon(float32x2_t{-kW5_2, -kW5_3}, c);
+ const auto c1 = c_mul_neon(float32x2_t{kW5_0, kW5_1}, c);
+ const auto c2 = c_mul_neon(float32x2_t{kW5_0, -kW5_1}, c);
+ const auto c3 = c_mul_neon(float32x2_t{-kW5_2, kW5_3}, c);
- const auto d0 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, d);
- const auto d1 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, d);
- const auto d2 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, d);
- const auto d3 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, d);
+ const auto d0 = c_mul_neon(float32x2_t{-kW5_2, kW5_3}, d);
+ const auto d1 = c_mul_neon(float32x2_t{kW5_0, -kW5_1}, d);
+ const auto d2 = c_mul_neon(float32x2_t{kW5_0, kW5_1}, d);
+ const auto d3 = c_mul_neon(float32x2_t{-kW5_2, -kW5_3}, d);
- const auto e0 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, e);
- const auto e1 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, e);
- const auto e2 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, e);
- const auto e3 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, e);
+ const auto e0 = c_mul_neon(float32x2_t{kW5_0, kW5_1}, e);
+ const auto e1 = c_mul_neon(float32x2_t{-kW5_2, kW5_3}, e);
+ const auto e2 = c_mul_neon(float32x2_t{-kW5_2, -kW5_3}, e);
+ const auto e3 = c_mul_neon(float32x2_t{kW5_0, -kW5_1}, e);
x1 = reduce_sum_5(a, b, c, d, e);
x2 = reduce_sum_5(a, b0, c0, d0, e0);
@@ -208,9 +231,19 @@ void fft_5(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f
x5 = reduce_sum_5(a, b3, c3, d3, e3);
}
-void fft_7(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, float32x2_t &x5, float32x2_t &x6, float32x2_t &x7, const float32x2_t &w, const float32x2_t &w2, const float32x2_t &w3,
+void fft_7(float32x2_t &x1,
+ float32x2_t &x2,
+ float32x2_t &x3,
+ float32x2_t &x4,
+ float32x2_t &x5,
+ float32x2_t &x6,
+ float32x2_t &x7,
+ const float32x2_t &w,
+ const float32x2_t &w2,
+ const float32x2_t &w3,
const float32x2_t &w4,
- const float32x2_t &w5, const float32x2_t &w6)
+ const float32x2_t &w5,
+ const float32x2_t &w6)
{
const auto a = x1;
const auto b = c_mul_neon(w, x2);
@@ -220,47 +253,47 @@ void fft_7(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f
const auto f = c_mul_neon(w5, x6);
const auto g = c_mul_neon(w6, x7);
- const auto b0 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, b);
- const auto b1 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, b);
- const auto b2 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, b);
- const auto b3 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, b);
- const auto b4 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, b);
- const auto b5 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, b);
-
- const auto c0 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, c);
- const auto c1 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, c);
- const auto c2 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, c);
- const auto c3 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, c);
- const auto c4 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, c);
- const auto c5 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, c);
-
- const auto d0 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, d);
- const auto d1 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, d);
- const auto d2 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, d);
- const auto d3 = c_mul_neon(float32x2_t{ -kW7_2, +kW7_3 }, d);
- const auto d4 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, d);
- const auto d5 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, d);
-
- const auto e0 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, e);
- const auto e1 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, e);
- const auto e2 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, e);
- const auto e3 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, e);
- const auto e4 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, e);
- const auto e5 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, e);
-
- const auto f0 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, f);
- const auto f1 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, f);
- const auto f2 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, f);
- const auto f3 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, f);
- const auto f4 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, f);
- const auto f5 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, f);
-
- const auto g0 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, g);
- const auto g1 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, g);
- const auto g2 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, g);
- const auto g3 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, g);
- const auto g4 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, g);
- const auto g5 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, g);
+ const auto b0 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, b);
+ const auto b1 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, b);
+ const auto b2 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, b);
+ const auto b3 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, b);
+ const auto b4 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, b);
+ const auto b5 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, b);
+
+ const auto c0 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, c);
+ const auto c1 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, c);
+ const auto c2 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, c);
+ const auto c3 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, c);
+ const auto c4 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, c);
+ const auto c5 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, c);
+
+ const auto d0 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, d);
+ const auto d1 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, d);
+ const auto d2 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, d);
+ const auto d3 = c_mul_neon(float32x2_t{-kW7_2, +kW7_3}, d);
+ const auto d4 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, d);
+ const auto d5 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, d);
+
+ const auto e0 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, e);
+ const auto e1 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, e);
+ const auto e2 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, e);
+ const auto e3 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, e);
+ const auto e4 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, e);
+ const auto e5 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, e);
+
+ const auto f0 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, f);
+ const auto f1 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, f);
+ const auto f2 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, f);
+ const auto f3 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, f);
+ const auto f4 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, f);
+ const auto f5 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, f);
+
+ const auto g0 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, g);
+ const auto g1 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, g);
+ const auto g2 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, g);
+ const auto g3 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, g);
+ const auto g4 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, g);
+ const auto g5 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, g);
x1 = reduce_sum_7(a, b, c, d, e, f, g);
x2 = reduce_sum_7(a, b0, c0, d0, e0, f0, g0);
@@ -271,9 +304,20 @@ void fft_7(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f
x7 = reduce_sum_7(a, b5, c5, d5, e5, f5, g5);
}
-void fft_8(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, float32x2_t &x5, float32x2_t &x6, float32x2_t &x7, float32x2_t &x8, const float32x2_t &w, const float32x2_t &w2,
+void fft_8(float32x2_t &x1,
+ float32x2_t &x2,
+ float32x2_t &x3,
+ float32x2_t &x4,
+ float32x2_t &x5,
+ float32x2_t &x6,
+ float32x2_t &x7,
+ float32x2_t &x8,
+ const float32x2_t &w,
+ const float32x2_t &w2,
const float32x2_t &w3,
- const float32x2_t &w4, const float32x2_t &w5, const float32x2_t &w6,
+ const float32x2_t &w4,
+ const float32x2_t &w5,
+ const float32x2_t &w6,
const float32x2_t &w7)
{
const auto a = x1;
@@ -285,61 +329,61 @@ void fft_8(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f
const auto g = c_mul_neon(w6, x7);
const auto h = c_mul_neon(w7, x8);
- const auto b0 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, b);
- const auto b1 = c_mul_neon(float32x2_t{ 0, -1 }, b);
- const auto b2 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, b);
- const auto b3 = c_mul_neon(float32x2_t{ -1, 0 }, b);
- const auto b4 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, b);
- const auto b5 = c_mul_neon(float32x2_t{ 0, 1 }, b);
- const auto b6 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, b);
-
- const auto c0 = c_mul_neon(float32x2_t{ 0, -1 }, c);
- const auto c1 = c_mul_neon(float32x2_t{ -1, 0 }, c);
- const auto c2 = c_mul_neon(float32x2_t{ 0, 1 }, c);
- const auto c3 = c_mul_neon(float32x2_t{ 1, 0 }, c);
- const auto c4 = c_mul_neon(float32x2_t{ 0, -1 }, c);
- const auto c5 = c_mul_neon(float32x2_t{ -1, 0 }, c);
- const auto c6 = c_mul_neon(float32x2_t{ 0, 1 }, c);
-
- const auto d0 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, d);
- const auto d1 = c_mul_neon(float32x2_t{ 0, 1 }, d);
- const auto d2 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, d);
- const auto d3 = c_mul_neon(float32x2_t{ -1, 0 }, d);
- const auto d4 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, d);
- const auto d5 = c_mul_neon(float32x2_t{ 0, -1 }, d);
- const auto d6 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, d);
-
- const auto e0 = c_mul_neon(float32x2_t{ -1, 0 }, e);
- const auto e1 = c_mul_neon(float32x2_t{ 1, 0 }, e);
- const auto e2 = c_mul_neon(float32x2_t{ -1, 0 }, e);
- const auto e3 = c_mul_neon(float32x2_t{ 1, 0 }, e);
- const auto e4 = c_mul_neon(float32x2_t{ -1, 0 }, e);
- const auto e5 = c_mul_neon(float32x2_t{ 1, 0 }, e);
- const auto e6 = c_mul_neon(float32x2_t{ -1, 0 }, e);
-
- const auto f0 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, f);
- const auto f1 = c_mul_neon(float32x2_t{ 0, -1 }, f);
- const auto f2 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, f);
- const auto f3 = c_mul_neon(float32x2_t{ -1, 0 }, f);
- const auto f4 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, f);
- const auto f5 = c_mul_neon(float32x2_t{ 0, 1 }, f);
- const auto f6 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, f);
-
- const auto g0 = c_mul_neon(float32x2_t{ 0, 1 }, g);
- const auto g1 = c_mul_neon(float32x2_t{ -1, 0 }, g);
- const auto g2 = c_mul_neon(float32x2_t{ 0, -1 }, g);
- const auto g3 = c_mul_neon(float32x2_t{ 1, 0 }, g);
- const auto g4 = c_mul_neon(float32x2_t{ 0, 1 }, g);
- const auto g5 = c_mul_neon(float32x2_t{ -1, 0 }, g);
- const auto g6 = c_mul_neon(float32x2_t{ 0, -1 }, g);
-
- const auto h0 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, h);
- const auto h1 = c_mul_neon(float32x2_t{ 0, 1 }, h);
- const auto h2 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, h);
- const auto h3 = c_mul_neon(float32x2_t{ -1, 0 }, h);
- const auto h4 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, h);
- const auto h5 = c_mul_neon(float32x2_t{ 0, -1 }, h);
- const auto h6 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, h);
+ const auto b0 = c_mul_neon(float32x2_t{kSqrt2Div2, -kSqrt2Div2}, b);
+ const auto b1 = c_mul_neon(float32x2_t{0, -1}, b);
+ const auto b2 = c_mul_neon(float32x2_t{-kSqrt2Div2, -kSqrt2Div2}, b);
+ const auto b3 = c_mul_neon(float32x2_t{-1, 0}, b);
+ const auto b4 = c_mul_neon(float32x2_t{-kSqrt2Div2, kSqrt2Div2}, b);
+ const auto b5 = c_mul_neon(float32x2_t{0, 1}, b);
+ const auto b6 = c_mul_neon(float32x2_t{kSqrt2Div2, kSqrt2Div2}, b);
+
+ const auto c0 = c_mul_neon(float32x2_t{0, -1}, c);
+ const auto c1 = c_mul_neon(float32x2_t{-1, 0}, c);
+ const auto c2 = c_mul_neon(float32x2_t{0, 1}, c);
+ const auto c3 = c_mul_neon(float32x2_t{1, 0}, c);
+ const auto c4 = c_mul_neon(float32x2_t{0, -1}, c);
+ const auto c5 = c_mul_neon(float32x2_t{-1, 0}, c);
+ const auto c6 = c_mul_neon(float32x2_t{0, 1}, c);
+
+ const auto d0 = c_mul_neon(float32x2_t{-kSqrt2Div2, -kSqrt2Div2}, d);
+ const auto d1 = c_mul_neon(float32x2_t{0, 1}, d);
+ const auto d2 = c_mul_neon(float32x2_t{kSqrt2Div2, -kSqrt2Div2}, d);
+ const auto d3 = c_mul_neon(float32x2_t{-1, 0}, d);
+ const auto d4 = c_mul_neon(float32x2_t{kSqrt2Div2, kSqrt2Div2}, d);
+ const auto d5 = c_mul_neon(float32x2_t{0, -1}, d);
+ const auto d6 = c_mul_neon(float32x2_t{-kSqrt2Div2, kSqrt2Div2}, d);
+
+ const auto e0 = c_mul_neon(float32x2_t{-1, 0}, e);
+ const auto e1 = c_mul_neon(float32x2_t{1, 0}, e);
+ const auto e2 = c_mul_neon(float32x2_t{-1, 0}, e);
+ const auto e3 = c_mul_neon(float32x2_t{1, 0}, e);
+ const auto e4 = c_mul_neon(float32x2_t{-1, 0}, e);
+ const auto e5 = c_mul_neon(float32x2_t{1, 0}, e);
+ const auto e6 = c_mul_neon(float32x2_t{-1, 0}, e);
+
+ const auto f0 = c_mul_neon(float32x2_t{-kSqrt2Div2, kSqrt2Div2}, f);
+ const auto f1 = c_mul_neon(float32x2_t{0, -1}, f);
+ const auto f2 = c_mul_neon(float32x2_t{kSqrt2Div2, kSqrt2Div2}, f);
+ const auto f3 = c_mul_neon(float32x2_t{-1, 0}, f);
+ const auto f4 = c_mul_neon(float32x2_t{kSqrt2Div2, -kSqrt2Div2}, f);
+ const auto f5 = c_mul_neon(float32x2_t{0, 1}, f);
+ const auto f6 = c_mul_neon(float32x2_t{-kSqrt2Div2, -kSqrt2Div2}, f);
+
+ const auto g0 = c_mul_neon(float32x2_t{0, 1}, g);
+ const auto g1 = c_mul_neon(float32x2_t{-1, 0}, g);
+ const auto g2 = c_mul_neon(float32x2_t{0, -1}, g);
+ const auto g3 = c_mul_neon(float32x2_t{1, 0}, g);
+ const auto g4 = c_mul_neon(float32x2_t{0, 1}, g);
+ const auto g5 = c_mul_neon(float32x2_t{-1, 0}, g);
+ const auto g6 = c_mul_neon(float32x2_t{0, -1}, g);
+
+ const auto h0 = c_mul_neon(float32x2_t{kSqrt2Div2, kSqrt2Div2}, h);
+ const auto h1 = c_mul_neon(float32x2_t{0, 1}, h);
+ const auto h2 = c_mul_neon(float32x2_t{-kSqrt2Div2, kSqrt2Div2}, h);
+ const auto h3 = c_mul_neon(float32x2_t{-1, 0}, h);
+ const auto h4 = c_mul_neon(float32x2_t{-kSqrt2Div2, -kSqrt2Div2}, h);
+ const auto h5 = c_mul_neon(float32x2_t{0, -1}, h);
+ const auto h6 = c_mul_neon(float32x2_t{kSqrt2Div2, -kSqrt2Div2}, h);
x1 = reduce_sum_8(a, b, c, d, e, f, g, h);
x2 = reduce_sum_8(a, b0, c0, d0, e0, f0, g0, h0);
@@ -352,18 +396,19 @@ void fft_8(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, f
}
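// For reference: the {re, im} constants fed to c_mul_neon above are the
// eighth roots of unity exp(-2*pi*i*k/8), with kSqrt2Div2 = sqrt(2)/2, so each
// b0..h6 line rotates one butterfly input by a fixed angle. A minimal scalar
// sketch of the complex product c_mul_neon is assumed to compute on such
// {re, im} pairs:

#include <array>

using complex_f = std::array<float, 2>; // {real, imaginary}

// (a + bi) * (c + di) = (ac - bd) + (ad + bc)i
inline complex_f c_mul_scalar(const complex_f &x, const complex_f &y)
{
    return {x[0] * y[0] - x[1] * y[1], x[0] * y[1] + x[1] * y[0]};
}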
template <bool first_stage>
-void fft_radix_2_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
+void fft_radix_2_axes_0(
+ float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
{
- float32x2_t w{ 1.0f, 0.0f };
- for(unsigned int j = 0; j < Nx; j++)
+ float32x2_t w{1.0f, 0.0f};
+ for (unsigned int j = 0; j < Nx; j++)
{
- for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+ for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
{
- auto a = float32x2_t{ 0, 0 };
- auto b = float32x2_t{ 0, 0 };
+ auto a = float32x2_t{0, 0};
+ auto b = float32x2_t{0, 0};
// Load inputs
- if(first_stage)
+ if (first_stage)
{
const auto ab = wrapper::vloadq(in + k);
a = wrapper::vgetlow(ab);
@@ -379,7 +424,7 @@ void fft_radix_2_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
fft_2(a, b, w);
// Write outputs
- if(first_stage)
+ if (first_stage)
{
wrapper::vstore(out + k, wrapper::vcombine(a, b));
}
@@ -394,12 +439,20 @@ void fft_radix_2_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
}
}
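// Each axes_0 stage above runs Nx twiddled butterflies per group, advancing w
// by w_m between iterations of j (the c_mul_neon(w, w_m) update sits in the
// elided context lines). fft_2 itself is defined earlier in this file; a
// scalar sketch of the standard radix-2 butterfly it is assumed to implement:

#include <array>

using complex_f = std::array<float, 2>; // {real, imaginary}

inline void fft_2_scalar(complex_f &a, complex_f &b, const complex_f &w)
{
    // t = w * b; then a' = a + t and b' = a - t
    const complex_f t{w[0] * b[0] - w[1] * b[1], w[0] * b[1] + w[1] * b[0]};
    const complex_f a0 = a;
    a = {a0[0] + t[0], a0[1] + t[1]};
    b = {a0[0] - t[0], a0[1] - t[1]};
}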
-void fft_radix_2_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x)
+void fft_radix_2_axes_1(float *out,
+ float *in,
+ unsigned int Nx,
+ unsigned int NxRadix,
+ const float32x2_t &w_m,
+ unsigned int N,
+ unsigned int M,
+ unsigned int in_pad_x,
+ unsigned int out_pad_x)
{
- float32x2_t w{ 1.0f, 0.0f };
- for(unsigned int j = 0; j < Nx; j++)
+ float32x2_t w{1.0f, 0.0f};
+ for (unsigned int j = 0; j < Nx; j++)
{
- for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
+ for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
{
// Load inputs
float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k);
@@ -418,20 +471,21 @@ void fft_radix_2_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR
}
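// The axes_1 variants transform along the second dimension, so successive
// butterfly inputs sit whole rows apart; (N + in_pad_x) is the per-row stride
// that skips any horizontal padding. A sketch of this addressing, assuming a
// row-major buffer of M rows, each holding N complex values plus in_pad_x
// padded complex slots:

#include <cstddef>

// Float offset of complex element (row, col); every complex value is 2 floats.
inline std::size_t complex_float_offset(std::size_t row, std::size_t col,
                                        std::size_t n, std::size_t in_pad_x)
{
    return 2 * (row * (n + in_pad_x) + col);
}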
template <bool first_stage>
-void fft_radix_3_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
+void fft_radix_3_axes_0(
+ float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
{
- float32x2_t w{ 1.0f, 0.0f };
- for(unsigned int j = 0; j < Nx; j++)
+ float32x2_t w{1.0f, 0.0f};
+ for (unsigned int j = 0; j < Nx; j++)
{
const auto w2 = c_mul_neon(w, w);
- for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+ for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
{
// Load inputs
- float32x2_t a = { 0, 0 };
- float32x2_t b = { 0, 0 };
- float32x2_t c = { 0, 0 };
- if(first_stage)
+ float32x2_t a = {0, 0};
+ float32x2_t b = {0, 0};
+ float32x2_t c = {0, 0};
+ if (first_stage)
{
const auto ab = wrapper::vloadq(in + k);
a = wrapper::vgetlow(ab);
@@ -447,7 +501,7 @@ void fft_radix_3_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
// Base-case prime transform
fft_3(a, b, c, w, w2);
- if(first_stage)
+ if (first_stage)
{
wrapper::vstore(out + k, wrapper::vcombine(a, b));
}
@@ -462,14 +516,22 @@ void fft_radix_3_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
}
}
-void fft_radix_3_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x)
+void fft_radix_3_axes_1(float *out,
+ float *in,
+ unsigned int Nx,
+ unsigned int NxRadix,
+ const float32x2_t &w_m,
+ unsigned int N,
+ unsigned int M,
+ unsigned int in_pad_x,
+ unsigned int out_pad_x)
{
- float32x2_t w{ 1.0f, 0.0f };
- for(unsigned int j = 0; j < Nx; j++)
+ float32x2_t w{1.0f, 0.0f};
+ for (unsigned int j = 0; j < Nx; j++)
{
const auto w2 = c_mul_neon(w, w);
- for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
+ for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
{
// Load inputs
float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k);
@@ -489,21 +551,22 @@ void fft_radix_3_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR
}
template <bool first_stage>
-void fft_radix_4_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
+void fft_radix_4_axes_0(
+ float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
{
- float32x2_t w{ 1.0f, 0.0f };
- for(unsigned int j = 0; j < Nx; j++)
+ float32x2_t w{1.0f, 0.0f};
+ for (unsigned int j = 0; j < Nx; j++)
{
const auto w2 = c_mul_neon(w, w);
const auto w3 = c_mul_neon(w2, w);
- for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+ for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
{
- float32x2_t a = { 0, 0 };
- float32x2_t b = { 0, 0 };
- float32x2_t c = { 0, 0 };
- float32x2_t d = { 0, 0 };
- if(first_stage)
+ float32x2_t a = {0, 0};
+ float32x2_t b = {0, 0};
+ float32x2_t c = {0, 0};
+ float32x2_t d = {0, 0};
+ if (first_stage)
{
const auto ab = wrapper::vloadq(in + k);
const auto cd = wrapper::vloadq(in + k + 4 * Nx);
@@ -524,7 +587,7 @@ void fft_radix_4_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
// Base-case prime transform
fft_4(a, b, c, d, w, w2, w3);
- if(first_stage)
+ if (first_stage)
{
wrapper::vstore(out + k, wrapper::vcombine(a, b));
wrapper::vstore(out + k + 4 * Nx, wrapper::vcombine(c, d));
@@ -542,15 +605,23 @@ void fft_radix_4_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
}
}
-void fft_radix_4_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x)
+void fft_radix_4_axes_1(float *out,
+ float *in,
+ unsigned int Nx,
+ unsigned int NxRadix,
+ const float32x2_t &w_m,
+ unsigned int N,
+ unsigned int M,
+ unsigned int in_pad_x,
+ unsigned int out_pad_x)
{
- float32x2_t w{ 1.0f, 0.0f };
- for(unsigned int j = 0; j < Nx; j++)
+ float32x2_t w{1.0f, 0.0f};
+ for (unsigned int j = 0; j < Nx; j++)
{
const auto w2 = c_mul_neon(w, w);
const auto w3 = c_mul_neon(w2, w);
- for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
+ for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
{
// Load inputs
float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k);
@@ -572,25 +643,26 @@ void fft_radix_4_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR
}
template <bool first_stage>
-void fft_radix_5_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
+void fft_radix_5_axes_0(
+ float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
{
- float32x2_t w{ 1.0f, 0.0f };
- for(unsigned int j = 0; j < Nx; j++)
+ float32x2_t w{1.0f, 0.0f};
+ for (unsigned int j = 0; j < Nx; j++)
{
const float32x2_t w2 = c_mul_neon(w, w);
const float32x2_t w3 = c_mul_neon(w2, w);
const float32x2_t w4 = c_mul_neon(w3, w);
- for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+ for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
{
- float32x2_t a = { 0, 0 };
- float32x2_t b = { 0, 0 };
- float32x2_t c = { 0, 0 };
- float32x2_t d = { 0, 0 };
- float32x2_t e = { 0, 0 };
+ float32x2_t a = {0, 0};
+ float32x2_t b = {0, 0};
+ float32x2_t c = {0, 0};
+ float32x2_t d = {0, 0};
+ float32x2_t e = {0, 0};
// Load inputs
- if(first_stage)
+ if (first_stage)
{
const auto ab = wrapper::vloadq(in + k);
const auto cd = wrapper::vloadq(in + k + 4 * Nx);
@@ -613,7 +685,7 @@ void fft_radix_5_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
fft_5(a, b, c, d, e, w, w2, w3, w4);
// Store outputs
- if(first_stage)
+ if (first_stage)
{
wrapper::vstore(out + k, wrapper::vcombine(a, b));
wrapper::vstore(out + k + 4 * Nx, wrapper::vcombine(c, d));
@@ -632,16 +704,24 @@ void fft_radix_5_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
}
}
-void fft_radix_5_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x)
+void fft_radix_5_axes_1(float *out,
+ float *in,
+ unsigned int Nx,
+ unsigned int NxRadix,
+ const float32x2_t &w_m,
+ unsigned int N,
+ unsigned int M,
+ unsigned int in_pad_x,
+ unsigned int out_pad_x)
{
- float32x2_t w{ 1.0f, 0.0f };
- for(unsigned int j = 0; j < Nx; j++)
+ float32x2_t w{1.0f, 0.0f};
+ for (unsigned int j = 0; j < Nx; j++)
{
const float32x2_t w2 = c_mul_neon(w, w);
const float32x2_t w3 = c_mul_neon(w2, w);
const float32x2_t w4 = c_mul_neon(w3, w);
- for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
+ for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
{
// Load inputs
float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k);
@@ -666,10 +746,11 @@ void fft_radix_5_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR
}
template <bool first_stage>
-void fft_radix_7_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
+void fft_radix_7_axes_0(
+ float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
{
- float32x2_t w{ 1.0f, 0.0f };
- for(unsigned int j = 0; j < Nx; j++)
+ float32x2_t w{1.0f, 0.0f};
+ for (unsigned int j = 0; j < Nx; j++)
{
const float32x2_t w2 = c_mul_neon(w, w);
const float32x2_t w3 = c_mul_neon(w2, w);
@@ -677,18 +758,18 @@ void fft_radix_7_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
const float32x2_t w5 = c_mul_neon(w4, w);
const float32x2_t w6 = c_mul_neon(w5, w);
- for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+ for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
{
- float32x2_t a = { 0, 0 };
- float32x2_t b = { 0, 0 };
- float32x2_t c = { 0, 0 };
- float32x2_t d = { 0, 0 };
- float32x2_t e = { 0, 0 };
- float32x2_t f = { 0, 0 };
- float32x2_t g = { 0, 0 };
+ float32x2_t a = {0, 0};
+ float32x2_t b = {0, 0};
+ float32x2_t c = {0, 0};
+ float32x2_t d = {0, 0};
+ float32x2_t e = {0, 0};
+ float32x2_t f = {0, 0};
+ float32x2_t g = {0, 0};
// Load inputs
- if(first_stage)
+ if (first_stage)
{
const auto ab = wrapper::vloadq(in + k);
const auto cd = wrapper::vloadq(in + k + 4 * Nx);
@@ -715,7 +796,7 @@ void fft_radix_7_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
// Base-case prime transform
fft_7(a, b, c, d, e, f, g, w, w2, w3, w4, w5, w6);
- if(first_stage)
+ if (first_stage)
{
wrapper::vstore(out + k, wrapper::vcombine(a, b));
wrapper::vstore(out + k + 4 * Nx, wrapper::vcombine(c, d));
@@ -737,10 +818,18 @@ void fft_radix_7_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
}
}
-void fft_radix_7_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x)
+void fft_radix_7_axes_1(float *out,
+ float *in,
+ unsigned int Nx,
+ unsigned int NxRadix,
+ const float32x2_t &w_m,
+ unsigned int N,
+ unsigned int M,
+ unsigned int in_pad_x,
+ unsigned int out_pad_x)
{
- float32x2_t w{ 1.0f, 0.0f };
- for(unsigned int j = 0; j < Nx; j++)
+ float32x2_t w{1.0f, 0.0f};
+ for (unsigned int j = 0; j < Nx; j++)
{
const float32x2_t w2 = c_mul_neon(w, w);
const float32x2_t w3 = c_mul_neon(w2, w);
@@ -748,7 +837,7 @@ void fft_radix_7_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR
const float32x2_t w5 = c_mul_neon(w4, w);
const float32x2_t w6 = c_mul_neon(w5, w);
- for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
+ for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
{
// Load inputs
float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k);
@@ -777,10 +866,11 @@ void fft_radix_7_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR
}
template <bool first_stage>
-void fft_radix_8_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
+void fft_radix_8_axes_0(
+ float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
{
- float32x2_t w{ 1.0f, 0.0f };
- for(unsigned int j = 0; j < Nx; j++)
+ float32x2_t w{1.0f, 0.0f};
+ for (unsigned int j = 0; j < Nx; j++)
{
const float32x2_t w2 = c_mul_neon(w, w);
const float32x2_t w3 = c_mul_neon(w2, w);
@@ -789,20 +879,20 @@ void fft_radix_8_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
const float32x2_t w6 = c_mul_neon(w5, w);
const float32x2_t w7 = c_mul_neon(w6, w);
- for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+ for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
{
// Load inputs
- float32x2_t a = { 0, 0 };
- float32x2_t b = { 0, 0 };
- float32x2_t c = { 0, 0 };
- float32x2_t d = { 0, 0 };
- float32x2_t e = { 0, 0 };
- float32x2_t f = { 0, 0 };
- float32x2_t g = { 0, 0 };
- float32x2_t h = { 0, 0 };
+ float32x2_t a = {0, 0};
+ float32x2_t b = {0, 0};
+ float32x2_t c = {0, 0};
+ float32x2_t d = {0, 0};
+ float32x2_t e = {0, 0};
+ float32x2_t f = {0, 0};
+ float32x2_t g = {0, 0};
+ float32x2_t h = {0, 0};
- if(first_stage)
+ if (first_stage)
{
const auto ab = wrapper::vloadq(in + k);
const auto cd = wrapper::vloadq(in + k + 4 * Nx);
@@ -834,7 +924,7 @@ void fft_radix_8_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
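// Base-case prime transform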
fft_8(a, b, c, d, e, f, g, h, w, w2, w3, w4, w5, w6, w7);
// Store outputs
- if(first_stage)
+ if (first_stage)
{
wrapper::vstore(out + k, wrapper::vcombine(a, b));
wrapper::vstore(out + k + 4 * Nx, wrapper::vcombine(c, d));
@@ -858,10 +948,18 @@ void fft_radix_8_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxR
}
}
-void fft_radix_8_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x)
+void fft_radix_8_axes_1(float *out,
+ float *in,
+ unsigned int Nx,
+ unsigned int NxRadix,
+ const float32x2_t &w_m,
+ unsigned int N,
+ unsigned int M,
+ unsigned int in_pad_x,
+ unsigned int out_pad_x)
{
- float32x2_t w{ 1.0f, 0.0f };
- for(unsigned int j = 0; j < Nx; j++)
+ float32x2_t w{1.0f, 0.0f};
+ for (unsigned int j = 0; j < Nx; j++)
{
const float32x2_t w2 = c_mul_neon(w, w);
const float32x2_t w3 = c_mul_neon(w2, w);
@@ -870,7 +968,7 @@ void fft_radix_8_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxR
const float32x2_t w6 = c_mul_neon(w5, w);
const float32x2_t w7 = c_mul_neon(w6, w);
- for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
+ for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
{
// Load inputs
float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k);
@@ -908,7 +1006,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
ARM_COMPUTE_UNUSED(config);
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -917,11 +1015,12 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const FFTRadixStageKernelInfo &config)
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const FFTRadixStageKernelInfo &config)
{
ARM_COMPUTE_UNUSED(config);
- if(output != nullptr)
+ if (output != nullptr)
{
auto_init_if_empty(*output, *input);
}
@@ -942,7 +1041,7 @@ void NEFFTRadixStageKernel::set_radix_stage_axis0(const FFTRadixStageKernelInfo
// FFT table axis 0: [radix, first_stage]
static std::map<unsigned int, std::map<bool, FFTFunctionPointerAxis0>> fft_table_axis0;
- if(fft_table_axis0.empty())
+ if (fft_table_axis0.empty())
{
fft_table_axis0[2][false] = &fft_radix_2_axes_0<false>;
fft_table_axis0[3][false] = &fft_radix_3_axes_0<false>;
@@ -967,7 +1066,7 @@ void NEFFTRadixStageKernel::set_radix_stage_axis1(const FFTRadixStageKernelInfo
// FFT table axis 1: [radix, first_stage]
static std::map<unsigned int, FFTFunctionPointerAxis1> fft_table_axis1;
- if(fft_table_axis1.empty())
+ if (fft_table_axis1.empty())
{
fft_table_axis1[2] = &fft_radix_2_axes_1;
fft_table_axis1[3] = &fft_radix_3_axes_1;
@@ -985,12 +1084,13 @@ void NEFFTRadixStageKernel::configure(ITensor *input, ITensor *output, const FFT
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
// Output auto initialization if not yet initialized
- if(output != nullptr)
+ if (output != nullptr)
{
auto_init_if_empty(*output->info(), *input->info()->clone());
}
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, config));
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, config));
_input = input;
_output = (output == nullptr) ? input : output;
@@ -998,7 +1098,7 @@ void NEFFTRadixStageKernel::configure(ITensor *input, ITensor *output, const FFT
_axis = config.axis;
_radix = config.radix;
- switch(config.axis)
+ switch (config.axis)
{
case 0:
set_radix_stage_axis0(config);
@@ -1012,26 +1112,28 @@ void NEFFTRadixStageKernel::configure(ITensor *input, ITensor *output, const FFT
}
// Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), (output != nullptr) ? output->info() : nullptr, config);
+ auto win_config =
+ validate_and_configure_window(input->info(), (output != nullptr) ? output->info() : nullptr, config);
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
INEKernel::configure(win_config.second);
}
-Status NEFFTRadixStageKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const FFTRadixStageKernelInfo &config)
+Status NEFFTRadixStageKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const FFTRadixStageKernelInfo &config)
{
const bool run_in_place = (output == nullptr) || (output == input);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, config));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
- (run_in_place) ? nullptr : output->clone().get(),
- config)
- .first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(input->clone().get(), (run_in_place) ? nullptr : output->clone().get(), config)
+ .first);
return Status{};
}
std::set<unsigned int> NEFFTRadixStageKernel::supported_radix()
{
- return std::set<unsigned int> { 2, 3, 4, 5, 7, 8 };
+ return std::set<unsigned int>{2, 3, 4, 5, 7, 8};
}
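// A radix stage exists only for the factors returned above, so a transform
// length must decompose into {2, 3, 4, 5, 7, 8}. A simplified sketch of one
// possible greedy decomposition (the library's actual stage planning lives in
// the FFT functions, not in this kernel):

#include <set>
#include <vector>

inline std::vector<unsigned int> decompose_radix(unsigned int n, const std::set<unsigned int> &radices)
{
    std::vector<unsigned int> stages;
    auto it = radices.rbegin(); // try the largest supported radix first
    while (n > 1 && it != radices.rend())
    {
        if (n % *it == 0)
        {
            stages.push_back(*it);
            n /= *it;
            it = radices.rbegin(); // retry the largest radix on the quotient
        }
        else
        {
            ++it;
        }
    }
    return (n == 1) ? stages : std::vector<unsigned int>{}; // empty: unsupported length
}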
void NEFFTRadixStageKernel::run(const Window &window, const ThreadInfo &info)
@@ -1049,28 +1151,32 @@ void NEFFTRadixStageKernel::run(const Window &window, const ThreadInfo &info)
// Precompute FFT constants
const unsigned int NxRadix = _radix * _Nx;
const float alpha = 2.0f * kPi / float(NxRadix);
- const float32x2_t w_m{ cosf(alpha), -sinf(alpha) };
+ const float32x2_t w_m{cosf(alpha), -sinf(alpha)};
- if(_axis == 0)
+ if (_axis == 0)
{
const unsigned int N = _input->info()->dimension(0);
- execute_window_loop(input_window, [&](const Coordinates &)
- {
- _func_0(reinterpret_cast<float *>(out.ptr()), reinterpret_cast<float *>(in.ptr()), _Nx, NxRadix, w_m, N);
- },
- in, out);
+ execute_window_loop(
+ input_window,
+ [&](const Coordinates &) {
+ _func_0(reinterpret_cast<float *>(out.ptr()), reinterpret_cast<float *>(in.ptr()), _Nx, NxRadix, w_m,
+ N);
+ },
+ in, out);
}
else
{
const unsigned int N = _input->info()->dimension(0);
const unsigned int M = _input->info()->dimension(1);
- execute_window_loop(input_window, [&](const Coordinates &)
- {
- _func_1(reinterpret_cast<float *>(out.ptr()), reinterpret_cast<float *>(in.ptr()), _Nx, NxRadix, w_m, N, M,
- _input->info()->padding().right + _input->info()->padding().left,
- _output->info()->padding().right + _output->info()->padding().left);
- },
- in, out);
+ execute_window_loop(
+ input_window,
+ [&](const Coordinates &)
+ {
+ _func_1(reinterpret_cast<float *>(out.ptr()), reinterpret_cast<float *>(in.ptr()), _Nx, NxRadix, w_m, N,
+ M, _input->info()->padding().right + _input->info()->padding().left,
+ _output->info()->padding().right + _output->info()->padding().left);
+ },
+ in, out);
}
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
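// The twiddle w_m above encodes exp(-2*pi*i / (radix * Nx)) as
// {cos(alpha), -sin(alpha)}. A standalone check of the value for a first
// radix-8 stage (Nx = 1), where alpha = pi / 4:

#include <cmath>
#include <cstdio>

int main()
{
    const float pi = 3.14159265358979f;
    const unsigned int radix = 8, nx = 1;
    const float alpha = 2.0f * pi / float(radix * nx);
    // Prints approximately w_m = {0.707107, -0.707107}
    std::printf("w_m = {%f, %f}\n", std::cos(alpha), -std::sin(alpha));
    return 0;
}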
diff --git a/src/core/NEON/kernels/NEFFTRadixStageKernel.h b/src/core/NEON/kernels/NEFFTRadixStageKernel.h
index 2291a1068c..54f32efa23 100644
--- a/src/core/NEON/kernels/NEFFTRadixStageKernel.h
+++ b/src/core/NEON/kernels/NEFFTRadixStageKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NEFFTRADIXSTAGEKERNEL_H
#include "arm_compute/core/KernelDescriptors.h"
+
#include "src/core/NEON/INEKernel.h"
#include <arm_neon.h>
@@ -92,8 +93,17 @@ private:
void set_radix_stage_axis0(const FFTRadixStageKernelInfo &config);
void set_radix_stage_axis1(const FFTRadixStageKernelInfo &config);
- using FFTFunctionPointerAxis0 = std::function<void(float *, float *, unsigned int, unsigned int, const float32x2_t &, unsigned int)>;
- using FFTFunctionPointerAxis1 = std::function<void(float *, float *, unsigned int, unsigned int, const float32x2_t &, unsigned int, unsigned int, unsigned int, unsigned int)>;
+ using FFTFunctionPointerAxis0 =
+ std::function<void(float *, float *, unsigned int, unsigned int, const float32x2_t &, unsigned int)>;
+ using FFTFunctionPointerAxis1 = std::function<void(float *,
+ float *,
+ unsigned int,
+ unsigned int,
+ const float32x2_t &,
+ unsigned int,
+ unsigned int,
+ unsigned int,
+ unsigned int)>;
FFTFunctionPointerAxis0 _func_0;
FFTFunctionPointerAxis1 _func_1;
diff --git a/src/core/NEON/kernels/NEFFTScaleKernel.cpp b/src/core/NEON/kernels/NEFFTScaleKernel.cpp
index 5ec330bebc..9fe561fc59 100644
--- a/src/core/NEON/kernels/NEFFTScaleKernel.cpp
+++ b/src/core/NEON/kernels/NEFFTScaleKernel.cpp
@@ -28,9 +28,10 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include <arm_neon.h>
@@ -41,8 +42,8 @@ namespace
void scale_complex(float *c_in, float *c_out, bool is_conjugate, float scale)
{
const auto a = wrapper::vload(c_in);
- auto b = wrapper::vdiv(a, float32x2_t{ scale, scale });
- if(is_conjugate)
+ auto b = wrapper::vdiv(a, float32x2_t{scale, scale});
+ if (is_conjugate)
{
const float img_part = wrapper::vgetlane(b, 1);
b = wrapper::vsetlane(-img_part, b, 1);
@@ -56,7 +57,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F32);
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 1 && output->num_channels() != 2);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
@@ -71,7 +72,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
// Configure kernel window
Window win = calculate_max_window(*input, Steps());
- if(output != nullptr)
+ if (output != nullptr)
{
// Output auto initialization if not yet initialized
auto_init_if_empty(*output, *input->clone());
@@ -126,10 +127,10 @@ void NEFFTScaleKernel::run(const Window &window, const ThreadInfo &info)
Iterator in(_input, input_window);
Iterator out(_run_in_place ? _input : _output, input_window);
- execute_window_loop(window, [&](const Coordinates &)
- {
- scale_complex(reinterpret_cast<float *>(in.ptr()), reinterpret_cast<float *>(out.ptr()), _is_conj, _scale);
- },
- in, out);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &)
+ { scale_complex(reinterpret_cast<float *>(in.ptr()), reinterpret_cast<float *>(out.ptr()), _is_conj, _scale); },
+ in, out);
}
} // namespace arm_compute
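// scale_complex above divides both lanes of a complex value by `scale` and,
// when is_conjugate is set, flips the sign of the imaginary lane, which is how
// an inverse transform can reuse the forward stages. A scalar sketch of the
// same operation:

#include <array>

using complex_f = std::array<float, 2>; // {real, imaginary}

inline complex_f scale_complex_scalar(const complex_f &v, bool is_conjugate, float scale)
{
    complex_f out{v[0] / scale, v[1] / scale};
    if (is_conjugate)
    {
        out[1] = -out[1]; // conjugate after scaling, matching the kernel's order
    }
    return out;
}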
diff --git a/src/core/NEON/kernels/NEFFTScaleKernel.h b/src/core/NEON/kernels/NEFFTScaleKernel.h
index 24a19f98ba..608cf5ea34 100644
--- a/src/core/NEON/kernels/NEFFTScaleKernel.h
+++ b/src/core/NEON/kernels/NEFFTScaleKernel.h
@@ -24,10 +24,10 @@
#ifndef ARM_COMPUTE_NEFFTSCALEKERNEL_H
#define ARM_COMPUTE_NEFFTSCALEKERNEL_H
-#include "src/core/NEON/INEKernel.h"
-
#include "arm_compute/core/KernelDescriptors.h"
+#include "src/core/NEON/INEKernel.h"
+
namespace arm_compute
{
// Forward declarations
diff --git a/src/core/NEON/kernels/NEFillBorderKernel.cpp b/src/core/NEON/kernels/NEFillBorderKernel.cpp
index 1c7c1f9763..00b0c0ae8d 100644
--- a/src/core/NEON/kernels/NEFillBorderKernel.cpp
+++ b/src/core/NEON/kernels/NEFillBorderKernel.cpp
@@ -30,14 +30,19 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
namespace arm_compute
{
namespace
{
-inline void fill_constant_value_single_channel_special(ITensor *tensor, const Window &window, unsigned int right, unsigned int bottom, const PixelValue &constant_border_value)
+inline void fill_constant_value_single_channel_special(ITensor *tensor,
+ const Window &window,
+ unsigned int right,
+ unsigned int bottom,
+ const PixelValue &constant_border_value)
{
float border_value;
constant_border_value.get(border_value);
@@ -52,39 +57,43 @@ inline void fill_constant_value_single_channel_special(ITensor *tensor, const Wi
Iterator vertical_it(tensor, vertical);
- execute_window_loop(vertical, [&](const Coordinates &)
- {
- const auto row_start = reinterpret_cast<float *>(start_valid_region + vertical_it.offset());
+ execute_window_loop(
+ vertical,
+ [&](const Coordinates &)
+ {
+ const auto row_start = reinterpret_cast<float *>(start_valid_region + vertical_it.offset());
- // Fill left and right borders
- *(row_start - 1) = border_value;
- std::fill_n(row_start + width, right, border_value);
- },
- vertical_it);
+ // Fill left and right borders
+ *(row_start - 1) = border_value;
+ std::fill_n(row_start + width, right, border_value);
+ },
+ vertical_it);
// Top and bottom border
Iterator plane_it(tensor, window);
// Iterate over all XY planes
- execute_window_loop(window, [&](const Coordinates &)
- {
- uint8_t *base_addr = start_valid_region + plane_it.offset();
- // Top border
- const auto row_start = reinterpret_cast<float *>(base_addr - stridey);
- // Fill top rows including left/right borders
- std::fill_n(row_start - 1, 1 + width + right, border_value);
-
- // Bottom border
- const unsigned low_border_size = height + bottom;
- for(unsigned int i = height; i < low_border_size; ++i)
+ execute_window_loop(
+ window,
+ [&](const Coordinates &)
{
- const auto row_start = reinterpret_cast<float *>(base_addr + i * stridey);
-
- // Fill bottom rows including left/right borders
+ uint8_t *base_addr = start_valid_region + plane_it.offset();
+ // Top border
+ const auto row_start = reinterpret_cast<float *>(base_addr - stridey);
+ // Fill top rows including left/right borders
std::fill_n(row_start - 1, 1 + width + right, border_value);
- }
- },
- plane_it);
+
+ // Bottom border
+ const unsigned low_border_size = height + bottom;
+ for (unsigned int i = height; i < low_border_size; ++i)
+ {
+ const auto row_start = reinterpret_cast<float *>(base_addr + i * stridey);
+
+ // Fill bottom rows including left/right borders
+ std::fill_n(row_start - 1, 1 + width + right, border_value);
+ }
+ },
+ plane_it);
}
} // namespace
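// The special path above is restricted (see run() below) to F32 tensors with a
// one-element top/left border, which lets the left border be written as a
// single float at row_start - 1 and the right border with one std::fill_n. A
// minimal sketch of that per-row constant fill:

#include <algorithm>

// `row` points at the first valid float of a row with `width` valid elements,
// one element of left border and `right` elements of right border around it.
inline void fill_row_borders_constant(float *row, unsigned int width, unsigned int right, float value)
{
    *(row - 1) = value;                     // left border (exactly one element)
    std::fill_n(row + width, right, value); // right border
}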
@@ -93,14 +102,20 @@ NEFillBorderKernel::NEFillBorderKernel()
{
}
-void NEFillBorderKernel::configure(ITensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
+void NEFillBorderKernel::configure(ITensor *tensor,
+ BorderSize border_size,
+ BorderMode border_mode,
+ const PixelValue &constant_border_value)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
_tensor = tensor;
configure(tensor->info(), border_size, border_mode, constant_border_value);
}
-void NEFillBorderKernel::configure(ITensorInfo *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
+void NEFillBorderKernel::configure(ITensorInfo *tensor,
+ BorderSize border_size,
+ BorderMode border_mode,
+ const PixelValue &constant_border_value)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
// Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions.
@@ -124,7 +139,7 @@ void NEFillBorderKernel::run(const Window &window, const ThreadInfo &info)
ARM_COMPUTE_UNUSED(info);
// If there is no border: early exit
- if(_border_size.empty())
+ if (_border_size.empty())
{
return;
}
@@ -132,13 +147,14 @@ void NEFillBorderKernel::run(const Window &window, const ThreadInfo &info)
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- switch(_mode)
+ switch (_mode)
{
case BorderMode::CONSTANT:
{
- if(_border_size.left == 1 && _border_size.top == 1 && _tensor->info()->data_type() == DataType::F32)
+ if (_border_size.left == 1 && _border_size.top == 1 && _tensor->info()->data_type() == DataType::F32)
{
- fill_constant_value_single_channel_special(_tensor, window, _border_size.right, _border_size.bottom, _constant_border_value);
+ fill_constant_value_single_channel_special(_tensor, window, _border_size.right, _border_size.bottom,
+ _constant_border_value);
}
else
{
@@ -176,46 +192,56 @@ void NEFillBorderKernel::fill_replicate_single_channel(const Window &window)
Iterator vertical_it(_tensor, vertical);
- execute_window_loop(vertical, [&](const Coordinates &)
- {
- uint8_t *base_addr = start_valid_region + vertical_it.offset();
- // Fill left and right borders
- for(unsigned int i = 0; i < _border_size.left; ++i)
+ execute_window_loop(
+ vertical,
+ [&](const Coordinates &)
{
- std::memcpy(base_addr + static_cast<int>(i - _border_size.left) * element_size, vertical_it.ptr(), element_size);
- }
+ uint8_t *base_addr = start_valid_region + vertical_it.offset();
+ // Fill left and right borders
+ for (unsigned int i = 0; i < _border_size.left; ++i)
+ {
+ std::memcpy(base_addr + static_cast<int>(i - _border_size.left) * element_size, vertical_it.ptr(),
+ element_size);
+ }
- for(unsigned int i = 0; i < _border_size.right; ++i)
- {
- std::memcpy(base_addr + (width + i) * element_size, vertical_it.ptr() + (width - 1) * element_size, element_size);
- }
- },
- vertical_it);
+ for (unsigned int i = 0; i < _border_size.right; ++i)
+ {
+ std::memcpy(base_addr + (width + i) * element_size, vertical_it.ptr() + (width - 1) * element_size,
+ element_size);
+ }
+ },
+ vertical_it);
// Top and bottom border
Iterator plane_it(_tensor, window);
// Iterate over all XY planes
- execute_window_loop(window, [&](const Coordinates &)
- {
- uint8_t *base_addr = start_valid_region + plane_it.offset();
- // Top border
- for(int i = -_border_size.top; i < 0; ++i)
+ execute_window_loop(
+ window,
+ [&](const Coordinates &)
{
- // Copy top rows including left/right borders
- std::memcpy(base_addr + i * static_cast<int>(_tensor->info()->strides_in_bytes()[1]) - _border_size.left * element_size,
- base_addr - _border_size.left * element_size, (_border_size.left + width + _border_size.right) * element_size);
- }
+ uint8_t *base_addr = start_valid_region + plane_it.offset();
+ // Top border
+ for (int i = -_border_size.top; i < 0; ++i)
+ {
+ // Copy top rows including left/right borders
+ std::memcpy(base_addr + i * static_cast<int>(_tensor->info()->strides_in_bytes()[1]) -
+ _border_size.left * element_size,
+ base_addr - _border_size.left * element_size,
+ (_border_size.left + width + _border_size.right) * element_size);
+ }
- // Bottom border
- for(unsigned int i = height; i < height + _border_size.bottom; ++i)
- {
- // Copy bottom rows including left/right borders
- std::memcpy(base_addr + i * _tensor->info()->strides_in_bytes()[1] - _border_size.left * element_size,
- base_addr + (height - 1) * _tensor->info()->strides_in_bytes()[1] - _border_size.left * element_size, (_border_size.left + width + _border_size.right) * element_size);
- }
- },
- plane_it);
+ // Bottom border
+ for (unsigned int i = height; i < height + _border_size.bottom; ++i)
+ {
+ // Copy bottom rows including left/right borders
+ std::memcpy(base_addr + i * _tensor->info()->strides_in_bytes()[1] - _border_size.left * element_size,
+ base_addr + (height - 1) * _tensor->info()->strides_in_bytes()[1] -
+ _border_size.left * element_size,
+ (_border_size.left + width + _border_size.right) * element_size);
+ }
+ },
+ plane_it);
}
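// Replicate mode extends the nearest valid element outward: left borders
// repeat column 0, right borders repeat column width - 1 (and, per plane, the
// memcpy calls above repeat the first and last valid rows for the top and
// bottom borders). A scalar sketch of the horizontal case on one contiguous
// single-channel row:

inline void replicate_row_borders(float *row, unsigned int width, unsigned int left, unsigned int right)
{
    for (unsigned int i = 1; i <= left; ++i)
    {
        *(row - i) = row[0]; // repeat the first valid element
    }
    for (unsigned int i = 0; i < right; ++i)
    {
        row[width + i] = row[width - 1]; // repeat the last valid element
    }
}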
void NEFillBorderKernel::fill_constant_value_single_channel(const Window &window)
@@ -232,50 +258,57 @@ void NEFillBorderKernel::fill_constant_value_single_channel(const Window &window
Iterator vertical_it(_tensor, vertical);
- execute_window_loop(vertical, [&](const Coordinates &)
- {
- uint8_t *base_addr = start_valid_region + vertical_it.offset();
- // Fill left and right borders
- for(unsigned int i = 0; i < _border_size.left; ++i)
+ execute_window_loop(
+ vertical,
+ [&](const Coordinates &)
{
- std::memcpy(base_addr + static_cast<int>(i - _border_size.left) * element_size, &_constant_border_value, element_size);
- }
+ uint8_t *base_addr = start_valid_region + vertical_it.offset();
+ // Fill left and right borders
+ for (unsigned int i = 0; i < _border_size.left; ++i)
+ {
+ std::memcpy(base_addr + static_cast<int>(i - _border_size.left) * element_size, &_constant_border_value,
+ element_size);
+ }
- for(unsigned int i = 0; i < _border_size.right; ++i)
- {
- std::memcpy(base_addr + (width + i) * element_size, &_constant_border_value, element_size);
- }
- },
- vertical_it);
+ for (unsigned int i = 0; i < _border_size.right; ++i)
+ {
+ std::memcpy(base_addr + (width + i) * element_size, &_constant_border_value, element_size);
+ }
+ },
+ vertical_it);
// Top and bottom border
Iterator plane_it(_tensor, window);
// Iterate over all XY planes
- execute_window_loop(window, [&](const Coordinates &)
- {
- uint8_t *base_addr = start_valid_region + plane_it.offset();
- // Top border
- for(int i = -_border_size.top; i < 0; ++i)
+ execute_window_loop(
+ window,
+ [&](const Coordinates &)
{
- // Fill top rows including left/right borders
- for(unsigned int j = 0; j < (_border_size.left + width + _border_size.right); ++j)
+ uint8_t *base_addr = start_valid_region + plane_it.offset();
+ // Top border
+ for (int i = -_border_size.top; i < 0; ++i)
{
- std::memcpy(base_addr + i * stridey + static_cast<int>(j - _border_size.left) * element_size, &_constant_border_value, element_size);
+ // Fill top rows including left/right borders
+ for (unsigned int j = 0; j < (_border_size.left + width + _border_size.right); ++j)
+ {
+ std::memcpy(base_addr + i * stridey + static_cast<int>(j - _border_size.left) * element_size,
+ &_constant_border_value, element_size);
+ }
}
- }
- // Bottom border
- const unsigned low_border_size = height + _border_size.bottom;
- for(unsigned int i = height; i < low_border_size; ++i)
- {
- // Fill bottom rows including left/right borders
- for(unsigned int j = 0; j < (_border_size.left + width + _border_size.right); ++j)
+ // Bottom border
+ const unsigned low_border_size = height + _border_size.bottom;
+ for (unsigned int i = height; i < low_border_size; ++i)
{
- std::memcpy(base_addr + i * stridey + static_cast<int>(j - _border_size.left) * element_size, &_constant_border_value, element_size);
+ // Fill bottom rows including left/right borders
+ for (unsigned int j = 0; j < (_border_size.left + width + _border_size.right); ++j)
+ {
+ std::memcpy(base_addr + i * stridey + static_cast<int>(j - _border_size.left) * element_size,
+ &_constant_border_value, element_size);
+ }
}
- }
- },
- plane_it);
+ },
+ plane_it);
}
} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEFillBorderKernel.h b/src/core/NEON/kernels/NEFillBorderKernel.h
index 2c851583ed..aaad108bfa 100644
--- a/src/core/NEON/kernels/NEFillBorderKernel.h
+++ b/src/core/NEON/kernels/NEFillBorderKernel.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Types.h"
+
#include "src/core/NEON/INEKernel.h"
namespace arm_compute
@@ -64,7 +65,10 @@ public:
* @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
*
*/
- void configure(ITensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
+ void configure(ITensor *tensor,
+ BorderSize border_size,
+ BorderMode border_mode,
+ const PixelValue &constant_border_value = PixelValue());
/** Initialise the function.
*
* @note This kernel fills the borders within the XY-planes.
@@ -75,7 +79,10 @@ public:
* @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
*
*/
- void configure(ITensorInfo *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
+ void configure(ITensorInfo *tensor,
+ BorderSize border_size,
+ BorderMode border_mode,
+ const PixelValue &constant_border_value = PixelValue());
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
diff --git a/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp
index 51a69046a9..cbe5136fb1 100644
--- a/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp
+++ b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp
@@ -22,7 +22,6 @@
* SOFTWARE.
*/
#include "src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h"
-#include "src/cpu/kernels/fuse_batch_normalization/list.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
@@ -30,12 +29,14 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+
#include "src/common/cpuinfo/CpuIsaInfo.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/wrapper/wrapper.h"
#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/kernels/fuse_batch_normalization/list.h"
#include <map>
@@ -52,8 +53,16 @@ struct FuseBatchNormalizeSelectorData
};
using FBNSelectorPtr = std::add_pointer<bool(const FuseBatchNormalizeSelectorData &data)>::type;
-using FBNUKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, ITensor *, ITensor *,
- const ITensor *, const ITensor *, const ITensor *, const ITensor *, float, const Window &)>::type;
+using FBNUKernelPtr = std::add_pointer<void(const ITensor *,
+ const ITensor *,
+ ITensor *,
+ ITensor *,
+ const ITensor *,
+ const ITensor *,
+ const ITensor *,
+ const ITensor *,
+ float,
+ const Window &)>::type;
struct FBNUKernel
{
@@ -62,73 +71,63 @@ struct FBNUKernel
FBNUKernelPtr ukernel;
};
-static const FBNUKernel available_kernels[] =
-{
- {
- "fused_batch_normalization_conv_NHWC_F16",
- [](const FuseBatchNormalizeSelectorData & data)
- {
- return data.dt == DataType::F16 && data.dl == DataLayout::NHWC && data.isa.fp16 && data.fbn_type == FuseBatchNormalizationType::CONVOLUTION;
- },
- REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_conv_f16)
- },
- {
- "fused_batch_normalization_conv_NCHW_F16",
- [](const FuseBatchNormalizeSelectorData & data)
- {
- return data.dt == DataType::F16 && data.dl == DataLayout::NCHW && data.isa.fp16 && data.fbn_type == FuseBatchNormalizationType::CONVOLUTION;
- },
- REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_conv_f16)
- },
- {
- "fused_batch_normalization_dwc_NHWC_F16",
- [](const FuseBatchNormalizeSelectorData & data)
- {
- return data.dt == DataType::F16 && data.dl == DataLayout::NHWC && data.isa.fp16 && data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION;
- },
- REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nhwc_f16)
- },
- {
- "fused_batch_normalization_dwc_NCHW_F16",
- [](const FuseBatchNormalizeSelectorData & data)
- {
- return data.dt == DataType::F16 && data.dl == DataLayout::NCHW && data.isa.fp16 && data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION;
- },
- REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nchw_f16)
- },
- {
- "fused_batch_normalization_conv_NHWC_F32",
- [](const FuseBatchNormalizeSelectorData & data)
- {
- return data.dt == DataType::F32 && data.dl == DataLayout::NHWC && data.fbn_type == FuseBatchNormalizationType::CONVOLUTION;
- },
- REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_conv_f32)
- },
- {
- "fused_batch_normalization_conv_NCHW_F32",
- [](const FuseBatchNormalizeSelectorData & data)
- {
- return data.dt == DataType::F32 && data.dl == DataLayout::NCHW && data.fbn_type == FuseBatchNormalizationType::CONVOLUTION;
- },
- REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_conv_f32)
- },
- {
- "fused_batch_normalization_dwc_NHWC_F32",
- [](const FuseBatchNormalizeSelectorData & data)
- {
- return data.dt == DataType::F32 && data.dl == DataLayout::NHWC && data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION;
- },
- REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nhwc_f32)
- },
- {
- "fused_batch_normalization_dwc_NCHW_F32",
- [](const FuseBatchNormalizeSelectorData & data)
- {
- return data.dt == DataType::F32 && data.dl == DataLayout::NCHW && data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION;
- },
- REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nchw_f32)
- }
-};
+static const FBNUKernel available_kernels[] = {
+ {"fused_batch_normalization_conv_NHWC_F16",
+ [](const FuseBatchNormalizeSelectorData &data)
+ {
+ return data.dt == DataType::F16 && data.dl == DataLayout::NHWC && data.isa.fp16 &&
+ data.fbn_type == FuseBatchNormalizationType::CONVOLUTION;
+ },
+ REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_conv_f16)},
+ {"fused_batch_normalization_conv_NCHW_F16",
+ [](const FuseBatchNormalizeSelectorData &data)
+ {
+ return data.dt == DataType::F16 && data.dl == DataLayout::NCHW && data.isa.fp16 &&
+ data.fbn_type == FuseBatchNormalizationType::CONVOLUTION;
+ },
+ REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_conv_f16)},
+ {"fused_batch_normalization_dwc_NHWC_F16",
+ [](const FuseBatchNormalizeSelectorData &data)
+ {
+ return data.dt == DataType::F16 && data.dl == DataLayout::NHWC && data.isa.fp16 &&
+ data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION;
+ },
+ REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nhwc_f16)},
+ {"fused_batch_normalization_dwc_NCHW_F16",
+ [](const FuseBatchNormalizeSelectorData &data)
+ {
+ return data.dt == DataType::F16 && data.dl == DataLayout::NCHW && data.isa.fp16 &&
+ data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION;
+ },
+ REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nchw_f16)},
+ {"fused_batch_normalization_conv_NHWC_F32",
+ [](const FuseBatchNormalizeSelectorData &data)
+ {
+ return data.dt == DataType::F32 && data.dl == DataLayout::NHWC &&
+ data.fbn_type == FuseBatchNormalizationType::CONVOLUTION;
+ },
+ REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_conv_f32)},
+ {"fused_batch_normalization_conv_NCHW_F32",
+ [](const FuseBatchNormalizeSelectorData &data)
+ {
+ return data.dt == DataType::F32 && data.dl == DataLayout::NCHW &&
+ data.fbn_type == FuseBatchNormalizationType::CONVOLUTION;
+ },
+ REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_conv_f32)},
+ {"fused_batch_normalization_dwc_NHWC_F32",
+ [](const FuseBatchNormalizeSelectorData &data)
+ {
+ return data.dt == DataType::F32 && data.dl == DataLayout::NHWC &&
+ data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION;
+ },
+ REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nhwc_f32)},
+ {"fused_batch_normalization_dwc_NCHW_F32",
+ [](const FuseBatchNormalizeSelectorData &data)
+ {
+ return data.dt == DataType::F32 && data.dl == DataLayout::NCHW &&
+ data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION;
+ },
+ REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nchw_f32)}};
/** Micro-kernel selector
*
@@ -140,9 +139,9 @@ static const FBNUKernel available_kernels[] =
*/
const FBNUKernel *get_implementation(const FuseBatchNormalizeSelectorData &data)
{
- for(const auto &uk : available_kernels)
+ for (const auto &uk : available_kernels)
{
- if(uk.is_selected(data))
+ if (uk.is_selected(data))
{
return &uk;
}
@@ -150,10 +149,16 @@ const FBNUKernel *get_implementation(const FuseBatchNormalizeSelectorData &data)
return nullptr;
}
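// get_implementation above returns the first table entry whose predicate
// accepts the requested data type, layout, fusion type and ISA, so the order
// of available_kernels encodes priority (F16 entries are listed before F32).
// A reduced sketch of this first-match selector pattern:

#include <cstddef>

struct SelectorData
{
    int data_type;
    int data_layout;
};

struct MicroKernel
{
    const char *name;
    bool (*is_selected)(const SelectorData &);
    void (*ukernel)();
};

inline const MicroKernel *first_match(const MicroKernel *table, std::size_t count, const SelectorData &data)
{
    for (std::size_t i = 0; i < count; ++i)
    {
        if (table[i].is_selected(data))
        {
            return &table[i]; // earlier entries win when several match
        }
    }
    return nullptr;
}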
-Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
- const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
- const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma,
- float epsilon, FuseBatchNormalizationType fbn_type)
+Status validate_arguments(const ITensorInfo *input_weights,
+ const ITensorInfo *bn_mean,
+ const ITensorInfo *bn_var,
+ const ITensorInfo *fused_weights,
+ const ITensorInfo *fused_bias,
+ const ITensorInfo *input_bias,
+ const ITensorInfo *bn_beta,
+ const ITensorInfo *bn_gamma,
+ float epsilon,
+ FuseBatchNormalizationType fbn_type)
{
ARM_COMPUTE_UNUSED(epsilon);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input_weights, bn_mean, bn_var);
@@ -164,43 +169,44 @@ Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *b
ARM_COMPUTE_RETURN_ERROR_ON(input_bias == nullptr && fused_bias == nullptr);
ARM_COMPUTE_RETURN_ERROR_ON(bn_mean->num_dimensions() > 1);
- if(fbn_type == FuseBatchNormalizationType::CONVOLUTION)
+ if (fbn_type == FuseBatchNormalizationType::CONVOLUTION)
{
ARM_COMPUTE_RETURN_ERROR_ON(input_weights->dimension(3) != bn_mean->dimension(0));
}
else
{
- const size_t channel_idx = get_data_layout_dimension_index(input_weights->data_layout(), DataLayoutDimension::CHANNEL);
+ const size_t channel_idx =
+ get_data_layout_dimension_index(input_weights->data_layout(), DataLayoutDimension::CHANNEL);
ARM_COMPUTE_RETURN_ERROR_ON(input_weights->dimension(channel_idx) != bn_mean->dimension(0));
}
// Validate bias
- if(input_bias != nullptr)
+ if (input_bias != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, input_bias);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, input_bias);
}
// Validate beta
- if(bn_beta != nullptr)
+ if (bn_beta != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, bn_beta);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, bn_beta);
}
// Validate gamma
- if(bn_gamma != nullptr)
+ if (bn_gamma != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, bn_gamma);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, bn_gamma);
}
// Validate output weights
- if(fused_weights != nullptr && fused_weights->total_size() != 0)
+ if (fused_weights != nullptr && fused_weights->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_weights, fused_weights);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input_weights, fused_weights);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, fused_weights);
}
// Validate output bias
- if(fused_bias != nullptr && fused_bias->total_size() != 0)
+ if (fused_bias != nullptr && fused_bias->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, fused_bias);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, fused_bias);
@@ -212,15 +218,31 @@ Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *b
} // namespace
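// The selected micro-kernels fold batch normalization into the preceding
// (depthwise) convolution. Per output channel c, with
// s(c) = bn_gamma(c) / sqrt(bn_var(c) + epsilon):
//
//   fused_weights(c) = input_weights(c) * s(c)
//   fused_bias(c)    = (input_bias(c) - bn_mean(c)) * s(c) + bn_beta(c)
//
// A scalar sketch under those definitions (absent beta/gamma behave as 0/1,
// matching the optional bn_beta/bn_gamma arguments):

#include <cmath>

inline void fuse_bn_channel(float w_in, float b_in, float mean, float var,
                            float beta, float gamma, float epsilon,
                            float &w_out, float &b_out)
{
    const float s = gamma / std::sqrt(var + epsilon);
    w_out = w_in * s;
    b_out = (b_in - mean) * s + beta;
}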
NEFuseBatchNormalizationKernel::NEFuseBatchNormalizationKernel()
- : _input_weights(nullptr), _input_bias(nullptr), _bn_mean(nullptr), _bn_var(nullptr), _bn_gamma(nullptr), _bn_beta(nullptr), _fused_weights(nullptr), _fused_bias(nullptr), _epsilon(),
- _run_in_place_weights(false), _run_in_place_bias(false), _func(nullptr)
+ : _input_weights(nullptr),
+ _input_bias(nullptr),
+ _bn_mean(nullptr),
+ _bn_var(nullptr),
+ _bn_gamma(nullptr),
+ _bn_beta(nullptr),
+ _fused_weights(nullptr),
+ _fused_bias(nullptr),
+ _epsilon(),
+ _run_in_place_weights(false),
+ _run_in_place_bias(false),
+ _func(nullptr)
{
}
-void NEFuseBatchNormalizationKernel::configure(const ITensor *input_weights, const ITensor *bn_mean, const ITensor *bn_var,
- ITensor *fused_weights, ITensor *fused_bias,
- const ITensor *input_bias, const ITensor *bn_beta, const ITensor *bn_gamma,
- float epsilon, FuseBatchNormalizationType fbn_type)
+void NEFuseBatchNormalizationKernel::configure(const ITensor *input_weights,
+ const ITensor *bn_mean,
+ const ITensor *bn_var,
+ ITensor *fused_weights,
+ ITensor *fused_bias,
+ const ITensor *input_bias,
+ const ITensor *bn_beta,
+ const ITensor *bn_gamma,
+ float epsilon,
+ FuseBatchNormalizationType fbn_type)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input_weights, bn_mean, bn_var);
@@ -238,27 +260,27 @@ void NEFuseBatchNormalizationKernel::configure(const ITensor *input_weights, con
_run_in_place_bias = (fused_bias == nullptr) || (input_bias != nullptr && fused_bias == input_bias);
// Auto initialize outputs
- if(_fused_weights != nullptr)
+ if (_fused_weights != nullptr)
{
// Output tensor auto initialization if not yet initialized
auto_init_if_empty(*_fused_weights->info(), *_input_weights->info()->clone());
}
- if(_fused_bias != nullptr)
+ if (_fused_bias != nullptr)
{
// Output tensor auto initialization if not yet initialized
auto_init_if_empty(*_fused_bias->info(), *_bn_mean->info()->clone());
}
// Validate arguments
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_weights->info(), bn_mean->info(), bn_var->info(),
- (fused_weights != nullptr) ? fused_weights->info() : nullptr,
- (fused_bias != nullptr) ? fused_bias->info() : nullptr,
- (input_bias != nullptr) ? input_bias->info() : nullptr,
- (bn_beta != nullptr) ? bn_beta->info() : nullptr,
- (bn_gamma != nullptr) ? bn_gamma->info() : nullptr,
- epsilon, fbn_type));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(
+ input_weights->info(), bn_mean->info(), bn_var->info(),
+ (fused_weights != nullptr) ? fused_weights->info() : nullptr,
+ (fused_bias != nullptr) ? fused_bias->info() : nullptr, (input_bias != nullptr) ? input_bias->info() : nullptr,
+ (bn_beta != nullptr) ? bn_beta->info() : nullptr, (bn_gamma != nullptr) ? bn_gamma->info() : nullptr, epsilon,
+ fbn_type));
- const auto *uk = get_implementation(FuseBatchNormalizeSelectorData{ input_weights->info()->data_type(), input_weights->info()->data_layout(), fbn_type, CPUInfo::get().get_isa() });
+ const auto *uk = get_implementation(FuseBatchNormalizeSelectorData{
+ input_weights->info()->data_type(), input_weights->info()->data_layout(), fbn_type, CPUInfo::get().get_isa()});
ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
_func = uk->ukernel;
@@ -268,12 +290,19 @@ void NEFuseBatchNormalizationKernel::configure(const ITensor *input_weights, con
INEKernel::configure(win);
}
-Status NEFuseBatchNormalizationKernel::validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
- const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
- const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma,
- float epsilon, FuseBatchNormalizationType fbn_type)
+Status NEFuseBatchNormalizationKernel::validate(const ITensorInfo *input_weights,
+ const ITensorInfo *bn_mean,
+ const ITensorInfo *bn_var,
+ const ITensorInfo *fused_weights,
+ const ITensorInfo *fused_bias,
+ const ITensorInfo *input_bias,
+ const ITensorInfo *bn_beta,
+ const ITensorInfo *bn_gamma,
+ float epsilon,
+ FuseBatchNormalizationType fbn_type)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_weights, bn_mean, bn_var, fused_weights, fused_bias,
+ input_bias, bn_beta, bn_gamma, epsilon, fbn_type));
return Status{};
}
@@ -284,6 +313,7 @@ void NEFuseBatchNormalizationKernel::run(const Window &window, const ThreadInfo
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
- (*_func)(_input_weights, _input_bias, _fused_weights, _fused_bias, _bn_mean, _bn_var, _bn_beta, _bn_gamma, _epsilon, window);
+ (*_func)(_input_weights, _input_bias, _fused_weights, _fused_bias, _bn_mean, _bn_var, _bn_beta, _bn_gamma, _epsilon,
+ window);
}
} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h
index ee767b01c8..f23280d55a 100644
--- a/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h
+++ b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h
@@ -66,9 +66,16 @@ public:
* @param[in] epsilon (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f.
* @param[in] fbn_type (Optional) Fused batch normalization type. Defaults to CONVOLUTION.
*/
- void configure(const ITensor *input_weights, const ITensor *bn_mean, const ITensor *bn_var, ITensor *fused_weights, ITensor *fused_bias,
- const ITensor *input_bias = nullptr, const ITensor *bn_beta = nullptr, const ITensor *bn_gamma = nullptr,
- float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
+ void configure(const ITensor *input_weights,
+ const ITensor *bn_mean,
+ const ITensor *bn_var,
+ ITensor *fused_weights,
+ ITensor *fused_bias,
+ const ITensor *input_bias = nullptr,
+ const ITensor *bn_beta = nullptr,
+ const ITensor *bn_gamma = nullptr,
+ float epsilon = 0.001f,
+ FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
/** Static function to check if given info will lead to a valid configuration of @ref NEFuseBatchNormalizationKernel
*
* @param[in] input_weights Input weights tensor info for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC
@@ -86,10 +93,16 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
- const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
- const ITensorInfo *input_bias = nullptr, const ITensorInfo *bn_beta = nullptr, const ITensorInfo *bn_gamma = nullptr,
- float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
+ static Status validate(const ITensorInfo *input_weights,
+ const ITensorInfo *bn_mean,
+ const ITensorInfo *bn_var,
+ const ITensorInfo *fused_weights,
+ const ITensorInfo *fused_bias,
+ const ITensorInfo *input_bias = nullptr,
+ const ITensorInfo *bn_beta = nullptr,
+ const ITensorInfo *bn_gamma = nullptr,
+ float epsilon = 0.001f,
+ FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
@@ -107,8 +120,16 @@ private:
bool _run_in_place_weights;
bool _run_in_place_bias;
- using FuseBatchNormFunction = void(const ITensor *input_weights, const ITensor *input_bias, ITensor *fused_weights, ITensor *fused_bias,
- const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window);
+ using FuseBatchNormFunction = void(const ITensor *input_weights,
+ const ITensor *input_bias,
+ ITensor *fused_weights,
+ ITensor *fused_bias,
+ const ITensor *bn_mean,
+ const ITensor *bn_var,
+ const ITensor *bn_beta,
+ const ITensor *bn_gamma,
+ float epsilon,
+ const Window &window);
FuseBatchNormFunction *_func;
};
diff --git a/src/core/NEON/kernels/NEGatherKernel.cpp b/src/core/NEON/kernels/NEGatherKernel.cpp
index 11332ffac8..f1d457d399 100644
--- a/src/core/NEON/kernels/NEGatherKernel.cpp
+++ b/src/core/NEON/kernels/NEGatherKernel.cpp
@@ -27,9 +27,10 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -42,20 +43,22 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *indices,
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, indices, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
- if(axis < 0)
+ if (axis < 0)
{
axis += input->num_dimensions();
}
ARM_COMPUTE_RETURN_ERROR_ON(0 > axis || axis >= static_cast<int32_t>(input->num_dimensions()));
- ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() + indices->num_dimensions() - 1 > Coordinates::num_max_dimensions);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() + indices->num_dimensions() - 1 >
+ Coordinates::num_max_dimensions);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
- TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(input->tensor_shape(), indices->tensor_shape(), axis);
+ TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(
+ input->tensor_shape(), indices->tensor_shape(), axis);
ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size());
}
@@ -81,23 +84,23 @@ void NEGatherKernel::gather_common(const Window &window, const ThreadInfo &info)
const auto idx_info = _indices->info();
const auto dst_info = _output->info();
- const auto num_dims = dst_info->num_dimensions();
+ const auto num_dims = dst_info->num_dimensions();
const auto chunk_stride = src_info->strides_in_bytes()[_axis];
const auto window_start_x = window.x().start();
- const auto window_end_x = window.x().end();
- auto window_size_x = src_info->element_size();
+ const auto window_end_x = window.x().end();
+ auto window_size_x = src_info->element_size();
const auto idx_limit = static_cast<TIndex>(src_info->tensor_shape()[_axis]);
- if(_axis != 0)
+ if (_axis != 0)
{
dst_win.set(0, Window::Dimension(window_start_x, window_start_x + 1, 1));
window_size_x *= window_end_x - window_start_x;
}
// Compute source and index tensors window based on the output window.
- auto src_win = dst_win;
+ auto src_win = dst_win;
Window idx_win;
for (size_t i = 0; i < idx_info->num_dimensions(); ++i)
@@ -109,22 +112,27 @@ void NEGatherKernel::gather_common(const Window &window, const ThreadInfo &info)
// Use the custom strides to access all three tensors using the same loop.
Iterator src_it(num_dims, _src_it_strides, _input->buffer(), src_info->offset_first_element_in_bytes(), src_win);
Iterator idx_it(num_dims, _idx_it_strides, _indices->buffer(), idx_info->offset_first_element_in_bytes(), idx_win);
- Iterator dst_it(num_dims, dst_info->strides_in_bytes(), _output->buffer(), dst_info->offset_first_element_in_bytes(), dst_win);
-
- execute_window_loop(dst_win, [&](const Coordinates &) {
- const auto idx = *reinterpret_cast<const TIndex *>(idx_it.ptr());
-
- if(idx >= 0 && idx < idx_limit)
- {
- const auto src_ptr = src_it.ptr() + idx * chunk_stride;
+ Iterator dst_it(num_dims, dst_info->strides_in_bytes(), _output->buffer(),
+ dst_info->offset_first_element_in_bytes(), dst_win);
- std::copy_n(src_ptr, window_size_x, dst_it.ptr());
- }
- else
+ execute_window_loop(
+ dst_win,
+ [&](const Coordinates &)
{
- std::fill_n(dst_it.ptr(), window_size_x, 0);
- }
- }, src_it, idx_it, dst_it);
+ const auto idx = *reinterpret_cast<const TIndex *>(idx_it.ptr());
+
+ if (idx >= 0 && idx < idx_limit)
+ {
+ const auto src_ptr = src_it.ptr() + idx * chunk_stride;
+
+ std::copy_n(src_ptr, window_size_x, dst_it.ptr());
+ }
+ else
+ {
+ std::fill_n(dst_it.ptr(), window_size_x, 0);
+ }
+ },
+ src_it, idx_it, dst_it);
}
void NEGatherKernel::configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis)
@@ -137,13 +145,13 @@ void NEGatherKernel::configure(const ITensor *input, const ITensor *indices, ITe
_output = output;
_axis = axis;
- if(_axis < 0)
+ if (_axis < 0)
{
_axis += input->info()->num_dimensions();
}
ARM_COMPUTE_ERROR_ON(0 > _axis || _axis >= static_cast<int32_t>(input->info()->num_dimensions()));
- switch(_indices->info()->data_type())
+ switch (_indices->info()->data_type())
{
case DataType::U32:
_func = &NEGatherKernel::gather_common<uint32_t>;
@@ -157,7 +165,8 @@ void NEGatherKernel::configure(const ITensor *input, const ITensor *indices, ITe
}
// Output auto initialization if not yet initialized
- const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(input->info()->tensor_shape(), indices->info()->tensor_shape(), _axis);
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(
+ input->info()->tensor_shape(), indices->info()->tensor_shape(), _axis);
auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
// Create window
@@ -169,30 +178,31 @@ void NEGatherKernel::configure(const ITensor *input, const ITensor *indices, ITe
// These will be used to iterate lock-step through all tensors (input, indices and output).
size_t dim_no = 0;
- const auto input_info = input->info();
+ const auto input_info = input->info();
const auto &input_strides = input_info->strides_in_bytes();
- const auto indices_info = indices->info();
- const auto &indices_strides = indices_info->strides_in_bytes();
- const auto indices_num_dims = indices_info->num_dimensions();
+ const auto indices_info = indices->info();
+ const auto &indices_strides = indices_info->strides_in_bytes();
+ const auto indices_num_dims = indices_info->num_dimensions();
- for(; dim_no < static_cast<size_t>(_axis); ++dim_no)
+ for (; dim_no < static_cast<size_t>(_axis); ++dim_no)
{
_src_it_strides[dim_no] = input_strides[dim_no];
}
- for(; dim_no < static_cast<size_t>(_axis) + indices_num_dims; ++dim_no)
+ for (; dim_no < static_cast<size_t>(_axis) + indices_num_dims; ++dim_no)
{
_idx_it_strides[dim_no] = indices_strides[dim_no - _axis];
}
- for(; dim_no < Coordinates::num_max_dimensions; ++dim_no)
+ for (; dim_no < Coordinates::num_max_dimensions; ++dim_no)
{
_src_it_strides[dim_no] = input_strides[dim_no - indices_num_dims + 1];
}
}
-Status NEGatherKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
+Status
+NEGatherKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, indices, output, axis));
return Status{};
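The reflowed loop above also documents the kernel's contract: each output element copies window_size_x bytes from the chunk selected by the index, and indices outside [0, idx_limit) are written as zeros (std::fill_n) instead of faulting. A usage sketch with hypothetical tensors src/idx/dst, offered as an illustration only; negative axis values wrap as in configure():

    NEGatherKernel gather;
    ARM_COMPUTE_ERROR_THROW_ON(NEGatherKernel::validate(src.info(), idx.info(), dst.info(), /*axis=*/1));
    gather.configure(&src, &idx, &dst, /*axis=*/1); // index data type selects the gather_common instantiation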
diff --git a/src/core/NEON/kernels/NEGatherKernel.h b/src/core/NEON/kernels/NEGatherKernel.h
index ce69daeda7..b8c069f99e 100644
--- a/src/core/NEON/kernels/NEGatherKernel.h
+++ b/src/core/NEON/kernels/NEGatherKernel.h
@@ -26,6 +26,7 @@
#define ARM_COMPUTE_NEGATHERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/NEON/INEKernel.h"
namespace arm_compute
@@ -92,8 +93,8 @@ private:
ITensor *_output;
kernel_ptr _func;
- Strides _src_it_strides;
- Strides _idx_it_strides;
+ Strides _src_it_strides;
+ Strides _idx_it_strides;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NEGATHERKERNEL_H */
diff --git a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp
index 7bba136e84..549319e49f 100644
--- a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp
@@ -27,11 +27,13 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Window.h"
-#include "src/core/CPP/Validate.h"
+
#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/cpu/kernels/genproposals/list.h"
+
#include <arm_neon.h>
namespace arm_compute
@@ -44,7 +46,8 @@ struct ComputeAllAnchorsData
};
using ComputeAllAnchorsSelectorPtr = std::add_pointer<bool(const ComputeAllAnchorsData &data)>::type;
-using ComputeAllAnchorsUKernelPtr = std::add_pointer<void(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window)>::type;
+using ComputeAllAnchorsUKernelPtr = std::add_pointer<void(
+ const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window)>::type;
struct ComputeAllAnchorsKernel
{
@@ -53,27 +56,17 @@ struct ComputeAllAnchorsKernel
ComputeAllAnchorsUKernelPtr ukernel;
};
-static const ComputeAllAnchorsKernel available_kernels[] =
-{
+static const ComputeAllAnchorsKernel available_kernels[] = {
#if defined(ARM_COMPUTE_ENABLE_NEON)
- {
- "neon_qu16_computeallanchors",
- [](const ComputeAllAnchorsData & data) { return data.dt == DataType::QSYMM16; },
- REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qu16_computeallanchors)
- },
+ {"neon_qu16_computeallanchors", [](const ComputeAllAnchorsData &data) { return data.dt == DataType::QSYMM16; },
+ REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qu16_computeallanchors)},
#endif //defined(ARM_COMPUTE_ENABLE_NEON)
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- {
- "neon_fp16_computeallanchors",
- [](const ComputeAllAnchorsData & data) { return data.dt == DataType::F16; },
- REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_computeallanchors)
- },
+ {"neon_fp16_computeallanchors", [](const ComputeAllAnchorsData &data) { return data.dt == DataType::F16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_computeallanchors)},
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- {
- "neon_fp32_computeallanchors",
- [](const ComputeAllAnchorsData & data) { return data.dt == DataType::F32; },
- REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_computeallanchors)
- },
+ {"neon_fp32_computeallanchors", [](const ComputeAllAnchorsData &data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_computeallanchors)},
};
/** Micro-kernel selector
@@ -84,9 +77,9 @@ static const ComputeAllAnchorsKernel available_kernels[] =
*/
const ComputeAllAnchorsKernel *get_implementation(const ComputeAllAnchorsData &data)
{
- for(const auto &uk : available_kernels)
+ for (const auto &uk : available_kernels)
{
- if(uk.is_selected(data))
+ if (uk.is_selected(data))
{
return &uk;
}
@@ -101,7 +94,7 @@ Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anc
ARM_COMPUTE_RETURN_ERROR_ON(anchors->dimension(0) != info.values_per_roi());
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(anchors, DataType::QSYMM16, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(anchors->num_dimensions() > 2);
- if(all_anchors->total_size() > 0)
+ if (all_anchors->total_size() > 0)
{
const size_t feature_height = info.feat_height();
const size_t feature_width = info.feat_width();
@@ -111,7 +104,7 @@ Status validate_arguments(const ITensorInfo *anchors, const ITensorInfo *all_anc
ARM_COMPUTE_RETURN_ERROR_ON(all_anchors->dimension(0) != info.values_per_roi());
ARM_COMPUTE_RETURN_ERROR_ON(all_anchors->dimension(1) != feature_height * feature_width * num_anchors);
- if(is_data_type_quantized(anchors->data_type()))
+ if (is_data_type_quantized(anchors->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(anchors, all_anchors);
}
@@ -139,7 +132,8 @@ void NEComputeAllAnchorsKernel::configure(const ITensor *anchors, ITensor *all_a
// Initialize the output if empty
const TensorShape output_shape(info.values_per_roi(), width * height * num_anchors);
- auto_init_if_empty(*all_anchors->info(), TensorInfo(output_shape, 1, data_type, anchors->info()->quantization_info()));
+ auto_init_if_empty(*all_anchors->info(),
+ TensorInfo(output_shape, 1, data_type, anchors->info()->quantization_info()));
// Set instance variables
_anchors = anchors;
@@ -151,7 +145,9 @@ void NEComputeAllAnchorsKernel::configure(const ITensor *anchors, ITensor *all_a
INEKernel::configure(win);
}
-Status NEComputeAllAnchorsKernel::validate(const ITensorInfo *anchors, const ITensorInfo *all_anchors, const ComputeAnchorsInfo &info)
+Status NEComputeAllAnchorsKernel::validate(const ITensorInfo *anchors,
+ const ITensorInfo *all_anchors,
+ const ComputeAnchorsInfo &info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(anchors, all_anchors, info));
return Status{};
@@ -163,7 +159,7 @@ void NEComputeAllAnchorsKernel::run(const Window &window, const ThreadInfo &info
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- const auto *uk = get_implementation(ComputeAllAnchorsData{ _anchors->info()->data_type() });
+ const auto *uk = get_implementation(ComputeAllAnchorsData{_anchors->info()->data_type()});
ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
uk->ukernel(_anchors, _all_anchors, _anchors_info, window);
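The brace-collapsed tables above all follow the same selection idiom: an array of {name, predicate, ukernel} entries scanned in order, first match wins. A reduced, self-contained sketch of that idiom, with invented names standing in for the REGISTER_*_NEON machinery:

    enum class DataType { F32, F16 };
    static void fp32_impl() {} // stand-in for a registered micro-kernel
    struct KernelEntry
    {
        const char *name;
        bool (*is_selected)(DataType);
        void (*ukernel)();
    };
    static const KernelEntry entries[] = {
        {"fp32_impl", [](DataType dt) { return dt == DataType::F32; }, fp32_impl},
    };
    const KernelEntry *pick(DataType dt)
    {
        for (const auto &e : entries)
        {
            if (e.is_selected(dt))
            {
                return &e;
            }
        }
        return nullptr; // callers assert non-null before dispatching
    }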
diff --git a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h
index 297d6d4abe..30699eee01 100644
--- a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h
+++ b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h
@@ -78,5 +78,5 @@ private:
ITensor *_all_anchors;
ComputeAnchorsInfo _anchors_info;
};
-} // arm_compute
+} // namespace arm_compute
#endif // ARM_COMPUTE_NEGENERATEPROPOSALSLAYERKERNEL_H
diff --git a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp
index 71641404bf..0a1780f6ee 100644
--- a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp
@@ -31,12 +31,13 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include "src/cpu/kernels/instancenorm/list.h"
#include <arm_neon.h>
@@ -51,7 +52,13 @@ struct InstanceNormSelectorData
};
using InstanceNormSelctorPtr = std::add_pointer<bool(const InstanceNormSelectorData &data)>::type;
-using InstanceNormUKernelPtr = std::add_pointer<void(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision, const Window &window)>::type;
+using InstanceNormUKernelPtr = std::add_pointer<void(ITensor *input,
+ ITensor *output,
+ float gamma,
+ float beta,
+ float epsilon,
+ bool use_mixed_precision,
+ const Window &window)>::type;
struct InstanceNormKernel
{
@@ -60,19 +67,12 @@ struct InstanceNormKernel
InstanceNormUKernelPtr ukernel;
};
-static const InstanceNormKernel available_kernels[] =
-{
- {
- "fp32_neon_instancenorm",
- [](const InstanceNormSelectorData & data) { return data.dt == DataType::F32; },
- REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_instancenorm)
- },
+static const InstanceNormKernel available_kernels[] = {
+ {"fp32_neon_instancenorm", [](const InstanceNormSelectorData &data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_instancenorm)},
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- {
- "fp16_neon_instancenorm",
- [](const InstanceNormSelectorData & data) { return data.dt == DataType::F16; },
- REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_instancenorm)
- },
+ {"fp16_neon_instancenorm", [](const InstanceNormSelectorData &data) { return data.dt == DataType::F16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_instancenorm)},
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
};
@@ -84,9 +84,9 @@ static const InstanceNormKernel available_kernels[] =
*/
const InstanceNormKernel *get_implementation(const InstanceNormSelectorData &data)
{
- for(const auto &uk : available_kernels)
+ for (const auto &uk : available_kernels)
{
- if(uk.is_selected(data))
+ if (uk.is_selected(data))
{
return &uk;
}
@@ -102,14 +102,16 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, f
ARM_COMPUTE_RETURN_ERROR_ON_MSG(epsilon == 0.f, "Epsilon must be different than 0");
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() == DataLayout::NHWC, "NHWC data layout is not supported by the kernel directly");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() == DataLayout::NHWC,
+ "NHWC data layout is not supported by the kernel directly");
- if(output != nullptr && output->total_size() != 0)
+ if (output != nullptr && output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(), "Input and output have different number of channels");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(),
+ "Input and output have different number of channels");
}
return Status{};
}
@@ -132,7 +134,9 @@ NEInstanceNormalizationLayerKernel::NEInstanceNormalizationLayerKernel()
{
}
-void NEInstanceNormalizationLayerKernel::configure(ITensor *input, ITensor *output, const InstanceNormalizationLayerKernelInfo &info)
+void NEInstanceNormalizationLayerKernel::configure(ITensor *input,
+ ITensor *output,
+ const InstanceNormalizationLayerKernelInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
@@ -152,10 +156,13 @@ void NEInstanceNormalizationLayerKernel::configure(ITensor *input, ITensor *outp
INEKernel::configure(std::get<1>(win_config));
}
-Status NEInstanceNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info)
+Status NEInstanceNormalizationLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const InstanceNormalizationLayerKernelInfo &info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, info.gamma, info.beta, info.epsilon));
- ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get()))));
+ ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(
+ input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get()))));
return Status{};
}
@@ -165,7 +172,7 @@ void NEInstanceNormalizationLayerKernel::run(const Window &window, const ThreadI
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- const auto *uk = get_implementation(InstanceNormSelectorData{ _input->info()->data_type() });
+ const auto *uk = get_implementation(InstanceNormSelectorData{_input->info()->data_type()});
ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
uk->ukernel(_input, _output, _gamma, _beta, _epsilon, _use_mixed_precision, window);
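For reference, the computation the selected instance-norm ukernels vectorize reduces, per (sample, channel) slice, to the textbook form below; this scalar helper is an illustrative assumption about the math, not code from the library:

    #include <cmath>
    // mean/var are taken over the spatial dimensions of one (sample, channel) slice.
    float instance_norm(float x, float mean, float var, float gamma, float beta, float epsilon)
    {
        return gamma * (x - mean) / std::sqrt(var + epsilon) + beta;
    }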
diff --git a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h
index f166ce2058..024ccd9ef2 100644
--- a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h
+++ b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h
@@ -68,7 +68,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
@@ -82,14 +83,15 @@ private:
* @param[in] beta The offset scalar value applied to the normalized tensor. Defaults to 0.0
* @param[in] epsilon Lower bound value for the normalization. Defaults to 1e-12
*/
- using NormalizationFunction = void(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window);
+ using NormalizationFunction =
+ void(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window);
ITensor *_input;
ITensor *_output;
float _gamma;
float _beta;
float _epsilon;
- bool _use_mixed_precision{ true };
+ bool _use_mixed_precision{true};
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNEL_H */
diff --git a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
index 8ab0288ab1..eea57a17d3 100644
--- a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
@@ -30,11 +30,12 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+
#include "src/common/cpuinfo/CpuIsaInfo.h"
-#include "src/core/NEON/NEMath.h"
#include "src/core/common/Registrars.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEMath.h"
#include "src/cpu/kernels/l2normlayer/list.h"
#include <arm_neon.h>
@@ -55,7 +56,8 @@ struct L2NormalizeLayerSelectorData
using L2NormalizeLayerKernelSelctorPtr = std::add_pointer<bool(const L2NormalizeLayerSelectorData &data)>::type;
-using L2NormalizeLayerPtr = std::add_pointer<void(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis)>::type;
+using L2NormalizeLayerPtr = std::add_pointer<void(
+ const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis)>::type;
struct L2NormalizeLayerKernel
{
@@ -64,26 +66,25 @@ struct L2NormalizeLayerKernel
L2NormalizeLayerPtr ukernel;
};
-static const L2NormalizeLayerKernel available_kernels[] =
-{
- {
- "fp32_neon_l2normalize_x",
- [](const L2NormalizeLayerSelectorData & data) { return data.dt == DataType::F32 && data.actual_axis == Window::DimX; },
- REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_l2_normalize_x)
- },
- {
- "fp32_neon_l2normalize_yz",
- [](const L2NormalizeLayerSelectorData & data) { return data.dt == DataType::F32 && data.actual_axis != Window::DimX; },
- REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_l2_normalize_yz)
- },
+static const L2NormalizeLayerKernel available_kernels[] = {
+ {"fp32_neon_l2normalize_x",
+ [](const L2NormalizeLayerSelectorData &data)
+ { return data.dt == DataType::F32 && data.actual_axis == Window::DimX; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_l2_normalize_x)},
+ {"fp32_neon_l2normalize_yz",
+ [](const L2NormalizeLayerSelectorData &data)
+ { return data.dt == DataType::F32 && data.actual_axis != Window::DimX; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_l2_normalize_yz)},
{
"fp16_neon_l2normalize_x",
- [](const L2NormalizeLayerSelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16 && data.actual_axis == Window::DimX; },
+ [](const L2NormalizeLayerSelectorData &data)
+ { return data.dt == DataType::F16 && data.isa.fp16 && data.actual_axis == Window::DimX; },
REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_l2_normalize_x),
},
{
"fp16_neon_l2normalize_yz",
- [](const L2NormalizeLayerSelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16 && data.actual_axis != Window::DimX; },
+ [](const L2NormalizeLayerSelectorData &data)
+ { return data.dt == DataType::F16 && data.isa.fp16 && data.actual_axis != Window::DimX; },
REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_l2_normalize_yz),
},
};
@@ -96,9 +97,9 @@ static const L2NormalizeLayerKernel available_kernels[] =
*/
const L2NormalizeLayerKernel *get_implementation(const L2NormalizeLayerSelectorData &data)
{
- for(const auto &uk : available_kernels)
+ for (const auto &uk : available_kernels)
{
- if(uk.is_selected(data))
+ if (uk.is_selected(data))
{
return &uk;
}
@@ -106,7 +107,8 @@ const L2NormalizeLayerKernel *get_implementation(const L2NormalizeLayerSelectorD
return nullptr;
}
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon)
+Status
+validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon)
{
ARM_COMPUTE_UNUSED(epsilon);
@@ -115,14 +117,15 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, cons
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, sum);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis > 2, "Actual axis greater than 2 is not supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis >= TensorShape::num_max_dimensions, "Actual normalization axis greater than max number of dimensions");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis >= TensorShape::num_max_dimensions,
+ "Actual normalization axis greater than max number of dimensions");
// Reduce shape on axis
TensorShape sum_shape = input->tensor_shape();
sum_shape.set(actual_axis, 1);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(sum->tensor_shape(), sum_shape);
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -151,7 +154,8 @@ NEL2NormalizeLayerKernel::NEL2NormalizeLayerKernel()
{
}
-void NEL2NormalizeLayerKernel::configure(const ITensor *input, const ITensor *sum, ITensor *output, int axis, float epsilon)
+void NEL2NormalizeLayerKernel::configure(
+ const ITensor *input, const ITensor *sum, ITensor *output, int axis, float epsilon)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, sum, output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), sum->info(), output->info(), axis, epsilon));
@@ -169,10 +173,12 @@ void NEL2NormalizeLayerKernel::configure(const ITensor *input, const ITensor *su
INEKernel::configure(std::get<1>(win_config));
}
-Status NEL2NormalizeLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon)
+Status NEL2NormalizeLayerKernel::validate(
+ const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, sum, output, axis, epsilon));
- ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
return Status{};
}
@@ -183,12 +189,13 @@ void NEL2NormalizeLayerKernel::run(const Window &window, const ThreadInfo &info)
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- if(_actual_axis > 2)
+ if (_actual_axis > 2)
{
ARM_COMPUTE_ERROR("Unsupported normalization axis");
}
- const auto *uk = get_implementation(L2NormalizeLayerSelectorData{ _output->info()->data_type(), _actual_axis, CPUInfo::get().get_isa() });
+ const auto *uk = get_implementation(
+ L2NormalizeLayerSelectorData{_output->info()->data_type(), _actual_axis, CPUInfo::get().get_isa()});
ARM_COMPUTE_ERROR_ON(uk == nullptr);
ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
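The doc comments describe epsilon as a lower bound for the normalization, so the element-wise effect of the x/yz ukernels is, in scalar form, roughly the helper below; the exact clamping order is defined by the ukernels in src/cpu/kernels/l2normlayer/ and is sketched here as an assumption:

    #include <algorithm>
    #include <cmath>
    float l2_normalize(float x, float sum_of_squares, float epsilon)
    {
        return x / std::sqrt(std::max(sum_of_squares, epsilon));
    }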
diff --git a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.h b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.h
index af3ad3403e..3524e66a21 100644
--- a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.h
+++ b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.h
@@ -74,7 +74,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
diff --git a/src/core/NEON/kernels/NELogicalKernel.cpp b/src/core/NEON/kernels/NELogicalKernel.cpp
index 6939e08ef0..6be6284528 100644
--- a/src/core/NEON/kernels/NELogicalKernel.cpp
+++ b/src/core/NEON/kernels/NELogicalKernel.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Validate.h"
+
#include "src/common/utils/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -50,7 +51,7 @@ void neon_logical_and(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, ui
ARM_COMPUTE_ASSERT_NOT_NULLPTR(src1);
ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst);
- for(; len >= step; len -= step)
+ for (; len >= step; len -= step)
{
vst1q_u8(dst, vandq_u8(vminq_u8(vld1q_u8(src0), c1_x16), vminq_u8(vld1q_u8(src1), c1_x16)));
src0 += step;
@@ -58,7 +59,7 @@ void neon_logical_and(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, ui
dst += step;
}
- for(; len >= half_step; len -= half_step)
+ for (; len >= half_step; len -= half_step)
{
vst1_u8(dst, vand_u8(vmin_u8(vld1_u8(src0), c1_x8), vmin_u8(vld1_u8(src1), c1_x8)));
src0 += half_step;
@@ -66,7 +67,7 @@ void neon_logical_and(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, ui
dst += half_step;
}
- for(; len > 0; --len)
+ for (; len > 0; --len)
{
*dst = (*src0) && (*src1);
++src0;
@@ -84,21 +85,21 @@ void neon_logical_and_broadcast(const uint8_t *src, uint8_t broadcast_val, uint8
const auto broadcast_val_clamped_x16 = vdupq_n_u8(broadcast_val_clamped_s);
const auto broadcast_val_clamped_x8 = vdup_n_u8(broadcast_val_clamped_s);
- for(; len >= step; len -= step)
+ for (; len >= step; len -= step)
{
vst1q_u8(dst, vandq_u8(vminq_u8(vld1q_u8(src), c1_x16), broadcast_val_clamped_x16));
src += step;
dst += step;
}
- for(; len >= half_step; len -= half_step)
+ for (; len >= half_step; len -= half_step)
{
vst1_u8(dst, vand_u8(vmin_u8(vld1_u8(src), c1_x8), broadcast_val_clamped_x8));
src += half_step;
dst += half_step;
}
- for(; len > 0; --len)
+ for (; len > 0; --len)
{
*dst = (*src) && broadcast_val_clamped_s;
++src;
@@ -112,7 +113,7 @@ void neon_logical_or(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, uin
ARM_COMPUTE_ASSERT_NOT_NULLPTR(src1);
ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst);
- for(; len >= step; len -= step)
+ for (; len >= step; len -= step)
{
vst1q_u8(dst, vorrq_u8(vminq_u8(vld1q_u8(src0), c1_x16), vminq_u8(vld1q_u8(src1), c1_x16)));
src0 += step;
@@ -120,7 +121,7 @@ void neon_logical_or(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, uin
dst += step;
}
- for(; len >= half_step; len -= half_step)
+ for (; len >= half_step; len -= half_step)
{
vst1_u8(dst, vorr_u8(vmin_u8(vld1_u8(src0), c1_x8), vmin_u8(vld1_u8(src1), c1_x8)));
src0 += half_step;
@@ -128,7 +129,7 @@ void neon_logical_or(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, uin
dst += half_step;
}
- for(; len > 0; --len)
+ for (; len > 0; --len)
{
*dst = (*src0) || (*src1);
++src0;
@@ -146,21 +147,21 @@ void neon_logical_or_broadcast(const uint8_t *src, uint8_t broadcast_val, uint8_
const auto broadcast_val_clamped_x16 = vdupq_n_u8(broadcast_val_clamped_s);
const auto broadcast_val_clamped_x8 = vdup_n_u8(broadcast_val_clamped_s);
- for(; len >= step; len -= step)
+ for (; len >= step; len -= step)
{
vst1q_u8(dst, vorrq_u8(vminq_u8(vld1q_u8(src), c1_x16), broadcast_val_clamped_x16));
src += step;
dst += step;
}
- for(; len >= half_step; len -= half_step)
+ for (; len >= half_step; len -= half_step)
{
vst1_u8(dst, vorr_u8(vmin_u8(vld1_u8(src), c1_x8), broadcast_val_clamped_x8));
src += half_step;
dst += half_step;
}
- for(; len > 0; --len)
+ for (; len > 0; --len)
{
*dst = (*src) || broadcast_val_clamped_s;
++src;
@@ -173,21 +174,21 @@ void neon_logical_not(const uint8_t *src, uint8_t *dst, uint32_t len)
ARM_COMPUTE_ASSERT_NOT_NULLPTR(src);
ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst);
- for(; len >= step; len -= step)
+ for (; len >= step; len -= step)
{
vst1q_u8(dst, vbslq_u8(vceqq_u8(vld1q_u8(src), c0_x16), c1_x16, c0_x16));
src += step;
dst += step;
}
- for(; len >= half_step; len -= half_step)
+ for (; len >= half_step; len -= half_step)
{
vst1_u8(dst, vbsl_u8(vceq_u8(vld1_u8(src), c0_x8), c1_x8, c0_x8));
src += half_step;
dst += half_step;
}
- for(; len > 0; --len)
+ for (; len > 0; --len)
{
*dst = !(*src);
++src;
@@ -197,18 +198,15 @@ void neon_logical_not(const uint8_t *src, uint8_t *dst, uint32_t len)
void run_unary(const Window &window, const ITensor *src, ITensor *dst)
{
- Window win{ window };
+ Window win{window};
win.set(Window::DimX, Window::Dimension(0, 1, 1));
const auto len = window.x().end() - window.x().start();
Iterator in(src, win);
Iterator out(dst, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- neon_logical_not(in.ptr(), out.ptr(), len);
- },
- in, out);
+ execute_window_loop(
+ win, [&](const Coordinates &) { neon_logical_not(in.ptr(), out.ptr(), len); }, in, out);
}
void run_binary(const Window &window, const ITensor *src0, const ITensor *src1, ITensor *dst, LogicalOperation op)
@@ -216,16 +214,17 @@ void run_binary(const Window &window, const ITensor *src0, const ITensor *src1,
Window src0_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
Window src1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
- Window win{ window };
+ Window win{window};
win.set(Window::DimX, Window::Dimension(0, 1, 1));
const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
const auto len = window.x().end() - window.x().start();
- if(is_broadcast_across_x)
+ if (is_broadcast_across_x)
{
- using LogicalBroadcastUKernelPtr = std::add_pointer<void(const uint8_t *, uint8_t, uint8_t *, uint32_t)>::type;
- LogicalBroadcastUKernelPtr logical_func = op == LogicalOperation::Or ? &neon_logical_or_broadcast : &neon_logical_and_broadcast;
+ using LogicalBroadcastUKernelPtr = std::add_pointer<void(const uint8_t *, uint8_t, uint8_t *, uint32_t)>::type;
+ LogicalBroadcastUKernelPtr logical_func =
+ op == LogicalOperation::Or ? &neon_logical_or_broadcast : &neon_logical_and_broadcast;
const bool is_broadcast_input_1 = src1_win.x().step() == 0;
Window broadcast_win = is_broadcast_input_1 ? src1_win : src0_win;
@@ -238,17 +237,18 @@ void run_binary(const Window &window, const ITensor *src0, const ITensor *src1,
Iterator non_broadcast_in(non_broadcast_tensor, non_broadcast_win);
Iterator out(dst, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const uint8_t broadcast_value = *broadcast_in.ptr();
- logical_func(non_broadcast_in.ptr(), broadcast_value, out.ptr(), len);
-
- },
- broadcast_in, non_broadcast_in, out);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const uint8_t broadcast_value = *broadcast_in.ptr();
+ logical_func(non_broadcast_in.ptr(), broadcast_value, out.ptr(), len);
+ },
+ broadcast_in, non_broadcast_in, out);
}
else
{
- using LogicalUKernelPtr = std::add_pointer<void(const uint8_t *, const uint8_t *, uint8_t *, uint32_t)>::type;
+ using LogicalUKernelPtr = std::add_pointer<void(const uint8_t *, const uint8_t *, uint8_t *, uint32_t)>::type;
LogicalUKernelPtr logical_func = op == LogicalOperation::Or ? &neon_logical_or : &neon_logical_and;
src0_win.set(Window::DimX, Window::Dimension(0, 1, 1));
@@ -257,11 +257,8 @@ void run_binary(const Window &window, const ITensor *src0, const ITensor *src1,
Iterator in0(src0, src0_win);
Iterator in1(src1, src1_win);
Iterator out(dst, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- logical_func(in0.ptr(), in1.ptr(), out.ptr(), len);
- },
- in0, in1, out);
+ execute_window_loop(
+ win, [&](const Coordinates &) { logical_func(in0.ptr(), in1.ptr(), out.ptr(), len); }, in0, in1, out);
}
}
} // namespace
@@ -270,7 +267,10 @@ const char *NELogicalKernel::name() const
return "NELogicalKernel";
}
-void NELogicalKernel::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, LogicalOperation op)
+void NELogicalKernel::configure(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ ITensorInfo *output,
+ LogicalOperation op)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input1, output);
ARM_COMPUTE_ERROR_THROW_ON(validate(input1, input2, output, op));
@@ -279,7 +279,7 @@ void NELogicalKernel::configure(const ITensorInfo *input1, const ITensorInfo *in
Window win = calculate_max_window(*input1, Steps());
TensorShape out_shape = input1->tensor_shape();
- if(op != LogicalOperation::Not)
+ if (op != LogicalOperation::Not)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input2);
out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
@@ -292,13 +292,16 @@ void NELogicalKernel::configure(const ITensorInfo *input1, const ITensorInfo *in
set_data_type_if_unknown(*output, input1->data_type());
}
-Status NELogicalKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, LogicalOperation op)
+Status NELogicalKernel::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ LogicalOperation op)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
ARM_COMPUTE_RETURN_ERROR_ON(op == LogicalOperation::Unknown);
TensorShape out_shape = input1->tensor_shape();
- if(op != LogicalOperation::Not)
+ if (op != LogicalOperation::Not)
{
out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
@@ -306,7 +309,7 @@ Status NELogicalKernel::validate(const ITensorInfo *input1, const ITensorInfo *i
}
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
@@ -326,7 +329,7 @@ void NELogicalKernel::run_op(ITensorPack &tensors, const Window &window, const T
const ITensor *src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
ITensor *dst = tensors.get_tensor(TensorType::ACL_DST);
- if(_op == LogicalOperation::Not)
+ if (_op == LogicalOperation::Not)
{
run_unary(window, src0, dst);
}
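The scalar tail loops above fix the semantics that the vector paths reproduce: any non-zero byte counts as true (the NEON code clamps with vmin against 1 before AND/OR), and results are canonical 0/1. Equivalent scalar forms:

    #include <cstdint>
    uint8_t logical_and(uint8_t a, uint8_t b) { return static_cast<uint8_t>(a && b); }
    uint8_t logical_or(uint8_t a, uint8_t b) { return static_cast<uint8_t>(a || b); }
    uint8_t logical_not(uint8_t a) { return static_cast<uint8_t>(!a); }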
diff --git a/src/core/NEON/kernels/NELogicalKernel.h b/src/core/NEON/kernels/NELogicalKernel.h
index caf69cf45d..477a59d826 100644
--- a/src/core/NEON/kernels/NELogicalKernel.h
+++ b/src/core/NEON/kernels/NELogicalKernel.h
@@ -58,10 +58,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, LogicalOperation op);
+ static Status
+ validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, LogicalOperation op);
// Inherited methods overridden:
- void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
const char *name() const override;
private:
diff --git a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp
index 37e88a8565..451031d696 100644
--- a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp
+++ b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp
@@ -28,12 +28,13 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Window.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include "src/cpu/kernels/meanstddevnorm/list.h"
namespace arm_compute
@@ -46,7 +47,8 @@ struct MeanStdDevNormSelectorData
};
using MeanStdDevNormSelctorPtr = std::add_pointer<bool(const MeanStdDevNormSelectorData &data)>::type;
-using MeanStdDevNormUKernelPtr = std::add_pointer<void(ITensor *input, ITensor *output, float epsilon, const Window &window)>::type;
+using MeanStdDevNormUKernelPtr =
+ std::add_pointer<void(ITensor *input, ITensor *output, float epsilon, const Window &window)>::type;
struct MeanStdDevNormKernel
{
@@ -55,25 +57,15 @@ struct MeanStdDevNormKernel
MeanStdDevNormUKernelPtr ukernel;
};
-static const std::vector<MeanStdDevNormKernel> available_kernels =
-{
- {
- "fp32_neon_meanstddevnorm",
- [](const MeanStdDevNormSelectorData & data) { return data.dt == DataType::F32; },
- REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_meanstddevnorm)
- },
+static const std::vector<MeanStdDevNormKernel> available_kernels = {
+ {"fp32_neon_meanstddevnorm", [](const MeanStdDevNormSelectorData &data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_meanstddevnorm)},
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- {
- "fp16_neon_meanstddevnorm",
- [](const MeanStdDevNormSelectorData & data) { return data.dt == DataType::F16; },
- REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_meanstddevnorm)
- },
+ {"fp16_neon_meanstddevnorm", [](const MeanStdDevNormSelectorData &data) { return data.dt == DataType::F16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_meanstddevnorm)},
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- {
- "qasymm8_neon_meanstddevnorm",
- [](const MeanStdDevNormSelectorData & data) { return data.dt == DataType::QASYMM8; },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_meanstddevnorm)
- },
+ {"qasymm8_neon_meanstddevnorm", [](const MeanStdDevNormSelectorData &data) { return data.dt == DataType::QASYMM8; },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_meanstddevnorm)},
};
/** Micro-kernel selector
@@ -84,9 +76,9 @@ static const std::vector<MeanStdDevNormKernel> available_kernels =
*/
const MeanStdDevNormKernel *get_implementation(const MeanStdDevNormSelectorData &data)
{
- for(const auto &uk : available_kernels)
+ for (const auto &uk : available_kernels)
{
- if(uk.is_selected(data))
+ if (uk.is_selected(data))
{
return &uk;
}
@@ -103,7 +95,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, f
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, DataType::QASYMM8);
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -113,7 +105,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, f
std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
{
- if(output != nullptr)
+ if (output != nullptr)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
// Output auto initialization if not yet initialized
@@ -128,8 +120,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
}
} // namespace
-NEMeanStdDevNormalizationKernel::NEMeanStdDevNormalizationKernel()
- : _input(nullptr), _output(nullptr), _epsilon(1e-8f)
+NEMeanStdDevNormalizationKernel::NEMeanStdDevNormalizationKernel() : _input(nullptr), _output(nullptr), _epsilon(1e-8f)
{
}
@@ -137,7 +128,8 @@ void NEMeanStdDevNormalizationKernel::configure(ITensor *input, ITensor *output,
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
- ARM_COMPUTE_ERROR_THROW_ON(NEMeanStdDevNormalizationKernel::validate(input->info(), (output != nullptr) ? output->info() : nullptr, epsilon));
+ ARM_COMPUTE_ERROR_THROW_ON(NEMeanStdDevNormalizationKernel::validate(
+ input->info(), (output != nullptr) ? output->info() : nullptr, epsilon));
_input = input;
_output = (output == nullptr) ? input : output;
@@ -152,7 +144,9 @@ void NEMeanStdDevNormalizationKernel::configure(ITensor *input, ITensor *output,
Status NEMeanStdDevNormalizationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, float epsilon)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, epsilon));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (output != nullptr) ? output->clone().get() : nullptr).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(input->clone().get(), (output != nullptr) ? output->clone().get() : nullptr)
+ .first);
return Status{};
}
@@ -162,7 +156,7 @@ void NEMeanStdDevNormalizationKernel::run(const Window &window, const ThreadInfo
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
- const auto *uk = get_implementation(MeanStdDevNormSelectorData{ _output->info()->data_type() });
+ const auto *uk = get_implementation(MeanStdDevNormSelectorData{_output->info()->data_type()});
ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
uk->ukernel(_input, _output, _epsilon, window);
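The mean/stddev normalization ukernels selected above implement, per row, the scalar form below; an illustrative assumption consistent with the kernel's epsilon parameter rather than a transcription of the NEON code:

    #include <cmath>
    float mean_stddev_norm(float x, float mean, float var, float epsilon)
    {
        return (x - mean) / std::sqrt(var + epsilon);
    }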
diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
index 49a045382d..2c61bda147 100644
--- a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
@@ -29,19 +29,23 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+
#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/NEFixedPoint.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/wrapper/wrapper.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/NormalizationHelpers.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEFixedPoint.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *input_squared, const ITensorInfo *output, const NormalizationLayerInfo &norm_info)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *input_squared,
+ const ITensorInfo *output,
+ const NormalizationLayerInfo &norm_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_squared, output);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
@@ -52,7 +56,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *input_squ
ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd");
// Checks performed when output is configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
@@ -69,7 +73,10 @@ NENormalizationLayerKernel::NENormalizationLayerKernel()
{
}
-void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor *input_squared, ITensor *output, NormalizationLayerInfo norm_info)
+void NENormalizationLayerKernel::configure(const ITensor *input,
+ const ITensor *input_squared,
+ ITensor *output,
+ NormalizationLayerInfo norm_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_squared, output);
// Output tensor auto initialization if not yet initialized
@@ -85,15 +92,15 @@ void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor *
_output = output;
_norm_info = norm_info;
- switch(_input->info()->data_type())
+ switch (_input->info()->data_type())
{
case DataType::F32:
{
- switch(norm_idx)
+ switch (norm_idx)
{
case 0:
{
- if(norm_info.type() == NormType::IN_MAP_2D)
+ if (norm_info.type() == NormType::IN_MAP_2D)
{
_func = &NENormalizationLayerKernel::normalize_float<float, 4, 0, true>;
}
@@ -104,7 +111,7 @@ void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor *
break;
}
case 1:
- if(norm_info.type() == NormType::IN_MAP_2D)
+ if (norm_info.type() == NormType::IN_MAP_2D)
{
_func = &NENormalizationLayerKernel::normalize_float<float, 4, 1, true>;
}
@@ -124,11 +131,11 @@ void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor *
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
{
- switch(norm_idx)
+ switch (norm_idx)
{
case 0:
{
- if(norm_info.type() == NormType::IN_MAP_2D)
+ if (norm_info.type() == NormType::IN_MAP_2D)
{
_func = &NENormalizationLayerKernel::normalize_float<float16_t, 8, 0, true>;
}
@@ -139,7 +146,7 @@ void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor *
break;
}
case 1:
- if(norm_info.type() == NormType::IN_MAP_2D)
+ if (norm_info.type() == NormType::IN_MAP_2D)
{
_func = &NENormalizationLayerKernel::normalize_float<float16_t, 8, 1, true>;
}
@@ -196,8 +203,9 @@ void NENormalizationLayerKernel::normalize_float(const Window &window)
const auto beta_vec = wrapper::vdup_n(static_cast<T>(_norm_info.beta()), ExactTagType{});
const auto kappa_vec = wrapper::vdup_n(static_cast<T>(_norm_info.kappa()), ExactTagType{});
- auto sequential_normalization = [&](const int x, const Coordinates & id, const int current_row, const int first_row, const int last_row, const T * input_ptr, const uint8_t *input_squared_start_ptr,
- T * output_ptr)
+ auto sequential_normalization = [&](const int x, const Coordinates &id, const int current_row, const int first_row,
+ const int last_row, const T *input_ptr, const uint8_t *input_squared_start_ptr,
+ T *output_ptr)
{
const int current_slice = dim == 0 ? x : id[dim];
const int first_slice = std::max(current_slice - radius, 0);
@@ -206,75 +214,87 @@ void NENormalizationLayerKernel::normalize_float(const Window &window)
const uint8_t *const input_squared_x_ptr = input_squared_start_ptr + x * input_squared_stride_x;
// Accumulate 2D In-Map values
auto accu = static_cast<T>(0.f);
- for(int j = first_row; j <= last_row; ++j)
+ for (int j = first_row; j <= last_row; ++j)
{
// Compute row displacement
const uint8_t *const input_squared_ptr = input_squared_x_ptr + (j - current_row) * input_squared_stride_row;
- for(int i = first_slice; i <= last_slice; ++i)
+ for (int i = first_slice; i <= last_slice; ++i)
{
- accu += *reinterpret_cast<const T *>(input_squared_ptr + (i - current_slice) * input_squared_stride_slice);
+ accu +=
+ *reinterpret_cast<const T *>(input_squared_ptr + (i - current_slice) * input_squared_stride_slice);
}
}
// Normalize
- const auto normalized = std::pow(accu * static_cast<T>(_norm_info.scale_coeff()) + static_cast<T>(_norm_info.kappa()), _norm_info.beta());
+ const auto normalized = std::pow(
+ accu * static_cast<T>(_norm_info.scale_coeff()) + static_cast<T>(_norm_info.kappa()), _norm_info.beta());
const auto normalized_pixel = (*(input_ptr + x)) / normalized;
*(output_ptr + x) = normalized_pixel;
};
- execute_window_loop(win, [&](const Coordinates & id)
- {
- const auto input_ptr = reinterpret_cast<const T *>(input.ptr());
- auto output_ptr = reinterpret_cast<T *>(output.ptr());
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
+ {
+ const auto input_ptr = reinterpret_cast<const T *>(input.ptr());
+ auto output_ptr = reinterpret_cast<T *>(output.ptr());
- // Get range to normalize
- const int current_row = do_2D_norm ? id[dim_y] : 0;
- const int first_row = do_2D_norm ? std::max(current_row - radius, 0) : 0;
- const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
+ // Get range to normalize
+ const int current_row = do_2D_norm ? id[dim_y] : 0;
+ const int first_row = do_2D_norm ? std::max(current_row - radius, 0) : 0;
+ const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
- int x = window_start_x;
- // Compute serially starting elements for the case x dimension is width
- for(; x < radius && x < window_end_x && dim == 0; ++x)
- {
- sequential_normalization(x, id, current_row, first_row, last_row, input_ptr, input_squared.ptr(), output_ptr);
- }
+ int x = window_start_x;
+ // Compute serially starting elements for the case x dimension is width
+ for (; x < radius && x < window_end_x && dim == 0; ++x)
+ {
+ sequential_normalization(x, id, current_row, first_row, last_row, input_ptr, input_squared.ptr(),
+ output_ptr);
+ }
- // Compute vectorized
- for(; x <= window_end_x - window_step_x - radius; x += window_step_x)
- {
- const int current_slice = dim == 0 ? x : id[dim];
- const int first_slice = std::max(current_slice - radius, 0);
- const int last_slice = std::min(current_slice + radius, max_right);
-
- const uint8_t *const input_squared_x_ptr = input_squared.ptr() + x * input_squared_stride_x;
- // Accumulate 2D In-Map values
- auto accu = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
- for(int j = first_row; j <= last_row; ++j)
+ // Compute vectorized
+ for (; x <= window_end_x - window_step_x - radius; x += window_step_x)
{
- // Compute row displacement
- const uint8_t *const input_squared_ptr = input_squared_x_ptr + (j - current_row) * input_squared_stride_row;
- for(int i = first_slice; i <= last_slice; ++i)
+ const int current_slice = dim == 0 ? x : id[dim];
+ const int first_slice = std::max(current_slice - radius, 0);
+ const int last_slice = std::min(current_slice + radius, max_right);
+
+ const uint8_t *const input_squared_x_ptr = input_squared.ptr() + x * input_squared_stride_x;
+ // Accumulate 2D In-Map values
+ auto accu = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+ for (int j = first_row; j <= last_row; ++j)
{
- accu = wrapper::vadd(accu, wrapper::vloadq(reinterpret_cast<const T *>(input_squared_ptr + (i - current_slice) * input_squared_stride_slice)));
+ // Compute row displacement
+ const uint8_t *const input_squared_ptr =
+ input_squared_x_ptr + (j - current_row) * input_squared_stride_row;
+ for (int i = first_slice; i <= last_slice; ++i)
+ {
+ accu = wrapper::vadd(
+ accu, wrapper::vloadq(reinterpret_cast<const T *>(
+ input_squared_ptr + (i - current_slice) * input_squared_stride_slice)));
+ }
}
- }
- // Normalize
- const auto normalized = wrapper::vpow(wrapper::vmla(kappa_vec, coeff_vec, accu), beta_vec);
- const auto normalized_pixel = wrapper::vmul(wrapper::vloadq(input_ptr + x), wrapper::vinv(normalized));
- wrapper::vstore(reinterpret_cast<T *>(output_ptr + x), normalized_pixel);
- }
+ // Normalize
+ const auto normalized = wrapper::vpow(wrapper::vmla(kappa_vec, coeff_vec, accu), beta_vec);
+ const auto normalized_pixel = wrapper::vmul(wrapper::vloadq(input_ptr + x), wrapper::vinv(normalized));
+ wrapper::vstore(reinterpret_cast<T *>(output_ptr + x), normalized_pixel);
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- sequential_normalization(x, id, current_row, first_row, last_row, input_ptr, input_squared.ptr(), output_ptr);
- }
- },
- input, input_squared, output);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ sequential_normalization(x, id, current_row, first_row, last_row, input_ptr, input_squared.ptr(),
+ output_ptr);
+ }
+ },
+ input, input_squared, output);
}
-Status NENormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *input_squared, const ITensorInfo *output, const NormalizationLayerInfo norm_info)
+Status NENormalizationLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *input_squared,
+ const ITensorInfo *output,
+ const NormalizationLayerInfo norm_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, input_squared, output, norm_info));
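Stripped of the windowing, sequential_normalization above computes the usual local-response normalization; a scalar restatement of exactly the expression visible in the diff:

    #include <cmath>
    // accu is the sum of squared inputs over the normalization window.
    float lrn(float in, float accu, float kappa, float scale_coeff, float beta)
    {
        return in / std::pow(accu * scale_coeff + kappa, beta);
    }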
diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.h b/src/core/NEON/kernels/NENormalizationLayerKernel.h
index 53a06b9ed9..2d8d9f3d60 100644
--- a/src/core/NEON/kernels/NENormalizationLayerKernel.h
+++ b/src/core/NEON/kernels/NENormalizationLayerKernel.h
@@ -60,7 +60,8 @@ public:
* @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type and layout supported: same as @p input.
* @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters.
*/
- void configure(const ITensor *input, const ITensor *input_squared, ITensor *output, NormalizationLayerInfo norm_info);
+ void
+ configure(const ITensor *input, const ITensor *input_squared, ITensor *output, NormalizationLayerInfo norm_info);
/** Static function to check if given info will lead to a valid configuration of @ref NENormalizationLayerKernel
*
* @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
@@ -72,7 +73,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *input_squared, const ITensorInfo *output, NormalizationLayerInfo norm_info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *input_squared,
+ const ITensorInfo *output,
+ NormalizationLayerInfo norm_info);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
diff --git a/src/core/NEON/kernels/NEPadLayerKernel.cpp b/src/core/NEON/kernels/NEPadLayerKernel.cpp
index 734510b637..c9bcbc9127 100644
--- a/src/core/NEON/kernels/NEPadLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPadLayerKernel.cpp
@@ -28,26 +28,31 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &paddings, const PaddingMode mode)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PaddingList &paddings,
+ const PaddingMode mode)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(mode != PaddingMode::CONSTANT, "Only constant padding mode is supported");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(paddings.size() > 4, "Padding list bigger than 4 dimensions");
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- const TensorShape expected_output_shape = arm_compute::misc::shape_calculator::compute_padded_shape(input->tensor_shape(), paddings);
- const TensorInfo expected_output_info = input->clone()->set_tensor_shape(expected_output_shape);
+ const TensorShape expected_output_shape =
+ arm_compute::misc::shape_calculator::compute_padded_shape(input->tensor_shape(), paddings);
+ const TensorInfo expected_output_info = input->clone()->set_tensor_shape(expected_output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &expected_output_info);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
@@ -58,30 +63,34 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
template <typename T>
void NEPadLayerKernel::run_pad_constant(const Window &window)
{
- Window output_window{ window };
+ Window output_window{window};
output_window.set(Window::DimX, Window::Dimension(0, 1, 1));
const size_t element_size = _input->info()->element_size();
Iterator output_it(_output, output_window);
- execute_window_loop(output_window, [&](const Coordinates & id)
- {
- Coordinates idin{ id };
- for(size_t dim = _padding.size() - 1; dim > 0; --dim)
+ execute_window_loop(
+ output_window,
+ [&](const Coordinates &id)
{
- idin[dim] -= _padding[dim].first;
- if(idin[dim] < 0 || static_cast<int>(_input->info()->dimension(dim)) - 1 < idin[dim])
+ Coordinates idin{id};
+ for (size_t dim = _padding.size() - 1; dim > 0; --dim)
{
- std::fill_n(reinterpret_cast<T *>(output_it.ptr()), _output->info()->dimension(0), _constant_value.get<T>());
- return;
+ idin[dim] -= _padding[dim].first;
+ if (idin[dim] < 0 || static_cast<int>(_input->info()->dimension(dim)) - 1 < idin[dim])
+ {
+ std::fill_n(reinterpret_cast<T *>(output_it.ptr()), _output->info()->dimension(0),
+ _constant_value.get<T>());
+ return;
+ }
}
- }
- T *input_it_ptr = reinterpret_cast<T *>(_input->ptr_to_element(idin));
- T *output_it_ptr = reinterpret_cast<T *>(output_it.ptr());
- std::fill_n(output_it_ptr, _padding[0].first, _constant_value.get<T>());
- memcpy(output_it_ptr + _padding[0].first, input_it_ptr, _input->info()->dimension(0) * element_size);
- std::fill_n(output_it_ptr + _padding[0].first + _input->info()->dimension(0), _padding[0].second, _constant_value.get<T>());
- },
- output_it);
+ T *input_it_ptr = reinterpret_cast<T *>(_input->ptr_to_element(idin));
+ T *output_it_ptr = reinterpret_cast<T *>(output_it.ptr());
+ std::fill_n(output_it_ptr, _padding[0].first, _constant_value.get<T>());
+ memcpy(output_it_ptr + _padding[0].first, input_it_ptr, _input->info()->dimension(0) * element_size);
+ std::fill_n(output_it_ptr + _padding[0].first + _input->info()->dimension(0), _padding[0].second,
+ _constant_value.get<T>());
+ },
+ output_it);
}
void NEPadLayerKernel::run_pad_constant_uint8_3Dinput_3Dpad(const Window &window)
@@ -92,7 +101,7 @@ void NEPadLayerKernel::run_pad_constant_uint8_3Dinput_3Dpad(const Window &window
const size_t end_plane = window.z().end();
size_t start_plane_input = start_plane;
- if(_padding.size() > 2)
+ if (_padding.size() > 2)
{
start_plane_input = (start_plane < _padding[2].first) ? 0 : start_plane - _padding[2].first;
}
@@ -105,18 +114,20 @@ void NEPadLayerKernel::run_pad_constant_uint8_3Dinput_3Dpad(const Window &window
const size_t jump_to_next_row_input = _input->info()->dimension(0);
const size_t jump_to_next_row_output = _padding[0].first + _padding[0].second;
- uint8_t *output_row_ptr = _output->buffer() + _output->info()->offset_first_element_in_bytes() + start_plane * output_plane_size;
- const uint8_t *input_it_ptr = _input->buffer() + _input->info()->offset_first_element_in_bytes() + start_plane_input * input_plane_size;
- const auto pad_value = _constant_value.get<uint8_t>();
+ uint8_t *output_row_ptr =
+ _output->buffer() + _output->info()->offset_first_element_in_bytes() + start_plane * output_plane_size;
+ const uint8_t *input_it_ptr =
+ _input->buffer() + _input->info()->offset_first_element_in_bytes() + start_plane_input * input_plane_size;
+ const auto pad_value = _constant_value.get<uint8_t>();
- for(size_t z_i = start_plane; z_i < end_plane; ++z_i)
+ for (size_t z_i = start_plane; z_i < end_plane; ++z_i)
{
- if(_padding.size() > 2 && z_i < _padding[2].first)
+ if (_padding.size() > 2 && z_i < _padding[2].first)
{
memset(output_row_ptr, pad_value, output_plane_size);
output_row_ptr += output_plane_size;
}
- else if(_padding.size() > 2 && z_i > (_input->info()->dimension(2) + _padding[2].first - 1))
+ else if (_padding.size() > 2 && z_i > (_input->info()->dimension(2) + _padding[2].first - 1))
{
memset(output_row_ptr, pad_value, output_plane_size);
output_row_ptr += output_plane_size;
@@ -127,7 +138,7 @@ void NEPadLayerKernel::run_pad_constant_uint8_3Dinput_3Dpad(const Window &window
output_row_ptr += pad_y_elems_top;
size_t y_i = _input->info()->dimension(1);
// Basic loop unrolling
- for(; y_i > 3; y_i -= 4)
+ for (; y_i > 3; y_i -= 4)
{
memset(output_row_ptr, pad_value, _padding[0].first);
output_row_ptr += _padding[0].first;
@@ -160,7 +171,7 @@ void NEPadLayerKernel::run_pad_constant_uint8_3Dinput_3Dpad(const Window &window
memset(output_row_ptr, pad_value, _padding[0].second);
output_row_ptr += _padding[0].second;
}
- for(; y_i > 0; --y_i)
+ for (; y_i > 0; --y_i)
{
memset(output_row_ptr, pad_value, _padding[0].first);
output_row_ptr += _padding[0].first;
@@ -183,12 +194,17 @@ NEPadLayerKernel::NEPadLayerKernel()
{
}
-void NEPadLayerKernel::configure(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value, const PaddingMode mode)
+void NEPadLayerKernel::configure(ITensor *input,
+ ITensor *output,
+ const PaddingList &padding,
+ const PixelValue constant_value,
+ const PaddingMode mode)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
// Auto-init
- const TensorShape expected_output_shape = arm_compute::misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), padding);
- const TensorInfo expected_output_info = input->info()->clone()->set_tensor_shape(expected_output_shape);
+ const TensorShape expected_output_shape =
+ arm_compute::misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), padding);
+ const TensorInfo expected_output_info = input->info()->clone()->set_tensor_shape(expected_output_shape);
auto_init_if_empty(*output->info(), expected_output_info);
// Perform validation step
@@ -200,14 +216,14 @@ void NEPadLayerKernel::configure(ITensor *input, ITensor *output, const PaddingL
_constant_value = constant_value;
_mode = mode;
- if(_mode == PaddingMode::CONSTANT)
+ if (_mode == PaddingMode::CONSTANT)
{
- switch(_input->info()->element_size())
+ switch (_input->info()->element_size())
{
case 1:
- if(_input->info()->num_dimensions() == 3 && // Is 3D
- padding.size() <= 3 && // Has 3D padding
- !_input->info()->has_padding() && !_output->info()->has_padding()) // Input & Output have no padding
+ if (_input->info()->num_dimensions() == 3 && // Is 3D
+ padding.size() <= 3 && // Has 3D padding
+ !_input->info()->has_padding() && !_output->info()->has_padding()) // Input & Output have no padding
{
_func = &NEPadLayerKernel::run_pad_constant_uint8_3Dinput_3Dpad;
}
@@ -240,7 +256,11 @@ void NEPadLayerKernel::configure(ITensor *input, ITensor *output, const PaddingL
ICPPKernel::configure(win);
}
-Status NEPadLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, const PixelValue constant_value, const PaddingMode mode)
+Status NEPadLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PaddingList &padding,
+ const PixelValue constant_value,
+ const PaddingMode mode)
{
ARM_COMPUTE_UNUSED(constant_value);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, padding, mode));
@@ -253,7 +273,7 @@ void NEPadLayerKernel::run(const Window &window, const ThreadInfo &info)
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- if(_func != nullptr)
+ if (_func != nullptr)
{
(this->*_func)(window);
}
@@ -263,7 +283,7 @@ size_t NEPadLayerKernel::get_mws(const CPUInfo &platform, size_t thread_count) c
{
ARM_COMPUTE_UNUSED(thread_count);
ARM_COMPUTE_UNUSED(platform);
-
+
return ICPPKernel::default_mws;
}
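
Note: the constant-pad inner loop reformatted above reduces, per output row, to a left fill, a copy of the input row, and a right fill. A standalone sketch of that row layout; pad_before, pad_after and the vector-based signature are illustrative, not the kernel's API.

#include <algorithm>
#include <cstddef>
#include <vector>

// Pads one row with a constant on both sides, mirroring the
// fill_n / memcpy / fill_n sequence in run_pad_constant().
template <typename T>
std::vector<T> pad_row_constant(const std::vector<T> &row, std::size_t pad_before, std::size_t pad_after, T constant)
{
    std::vector<T> out(pad_before + row.size() + pad_after, constant);
    std::copy(row.begin(), row.end(), out.begin() + pad_before);
    return out;
}
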
diff --git a/src/core/NEON/kernels/NEPadLayerKernel.h b/src/core/NEON/kernels/NEPadLayerKernel.h
index f82af1558a..d432887d2c 100644
--- a/src/core/NEON/kernels/NEPadLayerKernel.h
+++ b/src/core/NEON/kernels/NEPadLayerKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NEPADLAYERKERNEL_H
#include "arm_compute/core/PixelValue.h"
+
#include "src/core/NEON/INEKernel.h"
namespace arm_compute
@@ -62,7 +63,11 @@ public:
* @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT.
* Only CONSTANT padding mode is currently supported
*/
- void configure(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value = PixelValue(), const PaddingMode mode = PaddingMode::CONSTANT);
+ void configure(ITensor *input,
+ ITensor *output,
+ const PaddingList &padding,
+ const PixelValue constant_value = PixelValue(),
+ const PaddingMode mode = PaddingMode::CONSTANT);
/** Static function to check if given info will lead to a valid configuration of @ref NEPadLayer.
*
* @param[in] input Source tensor info. Data types supported: All.
@@ -75,7 +80,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, const PixelValue constant_value = PixelValue(), const PaddingMode mode = PaddingMode::CONSTANT);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PaddingList &padding,
+ const PixelValue constant_value = PixelValue(),
+ const PaddingMode mode = PaddingMode::CONSTANT);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
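
Note: like the other kernels touched by this patch, NEPadLayerKernel follows the two-phase contract visible in the header above: a static validate() that can run on ITensorInfo objects before allocation, then configure() on live tensors. A hedged usage sketch; input and output are assumed to be already-initialized Tensor objects, and the padding values are arbitrary.

// Illustrative call sequence only; tensor creation and allocation are omitted.
arm_compute::PaddingList padding{{1, 1}, {2, 2}}; // pad x by 1/1 and y by 2/2
const auto status = arm_compute::NEPadLayerKernel::validate(input.info(), output.info(), padding);
if (bool(status))
{
    arm_compute::NEPadLayerKernel kernel;
    kernel.configure(&input, &output, padding); // constant_value and mode keep their defaults
}
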
diff --git a/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp b/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp
index 3d89933377..15e933e66e 100644
--- a/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp
@@ -27,6 +27,7 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -36,7 +37,10 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info)
+Status validate_arguments(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const PriorBoxLayerInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F32);
@@ -45,10 +49,10 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
// Check variances
const int var_size = info.variances().size();
- if(var_size > 1)
+ if (var_size > 1)
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(var_size != 4, "Must provide 4 variance values");
- for(int i = 0; i < var_size; ++i)
+ for (int i = 0; i < var_size; ++i)
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(var_size <= 0, "Must be greater than 0");
}
@@ -56,17 +60,19 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.steps()[0] < 0.f, "Step x should be greater or equal to 0");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.steps()[1] < 0.f, "Step y should be greater or equal to 0");
- if(!info.max_sizes().empty())
+ if (!info.max_sizes().empty())
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes().size() != info.min_sizes().size(), "Max and min sizes dimensions should match");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes().size() != info.min_sizes().size(),
+ "Max and min sizes dimensions should match");
}
- for(unsigned int i = 0; i < info.max_sizes().size(); ++i)
+ for (unsigned int i = 0; i < info.max_sizes().size(); ++i)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes()[i] < info.min_sizes()[i], "Max size should be greater than min size");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes()[i] < info.min_sizes()[i],
+ "Max size should be greater than min size");
}
- if(output != nullptr && output->total_size() != 0)
+ if (output != nullptr && output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != 2);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
@@ -76,21 +82,26 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
}
} // namespace
-NEPriorBoxLayerKernel::NEPriorBoxLayerKernel()
- : _input1(nullptr), _input2(nullptr), _output(nullptr), _info()
+NEPriorBoxLayerKernel::NEPriorBoxLayerKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr), _info()
{
}
-void NEPriorBoxLayerKernel::store_coordinates(float *out, const int offset, const float center_x, const float center_y, const float box_width, const float box_height, const int width,
- const int height)
+void NEPriorBoxLayerKernel::store_coordinates(float *out,
+ const int offset,
+ const float center_x,
+ const float center_y,
+ const float box_width,
+ const float box_height,
+ const int width,
+ const int height)
{
float xmin = (center_x - box_width / 2.f) / width;
float ymin = (center_y - box_height / 2.f) / height;
float xmax = (center_x + box_width / 2.f) / width;
float ymax = (center_y + box_height / 2.f) / height;
- float32x4_t vec_elements = { xmin, ymin, xmax, ymax };
- if(_info.clip())
+ float32x4_t vec_elements = {xmin, ymin, xmax, ymax};
+ if (_info.clip())
{
static const float32x4_t CONST_0 = vdupq_n_f32(0.f);
static const float32x4_t CONST_1 = vdupq_n_f32(1.f);
@@ -112,7 +123,7 @@ void NEPriorBoxLayerKernel::calculate_prior_boxes(const Window &window)
int img_width = _info.img_size().x;
int img_height = _info.img_size().y;
- if(img_width == 0 || img_height == 0)
+ if (img_width == 0 || img_height == 0)
{
img_width = _input2->info()->dimension(width_idx);
img_height = _input2->info()->dimension(height_idx);
@@ -120,7 +131,7 @@ void NEPriorBoxLayerKernel::calculate_prior_boxes(const Window &window)
float step_x = _info.steps()[0];
float step_y = _info.steps()[1];
- if(step_x == 0.f || step_y == 0.f)
+ if (step_x == 0.f || step_y == 0.f)
{
step_x = static_cast<float>(img_width) / layer_width;
step_y = static_cast<float>(img_height) / layer_height;
@@ -130,74 +141,80 @@ void NEPriorBoxLayerKernel::calculate_prior_boxes(const Window &window)
slice.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 2));
Iterator output(_output, slice);
- execute_window_loop(slice, [&](const Coordinates & id)
- {
- float center_x = 0;
- float center_y = 0;
- int idx = id.x() / (4 * num_priors);
- center_x = (static_cast<float>(idx % layer_width) + _info.offset()) * step_x;
- center_y = (static_cast<float>(idx / layer_width) + _info.offset()) * step_y;
-
- float box_width;
- float box_height;
- int offset = 0;
-
- auto out = reinterpret_cast<float *>(output.ptr());
- for(unsigned int i = 0; i < _info.min_sizes().size(); ++i)
+ execute_window_loop(
+ slice,
+ [&](const Coordinates &id)
{
- const float min_size = _info.min_sizes().at(i);
- box_width = min_size;
- box_height = min_size;
- store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
- offset += 4;
-
- if(!_info.max_sizes().empty())
+ float center_x = 0;
+ float center_y = 0;
+ int idx = id.x() / (4 * num_priors);
+ center_x = (static_cast<float>(idx % layer_width) + _info.offset()) * step_x;
+ center_y = (static_cast<float>(idx / layer_width) + _info.offset()) * step_y;
+
+ float box_width;
+ float box_height;
+ int offset = 0;
+
+ auto out = reinterpret_cast<float *>(output.ptr());
+ for (unsigned int i = 0; i < _info.min_sizes().size(); ++i)
{
- const float max_size = _info.max_sizes().at(i);
- box_width = std::sqrt(min_size * max_size);
- box_height = box_width;
-
+ const float min_size = _info.min_sizes().at(i);
+ box_width = min_size;
+ box_height = min_size;
store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
offset += 4;
- }
- // rest of priors
- for(auto ar : _info.aspect_ratios())
- {
- if(fabs(ar - 1.) < 1e-6)
+ if (!_info.max_sizes().empty())
{
- continue;
+ const float max_size = _info.max_sizes().at(i);
+ box_width = std::sqrt(min_size * max_size);
+ box_height = box_width;
+
+ store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
+ offset += 4;
}
- box_width = min_size * sqrt(ar);
- box_height = min_size / sqrt(ar);
+ // rest of priors
+ for (auto ar : _info.aspect_ratios())
+ {
+ if (fabs(ar - 1.) < 1e-6)
+ {
+ continue;
+ }
- store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
- offset += 4;
+ box_width = min_size * sqrt(ar);
+ box_height = min_size / sqrt(ar);
+
+ store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
+ offset += 4;
+ }
}
- }
- // set the variance
- out = reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(id.x(), 1)));
- float32x4_t var;
- if(_info.variances().size() == 1)
- {
- var = vdupq_n_f32(_info.variances().at(0));
- }
- else
- {
- const float32x4_t vars = { _info.variances().at(0), _info.variances().at(1), _info.variances().at(2), _info.variances().at(3) };
- var = vars;
- }
- for(int i = 0; i < num_priors; ++i)
- {
- vst1q_f32(out + 4 * i, var);
- }
- },
- output);
+ // set the variance
+ out = reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(id.x(), 1)));
+ float32x4_t var;
+ if (_info.variances().size() == 1)
+ {
+ var = vdupq_n_f32(_info.variances().at(0));
+ }
+ else
+ {
+ const float32x4_t vars = {_info.variances().at(0), _info.variances().at(1), _info.variances().at(2),
+ _info.variances().at(3)};
+ var = vars;
+ }
+ for (int i = 0; i < num_priors; ++i)
+ {
+ vst1q_f32(out + 4 * i, var);
+ }
+ },
+ output);
}
-void NEPriorBoxLayerKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output, const PriorBoxLayerInfo &info)
+void NEPriorBoxLayerKernel::configure(const ITensor *input1,
+ const ITensor *input2,
+ ITensor *output,
+ const PriorBoxLayerInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
@@ -215,7 +232,10 @@ void NEPriorBoxLayerKernel::configure(const ITensor *input1, const ITensor *inpu
INEKernel::configure(win);
}
-Status NEPriorBoxLayerKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info)
+Status NEPriorBoxLayerKernel::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const PriorBoxLayerInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, info));
@@ -231,4 +251,4 @@ void NEPriorBoxLayerKernel::run(const Window &window, const ThreadInfo &info)
// Run function
calculate_prior_boxes(window);
}
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
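
Note: store_coordinates(), reflowed above, normalizes each prior's centre and size to image coordinates, storing {xmin, ymin, xmax, ymax} per box and optionally clipping to [0, 1]. A scalar sketch of that computation; the Box struct and function name are assumptions for illustration.

#include <algorithm>

struct Box { float xmin, ymin, xmax, ymax; };

// Centre/size in pixels -> normalized corners, with optional clipping,
// mirroring the maths in NEPriorBoxLayerKernel::store_coordinates().
Box make_prior_box(float center_x, float center_y, float box_width, float box_height,
                   int img_width, int img_height, bool clip)
{
    Box b{(center_x - box_width / 2.f) / img_width, (center_y - box_height / 2.f) / img_height,
          (center_x + box_width / 2.f) / img_width, (center_y + box_height / 2.f) / img_height};
    if (clip)
    {
        b.xmin = std::min(std::max(b.xmin, 0.f), 1.f);
        b.ymin = std::min(std::max(b.ymin, 0.f), 1.f);
        b.xmax = std::min(std::max(b.xmax, 0.f), 1.f);
        b.ymax = std::min(std::max(b.ymax, 0.f), 1.f);
    }
    return b;
}
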
diff --git a/src/core/NEON/kernels/NEPriorBoxLayerKernel.h b/src/core/NEON/kernels/NEPriorBoxLayerKernel.h
index 430a47f9f8..460f80e085 100644
--- a/src/core/NEON/kernels/NEPriorBoxLayerKernel.h
+++ b/src/core/NEON/kernels/NEPriorBoxLayerKernel.h
@@ -67,7 +67,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info);
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const PriorBoxLayerInfo &info);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
@@ -84,7 +87,14 @@ private:
* @param[in] width Input width.
* @param[in] height Input height.
*/
- void store_coordinates(float *out, const int offset, const float center_x, const float center_y, const float box_width, const float box_height, const int width, const int height);
+ void store_coordinates(float *out,
+ const int offset,
+ const float center_x,
+ const float center_y,
+ const float box_width,
+ const float box_height,
+ const int width,
+ const int height);
/** Function to calculate prior boxes.
*
* @param[in] window Input region on which to execute the kernel.
diff --git a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp
index 46a0f625ce..8e1ed3a2a5 100644
--- a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp
+++ b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp
@@ -26,17 +26,17 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+
#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/NEFixedPoint.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/NESymm.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
#include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h"
+#include "src/core/NEON/NEFixedPoint.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/NESymm.h"
#include <map>
@@ -72,8 +72,8 @@ inline int64x2x2_t mul_add(const int32x4_t &a, const int32x4_t &b, const int32x4
const int64_t b_3 = vgetlane(b_high, 1);
int64x2x2_t result;
- const int64x2_t result_0{ a_0 * b_0, a_1 * b_1 };
- const int64x2_t result_1{ a_2 * b_2, a_3 * b_3 };
+ const int64x2_t result_0{a_0 * b_0, a_1 * b_1};
+ const int64x2_t result_1{a_2 * b_2, a_3 * b_3};
result.val[0] = vadd(vmovl(vgetlow(bias)), result_0);
result.val[1] = vadd(vmovl(vgethigh(bias)), result_1);
@@ -81,15 +81,17 @@ inline int64x2x2_t mul_add(const int32x4_t &a, const int32x4_t &b, const int32x4
}
} // namespace
-void NEQLSTMLayerNormalizationKernel::configure(const ITensor *input, ITensor *output, const ITensor *weight, const ITensor *bias)
+void NEQLSTMLayerNormalizationKernel::configure(const ITensor *input,
+ ITensor *output,
+ const ITensor *weight,
+ const ITensor *bias)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weight, bias, output);
ARM_COMPUTE_ERROR_ON(input == output);
ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), weight->info(), bias->info()));
- static const std::map<DataType, ComputeFuncType> fn_map =
- {
- { DataType::QSYMM16, std::mem_fn(&NEQLSTMLayerNormalizationKernel::compute_qsymm16) },
+ static const std::map<DataType, ComputeFuncType> fn_map = {
+ {DataType::QSYMM16, std::mem_fn(&NEQLSTMLayerNormalizationKernel::compute_qsymm16)},
};
_input = input;
@@ -102,10 +104,10 @@ void NEQLSTMLayerNormalizationKernel::configure(const ITensor *input, ITensor *o
_output->info()->set_quantization_info(compute_output_qinfo());
const UniformQuantizationInfo wq_info = _weight->info()->quantization_info().uniform();
- const Status s = quantization::calculate_quantized_multiplier(wq_info.scale, &_output_multiplier, &_output_shift);
+ const Status s = quantization::calculate_quantized_multiplier(wq_info.scale, &_output_multiplier, &_output_shift);
_output_shift *= -1;
- if(!bool(s))
+ if (!bool(s))
{
_output_multiplier = 0;
_output_shift = 0;
@@ -134,7 +136,10 @@ Window NEQLSTMLayerNormalizationKernel::configure_window(ITensor *target)
return window;
}
-Status NEQLSTMLayerNormalizationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias)
+Status NEQLSTMLayerNormalizationKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *weight,
+ const ITensorInfo *bias)
{
ARM_COMPUTE_UNUSED(output, bias, weight, input);
@@ -151,7 +156,7 @@ Status NEQLSTMLayerNormalizationKernel::validate(const ITensorInfo *input, const
ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().x() != weight->tensor_shape().x());
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(weight, bias);
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
@@ -182,11 +187,11 @@ inline std::pair<int64_t, int64_t> NEQLSTMLayerNormalizationKernel::sum_qsymm16(
using AccType = int64_t;
using InputDataType = int16_t;
- AccType sum{ 0 };
- AccType sum_sq{ 0 };
+ AccType sum{0};
+ AccType sum_sq{0};
int32_t x = _window_start_x;
- for(; x <= _window_end_x && _window_step_x <= (_window_end_x - x); x += _window_step_x)
+ for (; x <= _window_end_x && _window_step_x <= (_window_end_x - x); x += _window_step_x)
{
using namespace wrapper;
const int16x8_t val = vloadq(input_ptr + x);
@@ -216,7 +221,7 @@ inline std::pair<int64_t, int64_t> NEQLSTMLayerNormalizationKernel::sum_qsymm16(
#endif // __aarch64__
}
- for(; x < _window_end_x; ++x)
+ for (; x < _window_end_x; ++x)
{
const InputDataType val = input_ptr[x];
sum += static_cast<AccType>(val);
@@ -230,7 +235,9 @@ inline void NEQLSTMLayerNormalizationKernel::normalize_qasymm16(const int16_t *i
int16_t *output_ptr,
const int16_t *weight_ptr,
const int32_t *bias_ptr,
- int32_t mean, int32_t inv_std_mul, int32_t inv_std_shift)
+ int32_t mean,
+ int32_t inv_std_mul,
+ int32_t inv_std_shift)
{
using OutputDataType = int16_t;
@@ -238,7 +245,7 @@ inline void NEQLSTMLayerNormalizationKernel::normalize_qasymm16(const int16_t *i
const int32x4_t mean_vec = vdup_n(mean, wrapper::traits::vector_128_tag{});
int32_t x = _window_start_x;
- for(; x <= _window_end_x && _window_step_x <= (_window_end_x - x); x += _window_step_x)
+ for (; x <= _window_end_x && _window_step_x <= (_window_end_x - x); x += _window_step_x)
{
const int16x8_t val = vloadq(input_ptr + x);
int32x4x2_t shifted;
@@ -267,16 +274,18 @@ inline void NEQLSTMLayerNormalizationKernel::normalize_qasymm16(const int16_t *i
vstore(output_ptr + x + 4, vqmovn(out_val.val[1]));
}
- for(; x < _window_end_x; ++x)
+ for (; x < _window_end_x; ++x)
{
- const auto val = static_cast<int32_t>(input_ptr[x]);
- const int32_t shifted = (val << 10) - mean;
- const int32_t rescaled = quantization::multiply_by_quantized_multiplier(shifted, inv_std_mul, inv_std_shift);
- const int64_t weighted = rescaled * weight_ptr[x] + bias_ptr[x];
+ const auto val = static_cast<int32_t>(input_ptr[x]);
+ const int32_t shifted = (val << 10) - mean;
+ const int32_t rescaled = quantization::multiply_by_quantized_multiplier(shifted, inv_std_mul, inv_std_shift);
+ const int64_t weighted = rescaled * weight_ptr[x] + bias_ptr[x];
const auto reverse_shifted = static_cast<int32_t>((weighted + 512) >> 10);
- int32_t out_val = quantization::multiply_by_quantized_multiplier(reverse_shifted, _output_multiplier, _output_shift + 12);
- out_val = utility::clamp<decltype(out_val), OutputDataType>(out_val, std::numeric_limits<OutputDataType>::min());
- output_ptr[x] = static_cast<OutputDataType>(out_val);
+ int32_t out_val =
+ quantization::multiply_by_quantized_multiplier(reverse_shifted, _output_multiplier, _output_shift + 12);
+ out_val =
+ utility::clamp<decltype(out_val), OutputDataType>(out_val, std::numeric_limits<OutputDataType>::min());
+ output_ptr[x] = static_cast<OutputDataType>(out_val);
}
}
@@ -287,35 +296,38 @@ void NEQLSTMLayerNormalizationKernel::compute_qsymm16()
using BiasDataType = int32_t;
using AccType = int64_t;
- Iterator input_iterator{ _input, _inout_window };
- Iterator output_iterator{ _output, _inout_window };
- Iterator weight_iterator{ _weight, _weight_window };
- Iterator bias_iterator{ _bias, _weight_window };
+ Iterator input_iterator{_input, _inout_window};
+ Iterator output_iterator{_output, _inout_window};
+ Iterator weight_iterator{_weight, _weight_window};
+ Iterator bias_iterator{_bias, _weight_window};
const auto weight_ptr = reinterpret_cast<const InputDataType *>(weight_iterator.ptr());
const auto bias_ptr = reinterpret_cast<const BiasDataType *>(bias_iterator.ptr());
const uint32_t column_size = _input->info()->tensor_shape()[0];
- execute_window_loop(_inout_window, [ &, this](const Coordinates &)
- {
- const auto in_ptr = reinterpret_cast<const InputDataType *>(input_iterator.ptr());
- auto out_ptr = reinterpret_cast<OutputDataType *>(output_iterator.ptr());
-
- AccType sum{ 0 };
- AccType sum_sq{ 0 };
- std::tie(sum, sum_sq) = sum_qsymm16(in_ptr);
-
- AccType mean{ 0 };
- AccType variance{ 0 };
- std::tie(mean, variance) = compute_mean_variance(sum, sum_sq, column_size);
-
- int32_t stddev_invsqrt_mul{};
- int32_t stddev_invsqrt_shift{};
- quantization::get_invsqrt_quantized_multiplier_exp(static_cast<int32_t>(variance), -1, stddev_invsqrt_mul, stddev_invsqrt_shift);
-
- normalize_qasymm16(in_ptr, out_ptr, weight_ptr, bias_ptr, mean, stddev_invsqrt_mul, stddev_invsqrt_shift);
- },
- input_iterator, output_iterator);
+ execute_window_loop(
+ _inout_window,
+ [&, this](const Coordinates &)
+ {
+ const auto in_ptr = reinterpret_cast<const InputDataType *>(input_iterator.ptr());
+ auto out_ptr = reinterpret_cast<OutputDataType *>(output_iterator.ptr());
+
+ AccType sum{0};
+ AccType sum_sq{0};
+ std::tie(sum, sum_sq) = sum_qsymm16(in_ptr);
+
+ AccType mean{0};
+ AccType variance{0};
+ std::tie(mean, variance) = compute_mean_variance(sum, sum_sq, column_size);
+
+ int32_t stddev_invsqrt_mul{};
+ int32_t stddev_invsqrt_shift{};
+ quantization::get_invsqrt_quantized_multiplier_exp(static_cast<int32_t>(variance), -1, stddev_invsqrt_mul,
+ stddev_invsqrt_shift);
+
+ normalize_qasymm16(in_ptr, out_ptr, weight_ptr, bias_ptr, mean, stddev_invsqrt_mul, stddev_invsqrt_shift);
+ },
+ input_iterator, output_iterator);
}
} // namespace arm_compute
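
Note: the QSYMM16 path reflowed above is a fixed-point layer normalization: sum_qsymm16() accumulates the row sum and sum of squares, compute_mean_variance() derives the statistics, and normalize_qasymm16() rescales by the inverse standard deviation through quantized multipliers. A float reference of the same structure; the kernel's 10-bit shifts and rounding are folded away, and the epsilon is an assumption added for numerical stability.

#include <cmath>
#include <cstddef>

// Float reference for one row of layer normalization with learned weight/bias.
void layer_norm_row(const float *in, float *out, const float *weight, const float *bias, std::size_t n)
{
    float sum = 0.f, sum_sq = 0.f;
    for (std::size_t i = 0; i < n; ++i)
    {
        sum    += in[i];
        sum_sq += in[i] * in[i];
    }
    const float mean     = sum / n;
    const float variance = sum_sq / n - mean * mean; // E[x^2] - E[x]^2
    const float inv_std  = 1.f / std::sqrt(variance + 1e-8f); // epsilon assumed, not in the kernel
    for (std::size_t i = 0; i < n; ++i)
    {
        out[i] = (in[i] - mean) * inv_std * weight[i] + bias[i];
    }
}
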
diff --git a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h
index a3ff6e988f..af5b6a0315 100644
--- a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h
+++ b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NEQLSTMLAYERNORMALIZATIONKERNEL_H
#include "src/core/NEON/INEKernel.h"
+
#include <functional>
namespace arm_compute
@@ -69,34 +70,26 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
private:
// constants
- static constexpr uint32_t max_input_dimension{ 2 }; /**< The maximum input dimension supported */
- static constexpr uint32_t max_weight_dimension{ 1 }; /**< The maximum weight dimension supported */
- static constexpr uint32_t max_bias_dimension{ 1 }; /**< The maximum bias dimension supported */
- static constexpr uint32_t vector_size_byte{ 16 }; /**< Computation vector size in byte */
+ static constexpr uint32_t max_input_dimension{2}; /**< The maximum input dimension supported */
+ static constexpr uint32_t max_weight_dimension{1}; /**< The maximum weight dimension supported */
+ static constexpr uint32_t max_bias_dimension{1}; /**< The maximum bias dimension supported */
+ static constexpr uint32_t vector_size_byte{16}; /**< Computation vector size in byte */
using ComputeFuncType = std::function<void(NEQLSTMLayerNormalizationKernel &)>;
ComputeFuncType _fn{}; /**< Function pointer to computation function */
- const ITensor *_input
- {
- nullptr
- }; /**< Input tensor */
- const ITensor *_weight
- {
- nullptr
- }; /**< Weight tensor */
- const ITensor *_bias
- {
- nullptr
- }; /**< Bias tensor */
- ITensor *_output{ nullptr }; /**< Output tensor */
+ const ITensor *_input{nullptr}; /**< Input tensor */
+ const ITensor *_weight{nullptr}; /**< Weight tensor */
+ const ITensor *_bias{nullptr}; /**< Bias tensor */
+ ITensor *_output{nullptr}; /**< Output tensor */
int32_t _output_multiplier{}; /**< Multiplier for output values */
int32_t _output_shift{}; /**< Shift value for output values */
@@ -138,7 +131,9 @@ private:
int16_t *output_ptr,
const int16_t *weight_ptr,
const int32_t *bias_ptr,
- int32_t mean, int32_t inv_std_mul, int32_t inv_std_shift);
+ int32_t mean,
+ int32_t inv_std_mul,
+ int32_t inv_std_shift);
/** Function to compute output quantization information */
QuantizationInfo compute_output_qinfo();
};
diff --git a/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp b/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp
index 802aebb526..486cd6d331 100644
--- a/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp
@@ -26,11 +26,12 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/misc/Utility.h"
-#include "src/core/CPP/Validate.h"
+#include "arm_compute/core/Window.h"
+
#include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/cpu/kernels/roialign/list.h"
@@ -49,7 +50,12 @@ struct ROIAlignSelectorData
};
using ROIAlignSelctorPtr = std::add_pointer<bool(const ROIAlignSelectorData &data)>::type;
-using ROIAlignUKernelPtr = std::add_pointer<void(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info)>::type;
+using ROIAlignUKernelPtr = std::add_pointer<void(const ITensor *input,
+ ITensor *output,
+ const ITensor *rois,
+ ROIPoolingLayerInfo pool_info,
+ const Window &window,
+ const ThreadInfo &info)>::type;
struct ROIAlignKernel
{
@@ -58,31 +64,18 @@ struct ROIAlignKernel
ROIAlignUKernelPtr ukernel;
};
-static const ROIAlignKernel available_kernels[] =
-{
- {
- "fp32_neon_roialign",
- [](const ROIAlignSelectorData & data) { return data.dt == DataType::F32; },
- REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_roialign)
- },
+static const ROIAlignKernel available_kernels[] = {
+ {"fp32_neon_roialign", [](const ROIAlignSelectorData &data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_roialign)},
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- {
- "fp16_neon_roialign",
- [](const ROIAlignSelectorData & data) { return data.dt == DataType::F16; },
- REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_roialign)
- },
+ {"fp16_neon_roialign", [](const ROIAlignSelectorData &data) { return data.dt == DataType::F16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_roialign)},
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#if defined(ARM_COMPUTE_ENABLE_NEON)
- {
- "qu8_neon_roialign",
- [](const ROIAlignSelectorData & data) { return data.dt == DataType::QASYMM8; },
- REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qu8_roialign)
- },
- {
- "qs8_neon_roialign",
- [](const ROIAlignSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; },
- REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qs8_roialign)
- },
+ {"qu8_neon_roialign", [](const ROIAlignSelectorData &data) { return data.dt == DataType::QASYMM8; },
+ REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qu8_roialign)},
+ {"qs8_neon_roialign", [](const ROIAlignSelectorData &data) { return data.dt == DataType::QASYMM8_SIGNED; },
+ REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qs8_roialign)},
#endif //defined(ARM_COMPUTE_ENABLE_NEON)
};
@@ -94,9 +87,9 @@ static const ROIAlignKernel available_kernels[] =
*/
const ROIAlignKernel *get_implementation(const ROIAlignSelectorData &data)
{
- for(const auto &uk : available_kernels)
+ for (const auto &uk : available_kernels)
{
- if(uk.is_selected(data))
+ if (uk.is_selected(data))
{
return &uk;
}
@@ -104,24 +97,29 @@ const ROIAlignKernel *get_implementation(const ROIAlignSelectorData &data)
return nullptr;
}
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, rois, output);
ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(0) != 5);
ARM_COMPUTE_RETURN_ERROR_ON(rois->num_dimensions() > 2);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F32, DataType::F16);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F32, DataType::F16);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC, DataLayout::NCHW);
ARM_COMPUTE_RETURN_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0));
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(compute_roi_align_shape(*input, *rois, pool_info), output->tensor_shape());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(compute_roi_align_shape(*input, *rois, pool_info),
+ output->tensor_shape());
}
- if(input->data_type() == DataType::QASYMM8 || input->data_type() == DataType::QASYMM8_SIGNED)
+ if (input->data_type() == DataType::QASYMM8 || input->data_type() == DataType::QASYMM8_SIGNED)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rois, 1, DataType::QASYMM16);
@@ -143,13 +141,17 @@ NEROIAlignLayerKernel::NEROIAlignLayerKernel()
{
}
-void NEROIAlignLayerKernel::configure(const ITensor *input, const ITensor *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info)
+void NEROIAlignLayerKernel::configure(const ITensor *input,
+ const ITensor *rois,
+ ITensor *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, rois);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), rois->info(), output->info(), pool_info));
// Output auto inizialitation if not yet initialized
const TensorShape output_shape = compute_roi_align_shape(*input->info(), *rois->info(), pool_info);
- auto_init_if_empty((*output->info()), output_shape, 1, input->info()->data_type(), input->info()->quantization_info());
+ auto_init_if_empty((*output->info()), output_shape, 1, input->info()->data_type(),
+ input->info()->quantization_info());
output->info()->set_data_layout(input->info()->data_layout());
// Configure kernel window
@@ -167,7 +169,10 @@ void NEROIAlignLayerKernel::configure(const ITensor *input, const ITensor *rois,
INEKernel::configure(window);
}
-Status NEROIAlignLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status NEROIAlignLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, rois, output, pool_info));
return Status{};
@@ -176,9 +181,9 @@ Status NEROIAlignLayerKernel::validate(const ITensorInfo *input, const ITensorIn
void NEROIAlignLayerKernel::run(const Window &window, const ThreadInfo &info)
{
const DataLayout data_layout = _input->info()->data_layout();
- if(data_layout == DataLayout::NCHW || data_layout == DataLayout::NHWC)
+ if (data_layout == DataLayout::NCHW || data_layout == DataLayout::NHWC)
{
- const auto *uk = get_implementation(ROIAlignSelectorData{ _input->info()->data_type() });
+ const auto *uk = get_implementation(ROIAlignSelectorData{_input->info()->data_type()});
ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
uk->ukernel(_input, _output, _rois, _pool_info, window, info);
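
Note: the available_kernels table plus get_implementation(), reflowed above and repeated in several files in this patch, is a micro-kernel dispatch idiom: an array of {name, predicate, function} entries scanned in order, first match wins. A minimal standalone sketch of the pattern; the enum, struct names and the F32 stub are assumptions for illustration.

#include <cstdio>

enum class DT { F32, F16 };
struct SelectorData { DT dt; };
using Predicate = bool (*)(const SelectorData &);
using Kernel    = void (*)();

struct Entry { const char *name; Predicate is_selected; Kernel ukernel; };

void f32_stub() { std::puts("running f32 kernel"); }

static const Entry available_kernels[] = {
    {"fp32_stub", [](const SelectorData &d) { return d.dt == DT::F32; }, f32_stub},
};

// First entry whose predicate accepts the data wins; nullptr if none match.
const Entry *get_impl(const SelectorData &data)
{
    for (const auto &uk : available_kernels)
    {
        if (uk.is_selected(data))
        {
            return &uk;
        }
    }
    return nullptr;
}
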
diff --git a/src/core/NEON/kernels/NEROIAlignLayerKernel.h b/src/core/NEON/kernels/NEROIAlignLayerKernel.h
index 48a3de7285..9cc538b429 100644
--- a/src/core/NEON/kernels/NEROIAlignLayerKernel.h
+++ b/src/core/NEON/kernels/NEROIAlignLayerKernel.h
@@ -83,7 +83,10 @@ public:
*
* @return a Status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
diff --git a/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp
index 400e8291d6..1a3810fb56 100644
--- a/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp
@@ -22,9 +22,11 @@
* SOFTWARE.
*/
#include "src/core/NEON/kernels/NEROIPoolingLayerKernel.h"
+
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+
#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -36,7 +38,10 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ const ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, rois);
@@ -47,10 +52,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, con
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F32, DataType::QASYMM8);
ARM_COMPUTE_RETURN_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0));
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) != pool_info.pooled_width()) || (output->dimension(1) != pool_info.pooled_height()));
+ ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) != pool_info.pooled_width()) ||
+ (output->dimension(1) != pool_info.pooled_height()));
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) != output->dimension(2));
ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(1) != output->dimension(3));
}
@@ -73,19 +79,28 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, con
* @param[in] roi_indx Index of image of coordinate in output Tensor to store value
*/
template <typename T>
-void template_eval(const ITensor *input, const ITensor *output, int region_start_x, int region_start_y,
- int region_end_x, int region_end_y, int fm, int px, int py, int roi_batch, int roi_indx)
+void template_eval(const ITensor *input,
+ const ITensor *output,
+ int region_start_x,
+ int region_start_y,
+ int region_end_x,
+ int region_end_y,
+ int fm,
+ int px,
+ int py,
+ int roi_batch,
+ int roi_indx)
{
- if((region_end_x <= region_start_x) || (region_end_y <= region_start_y))
+ if ((region_end_x <= region_start_x) || (region_end_y <= region_start_y))
{
*reinterpret_cast<T *>(output->ptr_to_element(Coordinates(px, py, fm, roi_indx))) = 0;
}
else
{
T curr_max = std::numeric_limits<T>::lowest(); // Min value of typename T
- for(int j = region_start_y; j < region_end_y; ++j)
+ for (int j = region_start_y; j < region_end_y; ++j)
{
- for(int i = region_start_x; i < region_end_x; ++i)
+ for (int i = region_start_x; i < region_end_x; ++i)
{
const auto val = *reinterpret_cast<const T *>(input->ptr_to_element(Coordinates(i, j, fm, roi_batch)));
curr_max = std::max(val, curr_max);
@@ -93,11 +108,13 @@ void template_eval(const ITensor *input, const ITensor *output, int region_start
}
// if quantized datatype, requantize then store in output tensor
- if(is_data_type_quantized(input->info()->data_type()))
+ if (is_data_type_quantized(input->info()->data_type()))
{
// covert qasymm to new output quantization scale and offset
- UniformQuantizationInfo uqinfo = compute_requantization_scale_offset(input->info()->quantization_info().uniform(), output->info()->quantization_info().uniform());
- *reinterpret_cast<T *>(output->ptr_to_element(Coordinates(px, py, fm, roi_indx))) = quantize_qasymm8(curr_max, uqinfo);
+ UniformQuantizationInfo uqinfo = compute_requantization_scale_offset(
+ input->info()->quantization_info().uniform(), output->info()->quantization_info().uniform());
+ *reinterpret_cast<T *>(output->ptr_to_element(Coordinates(px, py, fm, roi_indx))) =
+ quantize_qasymm8(curr_max, uqinfo);
}
else
{
@@ -112,13 +129,19 @@ NEROIPoolingLayerKernel::NEROIPoolingLayerKernel()
{
}
-Status NEROIPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status NEROIPoolingLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ const ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, rois, output, pool_info));
return Status{};
}
-void NEROIPoolingLayerKernel::configure(const ITensor *input, const ITensor *rois, const ITensor *output, const ROIPoolingLayerInfo &pool_info)
+void NEROIPoolingLayerKernel::configure(const ITensor *input,
+ const ITensor *rois,
+ const ITensor *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, rois);
@@ -126,12 +149,15 @@ void NEROIPoolingLayerKernel::configure(const ITensor *input, const ITensor *roi
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), rois->info(), output->info(), pool_info));
// Output auto initialization if not yet initialized
- TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2), rois->info()->dimension(1));
+ TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2),
+ rois->info()->dimension(1));
- auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), output->info()->quantization_info());
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(),
+ output->info()->quantization_info());
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pool_info.pooled_width()) || (output->info()->dimension(1) != pool_info.pooled_height()));
+ ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pool_info.pooled_width()) ||
+ (output->info()->dimension(1) != pool_info.pooled_height()));
// Set instance variables
_input = input;
@@ -167,7 +193,7 @@ void NEROIPoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
const auto *rois_ptr = reinterpret_cast<const uint16_t *>(_rois->buffer());
const auto data_type = _input->info()->data_type();
- for(int roi_indx = roi_list_start; roi_indx < roi_list_end; ++roi_indx)
+ for (int roi_indx = roi_list_start; roi_indx < roi_list_end; ++roi_indx)
{
const unsigned int roi_batch = rois_ptr[values_per_roi * roi_indx];
const auto x1 = rois_ptr[values_per_roi * roi_indx + 1];
@@ -182,30 +208,35 @@ void NEROIPoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
const int roi_height = std::max(support::cpp11::round((y2 - y1) * spatial_scale), 1.f);
// Iterate through all feature maps
- for(int fm = 0; fm < fms; ++fm)
+ for (int fm = 0; fm < fms; ++fm)
{
// Iterate through all output pixels
- for(int py = 0; py < pooled_h; ++py)
+ for (int py = 0; py < pooled_h; ++py)
{
- for(int px = 0; px < pooled_w; ++px)
+ for (int px = 0; px < pooled_w; ++px)
{
auto region_start_x = static_cast<int>(std::floor((static_cast<float>(px) / pooled_w) * roi_width));
- auto region_end_x = static_cast<int>(std::floor((static_cast<float>(px + 1) / pooled_w) * roi_width));
- auto region_start_y = static_cast<int>(std::floor((static_cast<float>(py) / pooled_h) * roi_height));
- auto region_end_y = static_cast<int>(std::floor((static_cast<float>(py + 1) / pooled_h) * roi_height));
+ auto region_end_x =
+ static_cast<int>(std::floor((static_cast<float>(px + 1) / pooled_w) * roi_width));
+ auto region_start_y =
+ static_cast<int>(std::floor((static_cast<float>(py) / pooled_h) * roi_height));
+ auto region_end_y =
+ static_cast<int>(std::floor((static_cast<float>(py + 1) / pooled_h) * roi_height));
region_start_x = std::min(std::max(region_start_x + roi_anchor_x, 0), width);
region_end_x = std::min(std::max(region_end_x + roi_anchor_x, 0), width);
region_start_y = std::min(std::max(region_start_y + roi_anchor_y, 0), height);
region_end_y = std::min(std::max(region_end_y + roi_anchor_y, 0), height);
- switch(data_type)
+ switch (data_type)
{
case DataType::F32:
- template_eval<float>(_input, _output, region_start_x, region_start_y, region_end_x, region_end_y, fm, px, py, roi_batch, roi_indx);
+ template_eval<float>(_input, _output, region_start_x, region_start_y, region_end_x,
+ region_end_y, fm, px, py, roi_batch, roi_indx);
break;
case DataType::QASYMM8:
- template_eval<qasymm8_t>(_input, _output, region_start_x, region_start_y, region_end_x, region_end_y, fm, px, py, roi_batch, roi_indx);
+ template_eval<qasymm8_t>(_input, _output, region_start_x, region_start_y, region_end_x,
+ region_end_y, fm, px, py, roi_batch, roi_indx);
break;
default:
ARM_COMPUTE_ERROR("DataType not Supported");
@@ -216,4 +247,4 @@ void NEROIPoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
}
}
}
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
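
Note: the run() loop reflowed above divides each ROI into a pooled_w x pooled_h grid and max-pools each cell, deriving the cell bounds with floor() and clamping them into the feature map. A scalar sketch of the bounds computation for one cell; the Region struct and parameter names are illustrative.

#include <algorithm>
#include <cmath>

struct Region { int start_x, end_x, start_y, end_y; };

// Cell (px, py) of a pooled_w x pooled_h grid over an ROI anchored at
// (anchor_x, anchor_y), clamped to the [0, width] x [0, height] feature map.
Region roi_pool_region(int px, int py, int pooled_w, int pooled_h, int roi_width, int roi_height,
                       int anchor_x, int anchor_y, int width, int height)
{
    Region r;
    r.start_x = static_cast<int>(std::floor((static_cast<float>(px) / pooled_w) * roi_width));
    r.end_x   = static_cast<int>(std::floor((static_cast<float>(px + 1) / pooled_w) * roi_width));
    r.start_y = static_cast<int>(std::floor((static_cast<float>(py) / pooled_h) * roi_height));
    r.end_y   = static_cast<int>(std::floor((static_cast<float>(py + 1) / pooled_h) * roi_height));
    r.start_x = std::min(std::max(r.start_x + anchor_x, 0), width);
    r.end_x   = std::min(std::max(r.end_x + anchor_x, 0), width);
    r.start_y = std::min(std::max(r.start_y + anchor_y, 0), height);
    r.end_y   = std::min(std::max(r.end_y + anchor_y, 0), height);
    return r; // an empty region (end <= start) is stored as 0 by template_eval()
}
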
diff --git a/src/core/NEON/kernels/NEROIPoolingLayerKernel.h b/src/core/NEON/kernels/NEROIPoolingLayerKernel.h
index e7a7e90eef..81f6006ea2 100644
--- a/src/core/NEON/kernels/NEROIPoolingLayerKernel.h
+++ b/src/core/NEON/kernels/NEROIPoolingLayerKernel.h
@@ -63,7 +63,8 @@ public:
* @note The z dimensions of @p output tensor and @p input tensor must be the same.
* @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois tensor.
*/
- void configure(const ITensor *input, const ITensor *rois, const ITensor *output, const ROIPoolingLayerInfo &pool_info);
+ void
+ configure(const ITensor *input, const ITensor *rois, const ITensor *output, const ROIPoolingLayerInfo &pool_info);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
@@ -82,7 +83,10 @@ public:
*
* @return a Status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ const ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info);
private:
const ITensor *_input;
diff --git a/src/core/NEON/kernels/NERangeKernel.cpp b/src/core/NEON/kernels/NERangeKernel.cpp
index ec63a35de9..87b7b76b72 100644
--- a/src/core/NEON/kernels/NERangeKernel.cpp
+++ b/src/core/NEON/kernels/NERangeKernel.cpp
@@ -29,11 +29,12 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
-#include "src/core/NEON/NEAsymm.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
#include "src/core/common/Registrars.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include "src/cpu/kernels/range/list.h"
namespace arm_compute
@@ -55,48 +56,23 @@ struct RangeUKernel
RangeUKernelPtr ukernel;
};
-static const RangeUKernel available_kernels[] =
-{
- {
- "fp16_neon_range",
- [](const RangeSelectorData & data) { return data.dt == DataType::F16; },
- REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_range_function)
- },
- {
- "f32_neon_range",
- [](const RangeSelectorData & data) { return data.dt == DataType::F32; },
- REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_range_function)
- },
- {
- "u8_neon_range",
- [](const RangeSelectorData & data) { return data.dt == DataType::U8; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::u8_neon_range_function)
- },
- {
- "u16_neon_range",
- [](const RangeSelectorData & data) { return data.dt == DataType::U16; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::u16_neon_range_function)
- },
- {
- "u32_neon_range",
- [](const RangeSelectorData & data) { return data.dt == DataType::U32; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::u32_neon_range_function)
- },
- {
- "s8_neon_range",
- [](const RangeSelectorData & data) { return data.dt == DataType::S8; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::s8_neon_range_function)
- },
- {
- "s16_neon_range",
- [](const RangeSelectorData & data) { return data.dt == DataType::S16; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::s16_neon_range_function)
- },
- {
- "s32_neon_range",
- [](const RangeSelectorData & data) { return data.dt == DataType::S32; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::s32_neon_range_function)
- },
+static const RangeUKernel available_kernels[] = {
+ {"fp16_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::F16; },
+ REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_range_function)},
+ {"f32_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::F32; },
+ REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_range_function)},
+ {"u8_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::U8; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::u8_neon_range_function)},
+ {"u16_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::U16; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::u16_neon_range_function)},
+ {"u32_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::U32; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::u32_neon_range_function)},
+ {"s8_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::S8; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::s8_neon_range_function)},
+ {"s16_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::S16; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::s16_neon_range_function)},
+ {"s32_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::S32; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::s32_neon_range_function)},
};
/** Micro-kernel selector
@@ -107,9 +83,9 @@ static const RangeUKernel available_kernels[] =
*/
const RangeUKernel *get_implementation(const RangeSelectorData &data)
{
- for(const auto &uk : available_kernels)
+ for (const auto &uk : available_kernels)
{
- if(uk.is_selected(data))
+ if (uk.is_selected(data))
{
return &uk;
}
@@ -119,28 +95,31 @@ const RangeUKernel *get_implementation(const RangeSelectorData &data)
Status validate_arguments(const ITensorInfo &output, const float start, const float end, const float step)
{
- const auto *uk = get_implementation(RangeSelectorData{ output.data_type() });
+ const auto *uk = get_implementation(RangeSelectorData{output.data_type()});
ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
ARM_COMPUTE_RETURN_ERROR_ON_MSG((start == end), "start of the requested sequence must not be equal to the end");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(((start < end) && (step <= 0)), "step must be greater than 0 when start < end");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(((start > end) && (step >= 0)), "step must be less than 0 when start > end");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(start, output.data_type(), output.quantization_info()), "start value is outside the range of the data type");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(end, output.data_type(), output.quantization_info()), "end value is outside the range of the data type");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(step, output.data_type(), output.quantization_info()), "step value is outside the range of the data type");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(start, output.data_type(), output.quantization_info()),
+ "start value is outside the range of the data type");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(end, output.data_type(), output.quantization_info()),
+ "end value is outside the range of the data type");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(step, output.data_type(), output.quantization_info()),
+ "step value is outside the range of the data type");
ARM_COMPUTE_RETURN_ERROR_ON_MSG((start == end), "start of the requested sequence must not be equal to the end");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(output.num_dimensions() != 1, "Output has to be a 1-D tensor");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output.tensor_shape().total_size() < num_of_elements_in_range(start, end, step), "Output tensor size is incorrect");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output.tensor_shape().total_size() < num_of_elements_in_range(start, end, step),
+ "Output tensor size is incorrect");
return Status{};
}
} // namespace
-NERangeKernel::NERangeKernel()
- : _start(0), _end(1), _step(1), _output(nullptr)
+NERangeKernel::NERangeKernel() : _start(0), _end(1), _step(1), _output(nullptr)
{
}
@@ -151,7 +130,8 @@ void NERangeKernel::configure(ITensor *output, float start, float end, float ste
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*(output->info()), start, end, step));
// Auto initialize output if not initialized
- auto_init_if_empty(*output->info(), TensorShape(num_of_elements_in_range(start, end, step)), 1, output->info()->data_type(), output->info()->quantization_info());
+ auto_init_if_empty(*output->info(), TensorShape(num_of_elements_in_range(start, end, step)), 1,
+ output->info()->data_type(), output->info()->quantization_info());
// Configure kernel window
Window win = calculate_max_window(*output->info(), Steps());
@@ -178,7 +158,7 @@ void NERangeKernel::run(const Window &window, const ThreadInfo &info)
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- const auto *uk = get_implementation(RangeSelectorData{ _output->info()->data_type() });
+ const auto *uk = get_implementation(RangeSelectorData{_output->info()->data_type()});
uk->ukernel(_output, _start, _step, window);
}
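
A note for readers skimming the reflowed kernel table above: the selector pattern is unchanged — an ordered array of {name, predicate, function pointer} entries that get_implementation() scans front to back, first match wins. A minimal standalone sketch of the same idea (type and function names here are illustrative stand-ins, not the library's):

#include <cstdio>

enum class DataType { F32, U8 };
struct SelectorData { DataType dt; };

struct MicroKernel
{
    const char *name;
    bool (*is_selected)(const SelectorData &);
    void (*ukernel)();
};

static void run_f32() { std::puts("f32 path"); }
static void run_u8()  { std::puts("u8 path");  }

// Captureless lambdas convert to plain function pointers, as in the table above.
static const MicroKernel available[] = {
    {"f32_range", [](const SelectorData &d) { return d.dt == DataType::F32; }, run_f32},
    {"u8_range",  [](const SelectorData &d) { return d.dt == DataType::U8;  }, run_u8},
};

static const MicroKernel *get_impl(const SelectorData &d)
{
    for (const auto &uk : available)
    {
        if (uk.is_selected(d))
        {
            return &uk; // first match wins, so order entries most-specific first
        }
    }
    return nullptr;
}

int main()
{
    if (const auto *uk = get_impl({DataType::U8}))
    {
        uk->ukernel(); // prints "u8 path"
    }
}
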
diff --git a/src/core/NEON/kernels/NERangeKernel.h b/src/core/NEON/kernels/NERangeKernel.h
index 90560995e6..fa555c2c2e 100644
--- a/src/core/NEON/kernels/NERangeKernel.h
+++ b/src/core/NEON/kernels/NERangeKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NERANGEKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/NEON/INEKernel.h"
namespace arm_compute
diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
index 19955af493..455d604b3b 100644
--- a/src/core/NEON/kernels/NEReductionOperationKernel.cpp
+++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
@@ -28,16 +28,17 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/INEKernel.h"
-#include "src/core/NEON/NEMath.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/INEKernel.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include "support/SaturateCast.h"
-#include "src/core/NEON/wrapper/wrapper.h"
#include <arm_neon.h>
namespace arm_compute
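
The include churn in the hunk above is a sort, not a semantic change: headers are grouped (arm_compute/, then src/ and support/, then system headers) and ordered case-insensitively within a group, which is why Validate.h now follows utils/misc/ShapeCalculator.h and helpers/ precedes NEON/. The concrete .clang-format file is stated to be outside this delivery, so the exact rules are an inference from the result; the observable effect, annotated:

#include "arm_compute/core/utils/misc/ShapeCalculator.h" // 'u' < 'V' case-insensitively,
#include "arm_compute/core/Validate.h"                   // so utils/ sorts first

#include "src/core/helpers/WindowHelpers.h" // 'h' < 'N' case-insensitively,
#include "src/core/NEON/NEMath.h"           // so helpers/ precedes NEON/
#include "support/SaturateCast.h"           // "src" < "support" in the same group

#include <arm_neon.h> // system headers last
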
@@ -48,7 +49,7 @@ namespace
template <typename T>
void combine_and_store(int16x8_t t1, int16x8_t t2, Iterator &output, int offset = 0)
{
- if(std::is_same<T, uint8_t>::value)
+ if (std::is_same<T, uint8_t>::value)
{
auto res = wrapper::vcombine(wrapper::vqmovun(t1), wrapper::vqmovun(t2));
wrapper::vstore(output.ptr() + offset, res);
@@ -63,8 +64,8 @@ void combine_and_store(int16x8_t t1, int16x8_t t2, Iterator &output, int offset
template <typename T>
uint32x4x4_t calculate_index(uint32_t idx, T a, T b, uint32x4x4_t c, ReductionOperation op, int axis)
{
- uint32x4_t mask{ 0 };
- if(op == ReductionOperation::ARG_IDX_MIN)
+ uint32x4_t mask{0};
+ if (op == ReductionOperation::ARG_IDX_MIN)
{
mask = wrapper::vcgt(b, a);
}
@@ -73,12 +74,12 @@ uint32x4x4_t calculate_index(uint32_t idx, T a, T b, uint32x4x4_t c, ReductionOp
mask = wrapper::vclt(b, a);
}
- uint32x4_t vec_idx = { idx, idx + 1, idx + 2, idx + 3 };
- if(axis != 0)
+ uint32x4_t vec_idx = {idx, idx + 1, idx + 2, idx + 3};
+ if (axis != 0)
{
vec_idx = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
}
- uint32x4x4_t res = { { wrapper::vbsl(mask, vec_idx, c.val[0]), 0, 0, 0 } };
+ uint32x4x4_t res = {{wrapper::vbsl(mask, vec_idx, c.val[0]), 0, 0, 0}};
return res;
}
@@ -86,9 +87,9 @@ uint32x4x4_t calculate_index(uint32_t idx, T a, T b, uint32x4x4_t c, ReductionOp
template <typename T>
uint32x4x4_t calculate_index_quantized(uint32_t idx, T a, T b, uint32x4x4_t c, ReductionOperation op, int axis)
{
- uint32x4x4_t mask{ { 0 } };
- uint8x16_t mask_u8{ 0 };
- if(op == ReductionOperation::ARG_IDX_MIN)
+ uint32x4x4_t mask{{0}};
+ uint8x16_t mask_u8{0};
+ if (op == ReductionOperation::ARG_IDX_MIN)
{
mask_u8 = wrapper::vcgt(b, a);
}
@@ -96,44 +97,43 @@ uint32x4x4_t calculate_index_quantized(uint32_t idx, T a, T b, uint32x4x4_t c, R
{
mask_u8 = wrapper::vclt(b, a);
}
- auto wide_u16_1 = wrapper::vorr(vshll_n_u8(wrapper::vgetlow(mask_u8), 8), wrapper::vmovl(wrapper::vgetlow(mask_u8)));
- auto wide_u16_2 = wrapper::vorr(vshll_n_u8(wrapper::vgethigh(mask_u8), 8), wrapper::vmovl(wrapper::vgethigh(mask_u8)));
- mask.val[0] = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_1), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_1)));
- mask.val[1] = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_1), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_1)));
- mask.val[2] = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_2), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_2)));
- mask.val[3] = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_2), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_2)));
-
- uint32x4x4_t vec_idx = { { { idx + 0, idx + 1, idx + 2, idx + 3 },
- { idx + 4, idx + 5, idx + 6, idx + 7 },
- { idx + 8, idx + 9, idx + 10, idx + 11 },
- { idx + 12, idx + 13, idx + 14, idx + 15 }
- }
- };
- if(axis != 0)
+ auto wide_u16_1 =
+ wrapper::vorr(vshll_n_u8(wrapper::vgetlow(mask_u8), 8), wrapper::vmovl(wrapper::vgetlow(mask_u8)));
+ auto wide_u16_2 =
+ wrapper::vorr(vshll_n_u8(wrapper::vgethigh(mask_u8), 8), wrapper::vmovl(wrapper::vgethigh(mask_u8)));
+ mask.val[0] =
+ wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_1), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_1)));
+ mask.val[1] =
+ wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_1), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_1)));
+ mask.val[2] =
+ wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_2), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_2)));
+ mask.val[3] =
+ wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_2), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_2)));
+
+ uint32x4x4_t vec_idx = {{{idx + 0, idx + 1, idx + 2, idx + 3},
+ {idx + 4, idx + 5, idx + 6, idx + 7},
+ {idx + 8, idx + 9, idx + 10, idx + 11},
+ {idx + 12, idx + 13, idx + 14, idx + 15}}};
+ if (axis != 0)
{
vec_idx.val[0] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
vec_idx.val[1] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
vec_idx.val[2] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
vec_idx.val[3] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
}
- uint32x4x4_t res =
- {
- {
- vbslq_u32(mask.val[0], vec_idx.val[0], c.val[0]),
- vbslq_u32(mask.val[1], vec_idx.val[1], c.val[1]),
- vbslq_u32(mask.val[2], vec_idx.val[2], c.val[2]),
- vbslq_u32(mask.val[3], vec_idx.val[3], c.val[3])
- }
- };
+ uint32x4x4_t res = {
+ {vbslq_u32(mask.val[0], vec_idx.val[0], c.val[0]), vbslq_u32(mask.val[1], vec_idx.val[1], c.val[1]),
+ vbslq_u32(mask.val[2], vec_idx.val[2], c.val[2]), vbslq_u32(mask.val[3], vec_idx.val[3], c.val[3])}};
return res;
}
// Helper function to calculate the minimum value of the input vector. All the elements in the output vector contain the min value.
template <typename T>
-inline typename std::enable_if < std::is_same<T, float32x4_t>::value || std::is_same<T, int32x4_t>::value,
- typename std::conditional<std::is_same<T, float32x4_t>::value, float32x2_t, int32x2_t>::type >::type
- calculate_min(T in)
+inline typename std::enable_if<
+ std::is_same<T, float32x4_t>::value || std::is_same<T, int32x4_t>::value,
+ typename std::conditional<std::is_same<T, float32x4_t>::value, float32x2_t, int32x2_t>::type>::type
+calculate_min(T in)
{
auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in));
return wrapper::vpmin(pmin, pmin);
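
The vshll/vmovl pairs in calculate_index_quantized above implement mask widening: a byte lane holding 0xFF becomes 0xFF00 under shift-left-long by 8, OR-ing in the zero-extended copy (0x00FF) yields 0xFFFF, and repeating the step at 16-bit width yields 0xFFFFFFFF — turning an 8-bit comparison mask into full 32-bit bit-select masks. A minimal sketch with raw NEON intrinsics (assumes an AArch64/NEON target; only the low half is shown):

#include <arm_neon.h>

// Widen the low 4 lanes of an all-ones/all-zeros byte mask to 32-bit lanes.
static inline uint32x4_t widen_mask_low(uint8x8_t mask_u8)
{
    // 0xFF -> 0xFFFF: (0xFF << 8) | zero-extend(0xFF) == 0xFF00 | 0x00FF
    const uint16x8_t wide_u16 = vorrq_u16(vshll_n_u8(mask_u8, 8), vmovl_u8(mask_u8));
    // 0xFFFF -> 0xFFFFFFFF: the same trick one level wider, low half only
    return vorrq_u32(vshll_n_u16(vget_low_u16(wide_u16), 16), vmovl_u16(vget_low_u16(wide_u16)));
}
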
@@ -141,9 +141,10 @@ inline typename std::enable_if < std::is_same<T, float32x4_t>::value || std::is_
// Helper function to calculate the minimum value of the input vector. All the elements in the output vector contain the min value.
template <typename T>
-inline typename std::enable_if < std::is_same<T, uint8x16_t>::value || std::is_same<T, int8x16_t>::value,
- typename std::conditional<std::is_same<T, uint8x16_t>::value, uint8x8_t, int8x8_t>::type >::type
- calculate_min(T in)
+inline typename std::enable_if<
+ std::is_same<T, uint8x16_t>::value || std::is_same<T, int8x16_t>::value,
+ typename std::conditional<std::is_same<T, uint8x16_t>::value, uint8x8_t, int8x8_t>::type>::type
+calculate_min(T in)
{
auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in));
pmin = wrapper::vpmin(pmin, pmin);
@@ -153,9 +154,10 @@ inline typename std::enable_if < std::is_same<T, uint8x16_t>::value || std::is_s
// Helper function to calculate the maximum value of the input vector. All the elements in the output vector contain the max value.
template <typename T>
-inline typename std::enable_if < std::is_same<T, float32x4_t>::value || std::is_same<T, int32x4_t>::value,
- typename std::conditional<std::is_same<T, float32x4_t>::value, float32x2_t, int32x2_t>::type >::type
- calculate_max(T in)
+inline typename std::enable_if<
+ std::is_same<T, float32x4_t>::value || std::is_same<T, int32x4_t>::value,
+ typename std::conditional<std::is_same<T, float32x4_t>::value, float32x2_t, int32x2_t>::type>::type
+calculate_max(T in)
{
auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in));
return wrapper::vpmax(pmax, pmax);
@@ -163,9 +165,10 @@ inline typename std::enable_if < std::is_same<T, float32x4_t>::value || std::is_
// Helper function to calculate the maximum value of the input vector. All the elements in the output vector contain the max value.
template <typename T>
-inline typename std::enable_if < std::is_same<T, uint8x16_t>::value || std::is_same<T, int8x16_t>::value,
- typename std::conditional<std::is_same<T, uint8x16_t>::value, uint8x8_t, int8x8_t>::type >::type
- calculate_max(T in)
+inline typename std::enable_if<
+ std::is_same<T, uint8x16_t>::value || std::is_same<T, int8x16_t>::value,
+ typename std::conditional<std::is_same<T, uint8x16_t>::value, uint8x8_t, int8x8_t>::type>::type
+calculate_max(T in)
{
auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in));
pmax = wrapper::vpmax(pmax, pmax);
@@ -176,10 +179,10 @@ inline typename std::enable_if < std::is_same<T, uint8x16_t>::value || std::is_s
template <typename T>
uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, T vec_res_value, ReductionOperation op)
{
- uint32x4_t res_idx_mask{ 0 };
+ uint32x4_t res_idx_mask{0};
uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF);
- if(op == ReductionOperation::ARG_IDX_MIN)
+ if (op == ReductionOperation::ARG_IDX_MIN)
{
auto pmin = calculate_min(vec_res_value);
auto mask = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin));
@@ -203,10 +206,10 @@ uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, T vec_res_value, Reduc
template <typename T>
uint32_t calculate_vector_index_quantized(uint32x4x4_t vec_res_idx, T vec_res_value, ReductionOperation op)
{
- uint32x4x4_t res_idx_mask{ { 0 } };
+ uint32x4x4_t res_idx_mask{{0}};
uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF);
- uint8x16_t mask_u8{ 0 };
- if(op == ReductionOperation::ARG_IDX_MIN)
+ uint8x16_t mask_u8{0};
+ if (op == ReductionOperation::ARG_IDX_MIN)
{
auto pmin = calculate_min(vec_res_value);
mask_u8 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin));
@@ -218,12 +221,18 @@ uint32_t calculate_vector_index_quantized(uint32x4x4_t vec_res_idx, T vec_res_va
}
// Widen vectors
- auto wide_u16_1 = wrapper::vorr(vshll_n_u8(wrapper::vgetlow(mask_u8), 8), wrapper::vmovl(wrapper::vgetlow(mask_u8)));
- auto wide_u16_2 = wrapper::vorr(vshll_n_u8(wrapper::vgethigh(mask_u8), 8), wrapper::vmovl(wrapper::vgethigh(mask_u8)));
- auto wide_u32_1 = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_1), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_1)));
- auto wide_u32_2 = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_1), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_1)));
- auto wide_u32_3 = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_2), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_2)));
- auto wide_u32_4 = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_2), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_2)));
+ auto wide_u16_1 =
+ wrapper::vorr(vshll_n_u8(wrapper::vgetlow(mask_u8), 8), wrapper::vmovl(wrapper::vgetlow(mask_u8)));
+ auto wide_u16_2 =
+ wrapper::vorr(vshll_n_u8(wrapper::vgethigh(mask_u8), 8), wrapper::vmovl(wrapper::vgethigh(mask_u8)));
+ auto wide_u32_1 =
+ wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_1), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_1)));
+ auto wide_u32_2 =
+ wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_1), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_1)));
+ auto wide_u32_3 =
+ wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_2), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_2)));
+ auto wide_u32_4 =
+ wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_2), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_2)));
res_idx_mask.val[0] = wrapper::vand(vec_res_idx.val[0], wide_u32_1);
res_idx_mask.val[1] = wrapper::vand(vec_res_idx.val[1], wide_u32_2);
res_idx_mask.val[2] = wrapper::vand(vec_res_idx.val[2], wide_u32_3);
@@ -241,19 +250,19 @@ uint32_t calculate_vector_index_quantized(uint32x4x4_t vec_res_idx, T vec_res_va
pmin = wrapper::vpmin(pmin, pmin);
res = std::min(wrapper::vgetlane(pmin, 0), res);
iter++;
- }
- while(iter < 4);
+ } while (iter < 4);
return (res - 0xFFFFFFFF);
}
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
template <>
-uint32x4x4_t calculate_index(uint32_t idx, float16x8_t a, float16x8_t b, uint32x4x4_t c, ReductionOperation op, int axis)
+uint32x4x4_t
+calculate_index(uint32_t idx, float16x8_t a, float16x8_t b, uint32x4x4_t c, ReductionOperation op, int axis)
{
- uint32x4x2_t mask{ 0 };
- uint16x8_t mask_u16{ 0 };
- if(op == ReductionOperation::ARG_IDX_MIN)
+ uint32x4x2_t mask{0};
+ uint16x8_t mask_u16{0};
+ if (op == ReductionOperation::ARG_IDX_MIN)
{
mask_u16 = wrapper::vcgt(b, a);
}
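
The return (res - 0xFFFFFFFF) in calculate_vector_index above undoes a deliberate bias: after the vadd of mask_ones, a masked-off lane holds 0 + 0xFFFFFFFF = UINT32_MAX and drops out of the pmin, while a selected lane carries idx - 1 (mod 2^32); subtracting 0xFFFFFFFF adds the 1 back. A scalar model of the extraction (illustrative, relying on well-defined unsigned wrap-around):

#include <algorithm>
#include <cassert>
#include <cstdint>

static uint32_t extract_min_index(const uint32_t idx[4], const uint32_t mask[4])
{
    uint32_t res = 0xFFFFFFFFu;
    for (int i = 0; i < 4; ++i)
    {
        // Selected lane: idx - 1 (mod 2^32). Unselected lane: UINT32_MAX.
        res = std::min(res, (idx[i] & mask[i]) + 0xFFFFFFFFu);
    }
    return res - 0xFFFFFFFFu; // i.e. res + 1 (mod 2^32)
}

int main()
{
    const uint32_t idx[4]  = {4, 5, 6, 7};
    const uint32_t mask[4] = {0, 0xFFFFFFFFu, 0, 0xFFFFFFFFu};
    assert(extract_min_index(idx, mask) == 5); // smallest selected index
}
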
@@ -263,19 +272,14 @@ uint32x4x4_t calculate_index(uint32_t idx, float16x8_t a, float16x8_t b, uint32x
}
mask.val[0] = wrapper::vmovl(wrapper::vgetlow(mask_u16));
mask.val[1] = wrapper::vmovl(wrapper::vgethigh(mask_u16));
- uint32x4x2_t vec_idx = { { { idx + 0, idx + 1, idx + 2, idx + 3 },
- { idx + 4, idx + 5, idx + 6, idx + 7 }
- }
- };
- if(axis != 0)
+ uint32x4x2_t vec_idx = {{{idx + 0, idx + 1, idx + 2, idx + 3}, {idx + 4, idx + 5, idx + 6, idx + 7}}};
+ if (axis != 0)
{
vec_idx.val[0] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
vec_idx.val[1] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
}
- uint32x4x4_t res = { wrapper::vbsl(mask.val[0], vec_idx.val[0], c.val[0]),
- wrapper::vbsl(mask.val[1], vec_idx.val[1], c.val[1]),
- 0, 0
- };
+ uint32x4x4_t res = {wrapper::vbsl(mask.val[0], vec_idx.val[0], c.val[0]),
+ wrapper::vbsl(mask.val[1], vec_idx.val[1], c.val[1]), 0, 0};
return res;
}
@@ -298,10 +302,10 @@ inline float16x4_t calculate_max(float16x8_t in)
template <>
uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, float16x8_t vec_res_value, ReductionOperation op)
{
- uint32x4x2_t res_idx_mask{ 0 };
+ uint32x4x2_t res_idx_mask{0};
uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF);
uint16x8_t mask_u16;
- if(op == ReductionOperation::ARG_IDX_MIN)
+ if (op == ReductionOperation::ARG_IDX_MIN)
{
auto pmin = calculate_min(vec_res_value);
mask_u16 = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin));
@@ -313,8 +317,10 @@ uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, float16x8_t vec_res_va
}
// Widen vectors
- auto wide_u32_1 = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(mask_u16), 8), wrapper::vmovl(wrapper::vgetlow(mask_u16)));
- auto wide_u32_2 = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(mask_u16), 8), wrapper::vmovl(wrapper::vgethigh(mask_u16)));
+ auto wide_u32_1 =
+ wrapper::vorr(vshll_n_u16(wrapper::vgetlow(mask_u16), 8), wrapper::vmovl(wrapper::vgetlow(mask_u16)));
+ auto wide_u32_2 =
+ wrapper::vorr(vshll_n_u16(wrapper::vgethigh(mask_u16), 8), wrapper::vmovl(wrapper::vgethigh(mask_u16)));
res_idx_mask.val[0] = wrapper::vand(vec_res_idx.val[0], wide_u32_1);
res_idx_mask.val[1] = wrapper::vand(vec_res_idx.val[1], wide_u32_2);
res_idx_mask.val[0] = wrapper::vadd(res_idx_mask.val[0], mask_ones);
@@ -328,8 +334,7 @@ uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, float16x8_t vec_res_va
pmin = wrapper::vpmin(pmin, pmin);
res = std::min(wrapper::vgetlane(pmin, 0), res);
iter++;
- }
- while(iter < 2);
+ } while (iter < 2);
return (res - 0xFFFFFFFF);
}
@@ -388,7 +393,8 @@ struct RedOpX
/** SIMD vector tag type. */
using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
- inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, const ReductionOperation op)
+ inline void operator()(
+ const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, const ReductionOperation op)
{
const size_t input_dim_0 = in->info()->dimension(0);
const int window_step_x = 16 / sizeof(T);
@@ -402,211 +408,217 @@ struct RedOpX
Iterator output(out, out_window);
execute_window_loop(
- in_win_no_pad, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const T *>(input.ptr());
-
- auto init_res_value = static_cast<T>(0.f);
- switch(op)
+ in_win_no_pad,
+ [&](const Coordinates &)
{
- case ReductionOperation::ARG_IDX_MAX:
- case ReductionOperation::ARG_IDX_MIN:
- case ReductionOperation::MIN:
- case ReductionOperation::MAX:
- {
- init_res_value = static_cast<T>(*input_ptr);
- break;
- }
- case ReductionOperation::PROD:
- {
- init_res_value = static_cast<T>(1.f);
- break;
- }
- default:
- break;
- }
- auto vec_res_value = wrapper::vdup_n(init_res_value, ExactTagType{});
- uint32x4x4_t vec_res_idx{ { 0 } };
+ const auto input_ptr = reinterpret_cast<const T *>(input.ptr());
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vec_elements = wrapper::vloadq(input_ptr + x);
- switch(op)
+ auto init_res_value = static_cast<T>(0.f);
+ switch (op)
{
- case ReductionOperation::SUM_SQUARE:
- vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value);
- break;
- case ReductionOperation::MEAN_SUM:
- case ReductionOperation::SUM:
- vec_res_value = wrapper::vadd(vec_elements, vec_res_value);
- break;
- case ReductionOperation::PROD:
- vec_res_value = wrapper::vmul(vec_elements, vec_res_value);
- break;
- case ReductionOperation::ARG_IDX_MIN:
- {
- auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
- vec_res_idx = calculate_index<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
- vec_res_value = temp_vec_res_value;
- break;
- }
case ReductionOperation::ARG_IDX_MAX:
- {
- auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
- vec_res_idx = calculate_index<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
- vec_res_value = temp_vec_res_value;
- break;
- }
+ case ReductionOperation::ARG_IDX_MIN:
case ReductionOperation::MIN:
+ case ReductionOperation::MAX:
{
- vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ init_res_value = static_cast<T>(*input_ptr);
break;
}
- case ReductionOperation::MAX:
+ case ReductionOperation::PROD:
{
- vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ init_res_value = static_cast<T>(1.f);
break;
}
default:
- ARM_COMPUTE_ERROR("Not supported");
+ break;
}
- }
+ auto vec_res_value = wrapper::vdup_n(init_res_value, ExactTagType{});
+ uint32x4x4_t vec_res_idx{{0}};
- switch(op)
- {
- case ReductionOperation::SUM:
- case ReductionOperation::MEAN_SUM:
- case ReductionOperation::SUM_SQUARE:
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
-#ifdef ARM_COMPUTE_DEBUG_ENABLED
- auto res = static_cast<T>(0.f);
- for(int i = 0; i < S; ++i)
+ const auto vec_elements = wrapper::vloadq(input_ptr + x);
+ switch (op)
{
- res += wrapper::vgetlane(vec_res_value, i);
+ case ReductionOperation::SUM_SQUARE:
+ vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value);
+ break;
+ case ReductionOperation::MEAN_SUM:
+ case ReductionOperation::SUM:
+ vec_res_value = wrapper::vadd(vec_elements, vec_res_value);
+ break;
+ case ReductionOperation::PROD:
+ vec_res_value = wrapper::vmul(vec_elements, vec_res_value);
+ break;
+ case ReductionOperation::ARG_IDX_MIN:
+ {
+ auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ vec_res_idx = calculate_index<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value,
+ vec_res_idx, op, 0);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MAX:
+ {
+ auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ vec_res_idx = calculate_index<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value,
+ vec_res_idx, op, 0);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ case ReductionOperation::MIN:
+ {
+ vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ break;
+ }
+ case ReductionOperation::MAX:
+ {
+ vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
}
-#else // ARM_COMPUTE_DEBUG_ENABLED
- auto carry_res = wrapper::vpadd(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
- for(int i = 0; i < S / 4; ++i)
+ }
+
+ switch (op)
+ {
+ case ReductionOperation::SUM:
+ case ReductionOperation::MEAN_SUM:
+ case ReductionOperation::SUM_SQUARE:
{
- carry_res = wrapper::vpadd(carry_res, carry_res);
- }
- auto res = wrapper::vgetlane(carry_res, 0);
+#ifdef ARM_COMPUTE_DEBUG_ENABLED
+ auto res = static_cast<T>(0.f);
+ for (int i = 0; i < S; ++i)
+ {
+ res += wrapper::vgetlane(vec_res_value, i);
+ }
+#else // ARM_COMPUTE_DEBUG_ENABLED
+ auto carry_res =
+ wrapper::vpadd(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
+ for (int i = 0; i < S / 4; ++i)
+ {
+ carry_res = wrapper::vpadd(carry_res, carry_res);
+ }
+ auto res = wrapper::vgetlane(carry_res, 0);
#endif // ARM_COMPUTE_DEBUG_ENABLED
- if(op == ReductionOperation::SUM_SQUARE)
- {
- // Compute left-over elements
- for(; x < window_end_x; ++x)
+ if (op == ReductionOperation::SUM_SQUARE)
+ {
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ res += (*(input_ptr + x)) * (*(input_ptr + x));
+ }
+ }
+ else
{
- res += (*(input_ptr + x)) * (*(input_ptr + x));
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ res += *(input_ptr + x);
+ }
}
+
+ if (op == ReductionOperation::MEAN_SUM)
+ {
+ res /= input_dim_0;
+ }
+
+ *(reinterpret_cast<T *>(output.ptr())) = res;
+ break;
}
- else
+ case ReductionOperation::PROD:
{
+ auto carry_res =
+ wrapper::vmul(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
+ T res = 1;
+ for (int i = 0; i < S / 2; ++i)
+ {
+ res *= wrapper::vgetlane(carry_res, i);
+ }
+
// Compute left-over elements
- for(; x < window_end_x; ++x)
+ for (; x < window_end_x; ++x)
{
- res += *(input_ptr + x);
+ res *= *(input_ptr + x);
}
- }
- if(op == ReductionOperation::MEAN_SUM)
- {
- res /= input_dim_0;
+ *(reinterpret_cast<T *>(output.ptr())) = res;
+ break;
}
-
- *(reinterpret_cast<T *>(output.ptr())) = res;
- break;
- }
- case ReductionOperation::PROD:
- {
- auto carry_res = wrapper::vmul(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
- T res = 1;
- for(int i = 0; i < S / 2; ++i)
+ case ReductionOperation::ARG_IDX_MIN:
{
- res *= wrapper::vgetlane(carry_res, i);
- }
+ auto idx = calculate_vector_index<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
+ auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0));
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- res *= *(input_ptr + x);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ if (*(input_ptr + x) < res)
+ {
+ idx = x;
+ res = *(input_ptr + x);
+ }
+ }
+ *(reinterpret_cast<uint32_t *>(output.ptr())) = idx;
+ break;
}
-
- *(reinterpret_cast<T *>(output.ptr())) = res;
- break;
- }
- case ReductionOperation::ARG_IDX_MIN:
- {
- auto idx = calculate_vector_index<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
- auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0));
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
+ case ReductionOperation::ARG_IDX_MAX:
{
- if(*(input_ptr + x) < res)
+ auto idx = calculate_vector_index<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
+ auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0));
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- idx = x;
- res = *(input_ptr + x);
+ if (*(input_ptr + x) > res)
+ {
+ idx = x;
+ res = *(input_ptr + x);
+ }
}
+ *(reinterpret_cast<uint32_t *>(output.ptr())) = idx;
+ break;
}
- *(reinterpret_cast<uint32_t *>(output.ptr())) = idx;
- break;
- }
- case ReductionOperation::ARG_IDX_MAX:
- {
- auto idx = calculate_vector_index<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
- auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0));
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
+ case ReductionOperation::MIN:
{
- if(*(input_ptr + x) > res)
+ auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0));
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- idx = x;
- res = *(input_ptr + x);
+ res = *(input_ptr + x) < res ? *(input_ptr + x) : res;
}
+ *(reinterpret_cast<T *>(output.ptr())) = res;
+ break;
}
- *(reinterpret_cast<uint32_t *>(output.ptr())) = idx;
- break;
- }
- case ReductionOperation::MIN:
- {
- auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0));
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
+ case ReductionOperation::MAX:
{
- res = *(input_ptr + x) < res ? *(input_ptr + x) : res;
- }
- *(reinterpret_cast<T *>(output.ptr())) = res;
- break;
- }
- case ReductionOperation::MAX:
- {
- auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0));
+ auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0));
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- res = *(input_ptr + x) > res ? *(input_ptr + x) : res;
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ res = *(input_ptr + x) > res ? *(input_ptr + x) : res;
+ }
+ *(reinterpret_cast<T *>(output.ptr())) = res;
+ break;
}
- *(reinterpret_cast<T *>(output.ptr())) = res;
- break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
}
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- },
- input, output);
+ },
+ input, output);
}
};
template <typename T>
struct RedOpX_quantized
{
- inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, const ReductionOperation op)
+ inline void operator()(
+ const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, const ReductionOperation op)
{
using PromotedType = typename wrapper::traits::promote<typename wrapper::traits::promote<T>::type>::type;
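
Beyond the brace reshuffle, every RedOp* functor in this file shares one skeleton: seed the accumulator with the operation's identity (0 for SUM, 1 for PROD, the first element for MIN/MAX and the ARG_IDX ops), process window_step_x lanes per iteration, reduce horizontally, then finish the ragged tail element by element. The shape as a plain scalar sketch — illustrative only, the real code keeps the accumulator in NEON registers:

#include <cstddef>

float reduce_sum(const float *in, std::size_t n)
{
    constexpr std::size_t step = 4;           // stands in for window_step_x
    float lanes[step] = {0.f, 0.f, 0.f, 0.f}; // identity element for SUM

    std::size_t x = 0;
    for (; x + step <= n; x += step) // vectorizable main loop
    {
        for (std::size_t i = 0; i < step; ++i)
        {
            lanes[i] += in[x + i];
        }
    }

    float res = lanes[0] + lanes[1] + lanes[2] + lanes[3]; // horizontal reduce
    for (; x < n; ++x)                                     // left-over elements
    {
        res += in[x];
    }
    return res;
}
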
@@ -637,246 +649,257 @@ struct RedOpX_quantized
const float B = out_offset - (in_scale * in_offset) / (out_scale);
execute_window_loop(
- in_win_no_pad, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<T *>(input.ptr());
+ in_win_no_pad,
+ [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<T *>(input.ptr());
+
+ auto vec_res_value1 =
+ wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{});
+ auto vec_res_value2 =
+ wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{});
+ auto vec_res_value3 =
+ wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{});
+ auto vec_res_value4 =
+ wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{});
+
+ auto vec_res_value1_f = vdupq_n_f32(static_cast<float>(1.f));
+ auto vec_res_value2_f = vdupq_n_f32(static_cast<float>(1.f));
+ auto vec_res_value3_f = vdupq_n_f32(static_cast<float>(1.f));
+ auto vec_res_value4_f = vdupq_n_f32(static_cast<float>(1.f));
+
+ typename wrapper::traits::neon_vector<T, 16>::type vec_res_value = {0};
+
+ if (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN ||
+ op == ReductionOperation::MIN || op == ReductionOperation::MAX)
+ {
+ vec_res_value = wrapper::vdup_n(*input_ptr, wrapper::traits::vector_128_tag{});
+ }
+
+ uint32x4x4_t vec_res_idx{{0}};
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto vec_elements = wrapper::vloadq(input_ptr + x);
+ switch (op)
+ {
+ case ReductionOperation::SUM:
+ case ReductionOperation::MEAN_SUM:
+ {
+ const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
+ const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
- auto vec_res_value1 = wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{});
- auto vec_res_value2 = wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{});
- auto vec_res_value3 = wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{});
- auto vec_res_value4 = wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{});
+ const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
+ const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
+ const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
+ const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
- auto vec_res_value1_f = vdupq_n_f32(static_cast<float>(1.f));
- auto vec_res_value2_f = vdupq_n_f32(static_cast<float>(1.f));
- auto vec_res_value3_f = vdupq_n_f32(static_cast<float>(1.f));
- auto vec_res_value4_f = vdupq_n_f32(static_cast<float>(1.f));
+ vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1);
+ vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2);
+ vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3);
+ vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4);
+ break;
+ }
+ case ReductionOperation::PROD:
+ {
+ const auto offset32x4f_4 = vdupq_n_f32(iq_info.offset);
+ const auto scale32x4f_4 = vdupq_n_f32(iq_info.scale);
- typename wrapper::traits::neon_vector<T, 16>::type vec_res_value = { 0 };
+ const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
+ const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
- if(op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::MIN || op == ReductionOperation::MAX)
- {
- vec_res_value = wrapper::vdup_n(*input_ptr, wrapper::traits::vector_128_tag{});
- }
+ const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
+ const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
+ const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
+ const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
- uint32x4x4_t vec_res_idx{ { 0 } };
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto vec_elements = wrapper::vloadq(input_ptr + x);
- switch(op)
- {
- case ReductionOperation::SUM:
- case ReductionOperation::MEAN_SUM:
- {
- const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
- const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
-
- const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
- const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
- const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
- const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
-
- vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1);
- vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2);
- vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3);
- vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4);
- break;
- }
- case ReductionOperation::PROD:
- {
- const auto offset32x4f_4 = vdupq_n_f32(iq_info.offset);
- const auto scale32x4f_4 = vdupq_n_f32(iq_info.scale);
-
- const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
- const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
-
- const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
- const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
- const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
- const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
-
- auto temp32x4f_1 = wrapper::vcvt<float>(temp32x4t_1);
- auto temp32x4f_2 = wrapper::vcvt<float>(temp32x4t_2);
- auto temp32x4f_3 = wrapper::vcvt<float>(temp32x4t_3);
- auto temp32x4f_4 = wrapper::vcvt<float>(temp32x4t_4);
-
- //de-quantize vec_elements
- temp32x4f_1 = vmulq_f32(vsubq_f32(temp32x4f_1, offset32x4f_4), scale32x4f_4);
- temp32x4f_2 = vmulq_f32(vsubq_f32(temp32x4f_2, offset32x4f_4), scale32x4f_4);
- temp32x4f_3 = vmulq_f32(vsubq_f32(temp32x4f_3, offset32x4f_4), scale32x4f_4);
- temp32x4f_4 = vmulq_f32(vsubq_f32(temp32x4f_4, offset32x4f_4), scale32x4f_4);
-
- vec_res_value1_f = vmulq_f32(temp32x4f_1, vec_res_value1_f);
- vec_res_value2_f = vmulq_f32(temp32x4f_2, vec_res_value2_f);
- vec_res_value3_f = vmulq_f32(temp32x4f_3, vec_res_value3_f);
- vec_res_value4_f = vmulq_f32(temp32x4f_4, vec_res_value4_f);
- break;
+ auto temp32x4f_1 = wrapper::vcvt<float>(temp32x4t_1);
+ auto temp32x4f_2 = wrapper::vcvt<float>(temp32x4t_2);
+ auto temp32x4f_3 = wrapper::vcvt<float>(temp32x4t_3);
+ auto temp32x4f_4 = wrapper::vcvt<float>(temp32x4t_4);
+
+ //de-quantize vec_elements
+ temp32x4f_1 = vmulq_f32(vsubq_f32(temp32x4f_1, offset32x4f_4), scale32x4f_4);
+ temp32x4f_2 = vmulq_f32(vsubq_f32(temp32x4f_2, offset32x4f_4), scale32x4f_4);
+ temp32x4f_3 = vmulq_f32(vsubq_f32(temp32x4f_3, offset32x4f_4), scale32x4f_4);
+ temp32x4f_4 = vmulq_f32(vsubq_f32(temp32x4f_4, offset32x4f_4), scale32x4f_4);
+
+ vec_res_value1_f = vmulq_f32(temp32x4f_1, vec_res_value1_f);
+ vec_res_value2_f = vmulq_f32(temp32x4f_2, vec_res_value2_f);
+ vec_res_value3_f = vmulq_f32(temp32x4f_3, vec_res_value3_f);
+ vec_res_value4_f = vmulq_f32(temp32x4f_4, vec_res_value4_f);
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MIN:
+ {
+ auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ vec_res_idx = calculate_index_quantized<decltype(vec_res_value)>(
+ x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MAX:
+ {
+ auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ vec_res_idx = calculate_index_quantized<decltype(vec_res_value)>(
+ x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ case ReductionOperation::MIN:
+ {
+ vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ break;
+ }
+ case ReductionOperation::MAX:
+ {
+ vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
}
+ }
+
+ switch (op)
+ {
case ReductionOperation::ARG_IDX_MIN:
{
- auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
- vec_res_idx = calculate_index_quantized<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
- vec_res_value = temp_vec_res_value;
+ auto idx =
+ calculate_vector_index_quantized<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
+ auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0));
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ if (*(input_ptr + x) < res)
+ {
+ idx = x;
+ res = *(input_ptr + x);
+ }
+ }
+ *(reinterpret_cast<uint32_t *>(output.ptr())) = idx;
break;
}
case ReductionOperation::ARG_IDX_MAX:
{
- auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
- vec_res_idx = calculate_index_quantized<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
- vec_res_value = temp_vec_res_value;
+ auto idx =
+ calculate_vector_index_quantized<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
+ auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0));
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ if (*(input_ptr + x) > res)
+ {
+ idx = x;
+ res = *(input_ptr + x);
+ }
+ }
+ *(reinterpret_cast<uint32_t *>(output.ptr())) = idx;
break;
}
case ReductionOperation::MIN:
{
- vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
- break;
- }
- case ReductionOperation::MAX:
- {
- vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- }
-
- switch(op)
- {
- case ReductionOperation::ARG_IDX_MIN:
- {
- auto idx = calculate_vector_index_quantized<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
- auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0));
+ auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0));
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- if(*(input_ptr + x) < res)
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- idx = x;
- res = *(input_ptr + x);
+ res = *(input_ptr + x) < res ? *(input_ptr + x) : res;
}
+ *(reinterpret_cast<T *>(output.ptr())) = res;
+ break;
}
- *(reinterpret_cast<uint32_t *>(output.ptr())) = idx;
- break;
- }
- case ReductionOperation::ARG_IDX_MAX:
- {
- auto idx = calculate_vector_index_quantized<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
- auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0));
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
+ case ReductionOperation::MAX:
{
- if(*(input_ptr + x) > res)
+ auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0));
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- idx = x;
- res = *(input_ptr + x);
+ res = *(input_ptr + x) > res ? *(input_ptr + x) : res;
}
+ *(reinterpret_cast<T *>(output.ptr())) = res;
+ break;
}
- *(reinterpret_cast<uint32_t *>(output.ptr())) = idx;
- break;
- }
- case ReductionOperation::MIN:
- {
- auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0));
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
+ case ReductionOperation::PROD:
{
- res = *(input_ptr + x) < res ? *(input_ptr + x) : res;
- }
- *(reinterpret_cast<T *>(output.ptr())) = res;
- break;
- }
- case ReductionOperation::MAX:
- {
- auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0));
+ auto carry_res = wrapper::vmul(vec_res_value1_f, vec_res_value2_f);
+ carry_res = wrapper::vmul(carry_res, vec_res_value3_f);
+ carry_res = wrapper::vmul(carry_res, vec_res_value4_f);
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- res = *(input_ptr + x) > res ? *(input_ptr + x) : res;
- }
- *(reinterpret_cast<T *>(output.ptr())) = res;
- break;
- }
- case ReductionOperation::PROD:
- {
- auto carry_res = wrapper::vmul(vec_res_value1_f, vec_res_value2_f);
- carry_res = wrapper::vmul(carry_res, vec_res_value3_f);
- carry_res = wrapper::vmul(carry_res, vec_res_value4_f);
+ float res = wrapper::vgetlane(carry_res, 0);
+ res *= wrapper::vgetlane(carry_res, 1);
+ res *= wrapper::vgetlane(carry_res, 2);
+ res *= wrapper::vgetlane(carry_res, 3);
- float res = wrapper::vgetlane(carry_res, 0);
- res *= wrapper::vgetlane(carry_res, 1);
- res *= wrapper::vgetlane(carry_res, 2);
- res *= wrapper::vgetlane(carry_res, 3);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ //de-quantize input
+ if (std::is_same<T, uint8_t>::value)
+ {
+ res *= dequantize_qasymm8(*(input_ptr + x), iq_info);
+ }
+ else
+ {
+ res *= dequantize_qasymm8_signed(*(input_ptr + x), iq_info);
+ }
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- //de-quantize input
- if(std::is_same<T, uint8_t>::value)
+ //re-quantize result
+ if (std::is_same<T, uint8_t>::value)
{
- res *= dequantize_qasymm8(*(input_ptr + x), iq_info);
+ res = quantize_qasymm8(res, iq_info);
}
else
{
- res *= dequantize_qasymm8_signed(*(input_ptr + x), iq_info);
+ res = quantize_qasymm8_signed(res, iq_info);
}
- }
- //re-quantize result
- if(std::is_same<T, uint8_t>::value)
- {
- res = quantize_qasymm8(res, iq_info);
+ *reinterpret_cast<T *>(output.ptr()) = static_cast<T>(res);
+ break;
}
- else
+ case ReductionOperation::SUM:
+ case ReductionOperation::MEAN_SUM:
{
- res = quantize_qasymm8_signed(res, iq_info);
- }
+ auto carry_res = wrapper::vadd(vec_res_value1, vec_res_value2);
+ carry_res = wrapper::vadd(carry_res, vec_res_value3);
+ carry_res = wrapper::vadd(carry_res, vec_res_value4);
- *reinterpret_cast<T *>(output.ptr()) = static_cast<T>(res);
- break;
- }
- case ReductionOperation::SUM:
- case ReductionOperation::MEAN_SUM:
- {
- auto carry_res = wrapper::vadd(vec_res_value1, vec_res_value2);
- carry_res = wrapper::vadd(carry_res, vec_res_value3);
- carry_res = wrapper::vadd(carry_res, vec_res_value4);
+ auto carry_paddition =
+ wrapper::vpadd(wrapper::vgethigh(carry_res), wrapper::vgetlow(carry_res));
+ carry_paddition = wrapper::vpadd(carry_paddition, carry_paddition);
+ auto res = static_cast<int32_t>(wrapper::vgetlane(carry_paddition, 0));
- auto carry_paddition = wrapper::vpadd(wrapper::vgethigh(carry_res), wrapper::vgetlow(carry_res));
- carry_paddition = wrapper::vpadd(carry_paddition, carry_paddition);
- auto res = static_cast<int32_t>(wrapper::vgetlane(carry_paddition, 0));
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ res += *(input_ptr + x);
+ }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- res += *(input_ptr + x);
- }
+ if (op == ReductionOperation::MEAN_SUM)
+ {
+ const int32_t resFinal = A * (static_cast<float>(res)) + B;
- if(op == ReductionOperation::MEAN_SUM)
- {
- const int32_t resFinal = A * (static_cast<float>(res)) + B;
+ *reinterpret_cast<T *>(output.ptr()) = utils::cast::saturate_cast<T>(resFinal);
+ }
+ else
+ {
+ // Subtract accumulated offsets
+ res -= (in_info.dimension(0) - 1) * iq_info.offset;
+ *reinterpret_cast<T *>(output.ptr()) = utils::cast::saturate_cast<T>(res);
+ }
- *reinterpret_cast<T *>(output.ptr()) = utils::cast::saturate_cast<T>(resFinal);
- }
- else
- {
- // Subtract accumulated offsets
- res -= (in_info.dimension(0) - 1) * iq_info.offset;
- *reinterpret_cast<T *>(output.ptr()) = utils::cast::saturate_cast<T>(res);
+ break;
}
-
- break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
}
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- },
- input, output);
+ },
+ input, output);
}
};
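
The A/B pair used for quantized MEAN_SUM above folds dequantize, average, and requantize into a single affine map of the raw integer sum: mean = in_scale * (sum - N*in_offset) / N, so out = mean/out_scale + out_offset = A*sum + B with B exactly as shown in the hunk and A = in_scale / (out_scale * N) — A's definition is not visible in this excerpt, so its form here is derived from the algebra rather than quoted. A scalar check (illustrative):

#include <cassert>
#include <cmath>

int main()
{
    const float in_scale = 0.5f, out_scale = 0.25f;
    const int   in_offset = 10, out_offset = 3, N = 4;
    const int   q[N] = {12, 14, 16, 18}; // quantized inputs

    int sum = 0;
    for (int v : q)
    {
        sum += v;
    }

    // Reference path: dequantize, average, requantize.
    const float mean = in_scale * (sum - N * in_offset) / N;
    const float ref  = mean / out_scale + out_offset;

    // Folded path: one affine transform of the raw sum.
    const float A = in_scale / (out_scale * N);
    const float B = out_offset - (in_scale * in_offset) / out_scale;
    const float folded = A * sum + B;

    assert(std::fabs(ref - folded) < 1e-5f); // both yield 13.0
}
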
@@ -887,7 +910,12 @@ struct RedOpYZW
using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
using neon_vector = typename wrapper::traits::neon_vector<T, S>::type;
- inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, int axis, const ReductionOperation op)
+ inline void operator()(const Window &in_window,
+ Window &out_window,
+ const ITensor *in,
+ ITensor *out,
+ int axis,
+ const ReductionOperation op)
{
const TensorInfo in_info = *(in->info());
const int window_step_x = 16 / sizeof(T);
@@ -900,203 +928,210 @@ struct RedOpYZW
Window in_win_no_pad = in_window;
in_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x()));
Window out_win_no_pad = out_window;
- out_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x()));
+ out_win_no_pad.set(Window::DimX,
+ Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x()));
Iterator input(in, in_win_no_pad);
Iterator output(out, out_win_no_pad);
execute_window_loop(
- in_win_no_pad, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<T *>(input.ptr());
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ in_win_no_pad,
+ [&](const Coordinates &)
{
- neon_vector vec_res_value = { 0 };
- switch(op)
- {
- case ReductionOperation::ARG_IDX_MAX:
- case ReductionOperation::ARG_IDX_MIN:
- case ReductionOperation::MIN:
- case ReductionOperation::MAX:
- {
- vec_res_value = wrapper::vloadq(input_ptr + x);
- break;
- }
- case ReductionOperation::PROD:
- {
- vec_res_value = wrapper::vdup_n(static_cast<T>(1.f), ExactTagType{});
- break;
- }
- default:
- {
- vec_res_value = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
- break;
- }
- }
- uint32x4x4_t vec_res_idx{ { 0 } };
+ const auto input_ptr = reinterpret_cast<T *>(input.ptr());
- for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- const T *in_ptr = reinterpret_cast<T *>(input.ptr() + x * sizeof(T) + in_info.strides_in_bytes()[axis] * dim);
- const auto vec_elements = wrapper::vloadq(in_ptr);
- switch(op)
+ neon_vector vec_res_value = {0};
+ switch (op)
{
- case ReductionOperation::SUM:
- case ReductionOperation::MEAN_SUM:
- vec_res_value = wrapper::vadd(vec_elements, vec_res_value);
- break;
- case ReductionOperation::SUM_SQUARE:
- vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value);
- break;
- case ReductionOperation::PROD:
- vec_res_value = wrapper::vmul(vec_elements, vec_res_value);
- break;
- case ReductionOperation::ARG_IDX_MIN:
- {
- auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
- vec_res_idx = calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
- vec_res_value = temp_vec_res_value;
- break;
- }
case ReductionOperation::ARG_IDX_MAX:
+ case ReductionOperation::ARG_IDX_MIN:
+ case ReductionOperation::MIN:
+ case ReductionOperation::MAX:
{
- auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
- vec_res_idx = calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
- vec_res_value = temp_vec_res_value;
+ vec_res_value = wrapper::vloadq(input_ptr + x);
break;
}
- case ReductionOperation::MIN:
+ case ReductionOperation::PROD:
{
- vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ vec_res_value = wrapper::vdup_n(static_cast<T>(1.f), ExactTagType{});
break;
}
- case ReductionOperation::MAX:
+ default:
{
- vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ vec_res_value = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
break;
}
- default:
- ARM_COMPUTE_ERROR("Not supported");
}
- }
-
- if(op == ReductionOperation::MEAN_SUM)
- {
- auto vec_width_inv = wrapper::vinv(wrapper::vdup_n(static_cast<T>(in_info.dimension(axis)), ExactTagType{}));
- vec_res_value = wrapper::vmul(vec_res_value, vec_width_inv);
- }
+ uint32x4x4_t vec_res_idx{{0}};
- if(op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX)
- {
- wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()) + x, vec_res_idx.val[0]);
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- if(std::is_same<T, float16_t>::value)
+ for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
{
- wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()) + x + 4, vec_res_idx.val[1]);
+ const T *in_ptr =
+ reinterpret_cast<T *>(input.ptr() + x * sizeof(T) + in_info.strides_in_bytes()[axis] * dim);
+ const auto vec_elements = wrapper::vloadq(in_ptr);
+ switch (op)
+ {
+ case ReductionOperation::SUM:
+ case ReductionOperation::MEAN_SUM:
+ vec_res_value = wrapper::vadd(vec_elements, vec_res_value);
+ break;
+ case ReductionOperation::SUM_SQUARE:
+ vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value);
+ break;
+ case ReductionOperation::PROD:
+ vec_res_value = wrapper::vmul(vec_elements, vec_res_value);
+ break;
+ case ReductionOperation::ARG_IDX_MIN:
+ {
+ auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ vec_res_idx =
+ calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MAX:
+ {
+ auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ vec_res_idx =
+ calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ case ReductionOperation::MIN:
+ {
+ vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ break;
+ }
+ case ReductionOperation::MAX:
+ {
+ vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
+ }
}
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- }
- else
- {
- wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x * sizeof(T)), vec_res_value);
- }
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- auto res_value = 0.f;
- switch(op)
- {
- case ReductionOperation::ARG_IDX_MAX:
- case ReductionOperation::ARG_IDX_MIN:
- case ReductionOperation::MIN:
- case ReductionOperation::MAX:
+ if (op == ReductionOperation::MEAN_SUM)
{
- res_value = *(input_ptr + x);
- break;
+ auto vec_width_inv =
+ wrapper::vinv(wrapper::vdup_n(static_cast<T>(in_info.dimension(axis)), ExactTagType{}));
+ vec_res_value = wrapper::vmul(vec_res_value, vec_width_inv);
}
- case ReductionOperation::PROD:
+
+ if (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX)
{
- res_value = static_cast<T>(1.f);
- break;
+ wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()) + x, vec_res_idx.val[0]);
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ if (std::is_same<T, float16_t>::value)
+ {
+ wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()) + x + 4, vec_res_idx.val[1]);
+ }
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
}
- default:
+ else
{
- res_value = static_cast<T>(0.f);
- break;
+ wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x * sizeof(T)), vec_res_value);
}
}
- uint32_t res_idx = 0;
- for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- const T *in_ptr = reinterpret_cast<T *>(input.ptr() + x * sizeof(T) + in_info.strides_in_bytes()[axis] * dim);
-
- switch(op)
+ auto res_value = 0.f;
+ switch (op)
{
- case ReductionOperation::SUM:
- case ReductionOperation::MEAN_SUM:
- res_value += *in_ptr;
- break;
- case ReductionOperation::SUM_SQUARE:
- res_value += *in_ptr * *in_ptr;
- break;
- case ReductionOperation::PROD:
- res_value *= *in_ptr;
- break;
+ case ReductionOperation::ARG_IDX_MAX:
case ReductionOperation::ARG_IDX_MIN:
+ case ReductionOperation::MIN:
+ case ReductionOperation::MAX:
{
- if(*in_ptr < res_value)
- {
- res_value = *in_ptr;
- res_idx = dim;
- }
+ res_value = *(input_ptr + x);
break;
}
- case ReductionOperation::ARG_IDX_MAX:
+ case ReductionOperation::PROD:
{
- if(*in_ptr > res_value)
- {
- res_value = *in_ptr;
- res_idx = dim;
- }
+ res_value = static_cast<T>(1.f);
break;
}
- case ReductionOperation::MIN:
+ default:
{
- res_value = *in_ptr < res_value ? *in_ptr : res_value;
+ res_value = static_cast<T>(0.f);
break;
}
- case ReductionOperation::MAX:
+ }
+
+ uint32_t res_idx = 0;
+ for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
+ {
+ const T *in_ptr =
+ reinterpret_cast<T *>(input.ptr() + x * sizeof(T) + in_info.strides_in_bytes()[axis] * dim);
+
+ switch (op)
{
- res_value = *in_ptr > res_value ? *in_ptr : res_value;
- break;
+ case ReductionOperation::SUM:
+ case ReductionOperation::MEAN_SUM:
+ res_value += *in_ptr;
+ break;
+ case ReductionOperation::SUM_SQUARE:
+ res_value += *in_ptr * *in_ptr;
+ break;
+ case ReductionOperation::PROD:
+ res_value *= *in_ptr;
+ break;
+ case ReductionOperation::ARG_IDX_MIN:
+ {
+ if (*in_ptr < res_value)
+ {
+ res_value = *in_ptr;
+ res_idx = dim;
+ }
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MAX:
+ {
+ if (*in_ptr > res_value)
+ {
+ res_value = *in_ptr;
+ res_idx = dim;
+ }
+ break;
+ }
+ case ReductionOperation::MIN:
+ {
+ res_value = *in_ptr < res_value ? *in_ptr : res_value;
+ break;
+ }
+ case ReductionOperation::MAX:
+ {
+ res_value = *in_ptr > res_value ? *in_ptr : res_value;
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
}
- default:
- ARM_COMPUTE_ERROR("Not supported");
}
- }
- if(op == ReductionOperation::MEAN_SUM)
- {
- res_value /= in_info.dimension(axis);
- }
+ if (op == ReductionOperation::MEAN_SUM)
+ {
+ res_value /= in_info.dimension(axis);
+ }
- if(op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX)
- {
- *(reinterpret_cast<uint32_t *>(output.ptr()) + x) = res_idx;
- }
- else
- {
- *(reinterpret_cast<T *>(output.ptr() + x * sizeof(T))) = res_value;
+ if (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX)
+ {
+ *(reinterpret_cast<uint32_t *>(output.ptr()) + x) = res_idx;
+ }
+ else
+ {
+ *(reinterpret_cast<T *>(output.ptr() + x * sizeof(T))) = res_value;
+ }
}
- }
- },
- input, output);
+ },
+ input, output);
}
};
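
RedOpYZW reduces along a non-innermost axis, which is what the reindented inner loop above expresses: in_ptr is recomputed as input.ptr() + x*sizeof(T) + strides_in_bytes()[axis]*dim, so x stays fixed (and contiguous across the vector lanes) while dim walks the reduced axis. A scalar, row-major model (illustrative):

#include <cstddef>

// Reduce a 2-D row-major tensor along axis 1 (rows), keeping axis 0 (x)
// as the output dimension — same x, stepped along the reduced axis.
void reduce_axis1_sum(const float *in, float *out, std::size_t nx, std::size_t ny)
{
    const std::size_t stride_y = nx; // elements per step along the reduced axis
    for (std::size_t x = 0; x < nx; ++x)
    {
        float acc = 0.f;
        for (std::size_t dim = 0; dim < ny; ++dim)
        {
            acc += in[x + stride_y * dim];
        }
        out[x] = acc;
    }
}
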
@@ -1107,7 +1142,8 @@ struct RedOpYZW_complex
using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
using neon_vector = typename wrapper::traits::neon_vector<T, S>::type;
- inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, int, const ReductionOperation)
+ inline void operator()(
+ const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, int, const ReductionOperation)
{
ARM_COMPUTE_ERROR_ON(axis != 2);
ARM_COMPUTE_ERROR_ON(op != ReductionOperation::SUM);
@@ -1124,70 +1160,77 @@ struct RedOpYZW_complex
Window in_win_no_pad = in_window;
in_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x()));
Window out_win_no_pad = out_window;
- out_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x()));
+ out_win_no_pad.set(Window::DimX,
+ Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x()));
Iterator input(in, in_win_no_pad);
Iterator output(out, out_win_no_pad);
execute_window_loop(
- in_win_no_pad, [&](const Coordinates &)
- {
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ in_win_no_pad,
+ [&](const Coordinates &)
{
- neon_vector vec_res_value_0 = { 0 };
- neon_vector vec_res_value_1 = { 0 };
-
- vec_res_value_0 = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
- vec_res_value_1 = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
-
- T *out_ptr = reinterpret_cast<T *>(output.ptr() + 2 * x * sizeof(T));
- for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- T *in_ptr_0 = reinterpret_cast<T *>(input.ptr() + 2 * x * sizeof(T) + stride_z * dim);
- T *in_ptr_1 = reinterpret_cast<T *>(input.ptr() + 2 * x * sizeof(T) + 16 + stride_z * dim);
+ neon_vector vec_res_value_0 = {0};
+ neon_vector vec_res_value_1 = {0};
- const auto vec_elements_0 = wrapper::vloadq(in_ptr_0);
- const auto vec_elements_1 = wrapper::vloadq(in_ptr_1);
+ vec_res_value_0 = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+ vec_res_value_1 = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
- vec_res_value_0 = wrapper::vadd(vec_elements_0, vec_res_value_0);
- vec_res_value_1 = wrapper::vadd(vec_elements_1, vec_res_value_1);
- }
+ T *out_ptr = reinterpret_cast<T *>(output.ptr() + 2 * x * sizeof(T));
+ for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
+ {
+ T *in_ptr_0 = reinterpret_cast<T *>(input.ptr() + 2 * x * sizeof(T) + stride_z * dim);
+ T *in_ptr_1 = reinterpret_cast<T *>(input.ptr() + 2 * x * sizeof(T) + 16 + stride_z * dim);
- wrapper::vstore(out_ptr, vec_res_value_0);
- wrapper::vstore(out_ptr + 4, vec_res_value_1);
- }
+ const auto vec_elements_0 = wrapper::vloadq(in_ptr_0);
+ const auto vec_elements_1 = wrapper::vloadq(in_ptr_1);
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- auto res_value_0 = 0.f;
- auto res_value_1 = 0.f;
+ vec_res_value_0 = wrapper::vadd(vec_elements_0, vec_res_value_0);
+ vec_res_value_1 = wrapper::vadd(vec_elements_1, vec_res_value_1);
+ }
- T *out_ptr = reinterpret_cast<T *>(output.ptr() + 2 * x * sizeof(T));
- for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
+ wrapper::vstore(out_ptr, vec_res_value_0);
+ wrapper::vstore(out_ptr + 4, vec_res_value_1);
+ }
+
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- T *in_ptr = reinterpret_cast<T *>(input.ptr() + 2 * x * sizeof(T) + stride_z * dim);
- res_value_0 += *in_ptr;
- res_value_1 += *(in_ptr + 1);
+ auto res_value_0 = 0.f;
+ auto res_value_1 = 0.f;
+
+ T *out_ptr = reinterpret_cast<T *>(output.ptr() + 2 * x * sizeof(T));
+ for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
+ {
+ T *in_ptr = reinterpret_cast<T *>(input.ptr() + 2 * x * sizeof(T) + stride_z * dim);
+ res_value_0 += *in_ptr;
+ res_value_1 += *(in_ptr + 1);
+ }
+ *out_ptr = res_value_0;
+ *(out_ptr + 1) = res_value_1;
}
- *out_ptr = res_value_0;
- *(out_ptr + 1) = res_value_1;
- }
- },
- input, output);
+ },
+ input, output);
}
};
template <typename T>
struct RedOpYZW_quantized
{
- inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, int axis, const ReductionOperation op)
+ inline void operator()(const Window &in_window,
+ Window &out_window,
+ const ITensor *in,
+ ITensor *out,
+ int axis,
+ const ReductionOperation op)
{
const TensorInfo in_info = *(in->info());
const UniformQuantizationInfo iq_info = in_info.quantization_info().uniform();
- using PromotedType = typename wrapper::traits::promote<typename wrapper::traits::promote<T>::type>::type;
+ using PromotedType = typename wrapper::traits::promote<typename wrapper::traits::promote<T>::type>::type;
const auto oq_info = out->info()->quantization_info().uniform();
@@ -1201,12 +1244,14 @@ struct RedOpYZW_quantized
Window in_win_no_pad = in_window;
in_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x()));
Window out_win_no_pad = out_window;
- out_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x()));
+ out_win_no_pad.set(Window::DimX,
+ Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x()));
Iterator input(in, in_win_no_pad);
Iterator output(out, out_win_no_pad);
- using vector_type = typename wrapper::traits::neon_bitvector<PromotedType, wrapper::traits::BitWidth::W128>::type;
+ using vector_type =
+ typename wrapper::traits::neon_bitvector<PromotedType, wrapper::traits::BitWidth::W128>::type;
using vector_type_f = typename wrapper::traits::neon_vector<float, 4>::type;
vector_type vec_res_value1{};
@@ -1234,362 +1279,384 @@ struct RedOpYZW_quantized
const auto vec_B = wrapper::vdup_n(static_cast<float>(B), wrapper::traits::vector_128_tag{});
execute_window_loop(
- in_win_no_pad, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<T *>(input.ptr());
-
- // Compute window_step_x elements per iteration
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ in_win_no_pad,
+ [&](const Coordinates &)
{
- uint32x4x4_t vec_res_idx{ { 0 } };
- vec_res_value1 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
- vec_res_value2 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
- vec_res_value3 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
- vec_res_value4 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
+ const auto input_ptr = reinterpret_cast<T *>(input.ptr());
- vec_res_value1_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
- vec_res_value2_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
- vec_res_value3_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
- vec_res_value4_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
+ // Compute window_step_x elements per iteration
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ uint32x4x4_t vec_res_idx{{0}};
+ vec_res_value1 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
+ vec_res_value2 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
+ vec_res_value3 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
+ vec_res_value4 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
- auto vec_res_value = wrapper::vloadq(input_ptr + x);
+ vec_res_value1_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
+ vec_res_value2_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
+ vec_res_value3_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
+ vec_res_value4_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
- for(unsigned int index_dim = 0; index_dim < in_info.dimension(axis); ++index_dim)
- {
- const T *in_ptr = input_ptr + x + in_info.strides_in_bytes()[axis] * index_dim;
- const auto vec_elements = wrapper::vloadq(in_ptr);
- switch(op)
+ auto vec_res_value = wrapper::vloadq(input_ptr + x);
+
+ for (unsigned int index_dim = 0; index_dim < in_info.dimension(axis); ++index_dim)
{
- case ReductionOperation::SUM:
- case ReductionOperation::MEAN_SUM:
+ const T *in_ptr = input_ptr + x + in_info.strides_in_bytes()[axis] * index_dim;
+ const auto vec_elements = wrapper::vloadq(in_ptr);
+ switch (op)
{
- const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
- const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
-
- const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
- const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
- const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
- const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
-
- vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1);
- vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2);
- vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3);
- vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4);
- break;
+ case ReductionOperation::SUM:
+ case ReductionOperation::MEAN_SUM:
+ {
+ const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
+ const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
+
+ const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
+ const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
+ const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
+ const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
+
+ vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1);
+ vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2);
+ vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3);
+ vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4);
+ break;
+ }
+ case ReductionOperation::PROD:
+ {
+ const auto offset32x4f_4 = wrapper::vdup_n(static_cast<float>(iq_info.offset),
+ wrapper::traits::vector_128_tag{});
+ const auto scale32x4f_4 =
+ wrapper::vdup_n(iq_info.scale, wrapper::traits::vector_128_tag{});
+
+ const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
+ const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
+
+ const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
+ const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
+ const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
+ const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
+
+ auto temp32x4f_1 = wrapper::vcvt<float>(temp32x4t_1);
+ auto temp32x4f_2 = wrapper::vcvt<float>(temp32x4t_2);
+ auto temp32x4f_3 = wrapper::vcvt<float>(temp32x4t_3);
+ auto temp32x4f_4 = wrapper::vcvt<float>(temp32x4t_4);
+
+ //de-quantize vec_elements
+ temp32x4f_1 = wrapper::vmul(wrapper::vsub(temp32x4f_1, offset32x4f_4), scale32x4f_4);
+ temp32x4f_2 = wrapper::vmul(wrapper::vsub(temp32x4f_2, offset32x4f_4), scale32x4f_4);
+ temp32x4f_3 = wrapper::vmul(wrapper::vsub(temp32x4f_3, offset32x4f_4), scale32x4f_4);
+ temp32x4f_4 = wrapper::vmul(wrapper::vsub(temp32x4f_4, offset32x4f_4), scale32x4f_4);
+
+ vec_res_value1_f = wrapper::vmul(temp32x4f_1, vec_res_value1_f);
+ vec_res_value2_f = wrapper::vmul(temp32x4f_2, vec_res_value2_f);
+ vec_res_value3_f = wrapper::vmul(temp32x4f_3, vec_res_value3_f);
+ vec_res_value4_f = wrapper::vmul(temp32x4f_4, vec_res_value4_f);
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MIN:
+ {
+ auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ vec_res_idx = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value,
+ vec_res_idx, op, axis);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MAX:
+ {
+ auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ vec_res_idx = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value,
+ vec_res_idx, op, axis);
+ vec_res_value = temp_vec_res_value;
+ break;
+ }
+ case ReductionOperation::MIN:
+ {
+ vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+ break;
+ }
+ case ReductionOperation::MAX:
+ {
+ vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
}
- case ReductionOperation::PROD:
- {
- const auto offset32x4f_4 = wrapper::vdup_n(static_cast<float>(iq_info.offset), wrapper::traits::vector_128_tag{});
- const auto scale32x4f_4 = wrapper::vdup_n(iq_info.scale, wrapper::traits::vector_128_tag{});
-
- const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
- const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
-
- const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
- const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
- const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
- const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
-
- auto temp32x4f_1 = wrapper::vcvt<float>(temp32x4t_1);
- auto temp32x4f_2 = wrapper::vcvt<float>(temp32x4t_2);
- auto temp32x4f_3 = wrapper::vcvt<float>(temp32x4t_3);
- auto temp32x4f_4 = wrapper::vcvt<float>(temp32x4t_4);
+ }
- //de-quantize vec_elements
- temp32x4f_1 = wrapper::vmul(wrapper::vsub(temp32x4f_1, offset32x4f_4), scale32x4f_4);
- temp32x4f_2 = wrapper::vmul(wrapper::vsub(temp32x4f_2, offset32x4f_4), scale32x4f_4);
- temp32x4f_3 = wrapper::vmul(wrapper::vsub(temp32x4f_3, offset32x4f_4), scale32x4f_4);
- temp32x4f_4 = wrapper::vmul(wrapper::vsub(temp32x4f_4, offset32x4f_4), scale32x4f_4);
-
- vec_res_value1_f = wrapper::vmul(temp32x4f_1, vec_res_value1_f);
- vec_res_value2_f = wrapper::vmul(temp32x4f_2, vec_res_value2_f);
- vec_res_value3_f = wrapper::vmul(temp32x4f_3, vec_res_value3_f);
- vec_res_value4_f = wrapper::vmul(temp32x4f_4, vec_res_value4_f);
- break;
- }
+ switch (op)
+ {
case ReductionOperation::ARG_IDX_MIN:
- {
- auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
- vec_res_idx = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
- vec_res_value = temp_vec_res_value;
- break;
- }
case ReductionOperation::ARG_IDX_MAX:
{
- auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
- vec_res_idx = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
- vec_res_value = temp_vec_res_value;
+ wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x), vec_res_idx.val[0]);
+ wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 4, vec_res_idx.val[1]);
+ wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 8, vec_res_idx.val[2]);
+ wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 12,
+ vec_res_idx.val[3]);
break;
}
case ReductionOperation::MIN:
- {
- vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
- break;
- }
case ReductionOperation::MAX:
{
- vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+ wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x), vec_res_value);
break;
}
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- }
-
- switch(op)
- {
- case ReductionOperation::ARG_IDX_MIN:
- case ReductionOperation::ARG_IDX_MAX:
- {
- wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x), vec_res_idx.val[0]);
- wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 4, vec_res_idx.val[1]);
- wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 8, vec_res_idx.val[2]);
- wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 12, vec_res_idx.val[3]);
- break;
- }
- case ReductionOperation::MIN:
- case ReductionOperation::MAX:
- {
- wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x), vec_res_value);
- break;
- }
- case ReductionOperation::SUM:
- {
- // Subtract offsets
- auto offsets = vdupq_n_s32((in_info.dimension(axis) - 1) * iq_info.offset);
+ case ReductionOperation::SUM:
+ {
+ // Subtract offsets
+ auto offsets = vdupq_n_s32((in_info.dimension(axis) - 1) * iq_info.offset);
- auto vec_res_s_value1 = wrapper::vreinterpret(vec_res_value1);
- auto vec_res_s_value2 = wrapper::vreinterpret(vec_res_value2);
- auto vec_res_s_value3 = wrapper::vreinterpret(vec_res_value3);
- auto vec_res_s_value4 = wrapper::vreinterpret(vec_res_value4);
+ auto vec_res_s_value1 = wrapper::vreinterpret(vec_res_value1);
+ auto vec_res_s_value2 = wrapper::vreinterpret(vec_res_value2);
+ auto vec_res_s_value3 = wrapper::vreinterpret(vec_res_value3);
+ auto vec_res_s_value4 = wrapper::vreinterpret(vec_res_value4);
- vec_res_s_value1 = wrapper::vsub(vec_res_s_value1, offsets);
- vec_res_s_value2 = wrapper::vsub(vec_res_s_value2, offsets);
- vec_res_s_value3 = wrapper::vsub(vec_res_s_value3, offsets);
- vec_res_s_value4 = wrapper::vsub(vec_res_s_value4, offsets);
+ vec_res_s_value1 = wrapper::vsub(vec_res_s_value1, offsets);
+ vec_res_s_value2 = wrapper::vsub(vec_res_s_value2, offsets);
+ vec_res_s_value3 = wrapper::vsub(vec_res_s_value3, offsets);
+ vec_res_s_value4 = wrapper::vsub(vec_res_s_value4, offsets);
- const auto temp16x8t_1 = wrapper::vcombine(wrapper::vqmovn(vec_res_s_value1), wrapper::vqmovn(vec_res_s_value2));
- const auto temp16x8t_2 = wrapper::vcombine(wrapper::vqmovn(vec_res_s_value3), wrapper::vqmovn(vec_res_s_value4));
+ const auto temp16x8t_1 =
+ wrapper::vcombine(wrapper::vqmovn(vec_res_s_value1), wrapper::vqmovn(vec_res_s_value2));
+ const auto temp16x8t_2 =
+ wrapper::vcombine(wrapper::vqmovn(vec_res_s_value3), wrapper::vqmovn(vec_res_s_value4));
- combine_and_store<T>(temp16x8t_1, temp16x8t_2, output, x);
- break;
- }
- case ReductionOperation::MEAN_SUM:
- {
- vec_res_value1_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value1), vec_A);
- vec_res_value2_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value2), vec_A);
- vec_res_value3_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value3), vec_A);
- vec_res_value4_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value4), vec_A);
+ combine_and_store<T>(temp16x8t_1, temp16x8t_2, output, x);
+ break;
+ }
+ case ReductionOperation::MEAN_SUM:
+ {
+ vec_res_value1_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value1), vec_A);
+ vec_res_value2_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value2), vec_A);
+ vec_res_value3_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value3), vec_A);
+ vec_res_value4_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value4), vec_A);
#ifdef __aarch64__
- vec_res_value1 = wrapper::vcvta<PromotedType>(vec_res_value1_f);
- vec_res_value2 = wrapper::vcvta<PromotedType>(vec_res_value2_f);
- vec_res_value3 = wrapper::vcvta<PromotedType>(vec_res_value3_f);
- vec_res_value4 = wrapper::vcvta<PromotedType>(vec_res_value4_f);
+ vec_res_value1 = wrapper::vcvta<PromotedType>(vec_res_value1_f);
+ vec_res_value2 = wrapper::vcvta<PromotedType>(vec_res_value2_f);
+ vec_res_value3 = wrapper::vcvta<PromotedType>(vec_res_value3_f);
+ vec_res_value4 = wrapper::vcvta<PromotedType>(vec_res_value4_f);
#else // defined(__aarch64__)
- vec_res_value1 = wrapper::vcvt<PromotedType>(vec_res_value1_f);
- vec_res_value2 = wrapper::vcvt<PromotedType>(vec_res_value2_f);
- vec_res_value3 = wrapper::vcvt<PromotedType>(vec_res_value3_f);
- vec_res_value4 = wrapper::vcvt<PromotedType>(vec_res_value4_f);
+ vec_res_value1 = wrapper::vcvt<PromotedType>(vec_res_value1_f);
+ vec_res_value2 = wrapper::vcvt<PromotedType>(vec_res_value2_f);
+ vec_res_value3 = wrapper::vcvt<PromotedType>(vec_res_value3_f);
+ vec_res_value4 = wrapper::vcvt<PromotedType>(vec_res_value4_f);
#endif // __aarch64__
- const auto temp16x8t_1 = wrapper::vcombine(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2));
- const auto temp16x8t_2 = wrapper::vcombine(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4));
- auto res = wrapper::vcombine(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2));
-
- wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x), res);
- break;
- }
- case ReductionOperation::PROD:
- {
- const auto offset32x4f_4 = wrapper::vdup_n(static_cast<float>(iq_info.offset), wrapper::traits::vector_128_tag{});
- const auto iscale32x4f_4 = vinvq_f32(vdupq_n_f32(iq_info.scale));
-
- //re-quantize
- vec_res_value1_f = wrapper::vadd(wrapper::vmul(vec_res_value1_f, iscale32x4f_4), offset32x4f_4);
- vec_res_value2_f = wrapper::vadd(wrapper::vmul(vec_res_value2_f, iscale32x4f_4), offset32x4f_4);
- vec_res_value3_f = wrapper::vadd(wrapper::vmul(vec_res_value3_f, iscale32x4f_4), offset32x4f_4);
- vec_res_value4_f = wrapper::vadd(wrapper::vmul(vec_res_value4_f, iscale32x4f_4), offset32x4f_4);
-
- vec_res_value1 = wrapper::vcvt<T>(vec_res_value1_f);
- vec_res_value2 = wrapper::vcvt<T>(vec_res_value2_f);
- vec_res_value3 = wrapper::vcvt<T>(vec_res_value3_f);
- vec_res_value4 = wrapper::vcvt<T>(vec_res_value4_f);
+ const auto temp16x8t_1 =
+ wrapper::vcombine(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2));
+ const auto temp16x8t_2 =
+ wrapper::vcombine(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4));
+ auto res = wrapper::vcombine(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2));
- const auto temp16x8t_1 = wrapper::vcombine(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2));
- const auto temp16x8t_2 = wrapper::vcombine(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4));
- auto res = wrapper::vcombine(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2));
-
- wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x), res);
- break;
+ wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x), res);
+ break;
+ }
+ case ReductionOperation::PROD:
+ {
+ const auto offset32x4f_4 =
+ wrapper::vdup_n(static_cast<float>(iq_info.offset), wrapper::traits::vector_128_tag{});
+ const auto iscale32x4f_4 = vinvq_f32(vdupq_n_f32(iq_info.scale));
+
+ //re-quantize
+ vec_res_value1_f =
+ wrapper::vadd(wrapper::vmul(vec_res_value1_f, iscale32x4f_4), offset32x4f_4);
+ vec_res_value2_f =
+ wrapper::vadd(wrapper::vmul(vec_res_value2_f, iscale32x4f_4), offset32x4f_4);
+ vec_res_value3_f =
+ wrapper::vadd(wrapper::vmul(vec_res_value3_f, iscale32x4f_4), offset32x4f_4);
+ vec_res_value4_f =
+ wrapper::vadd(wrapper::vmul(vec_res_value4_f, iscale32x4f_4), offset32x4f_4);
+
+ vec_res_value1 = wrapper::vcvt<T>(vec_res_value1_f);
+ vec_res_value2 = wrapper::vcvt<T>(vec_res_value2_f);
+ vec_res_value3 = wrapper::vcvt<T>(vec_res_value3_f);
+ vec_res_value4 = wrapper::vcvt<T>(vec_res_value4_f);
+
+ const auto temp16x8t_1 =
+ wrapper::vcombine(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2));
+ const auto temp16x8t_2 =
+ wrapper::vcombine(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4));
+ auto res = wrapper::vcombine(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2));
+
+ wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x), res);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
}
- default:
- ARM_COMPUTE_ERROR("Not supported");
}
- }
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- float res_value = 0.f;
- int32_t res_value_q = 0;
-
- switch(op)
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- case ReductionOperation::ARG_IDX_MAX:
- case ReductionOperation::ARG_IDX_MIN:
- case ReductionOperation::MIN:
- case ReductionOperation::MAX:
- {
- res_value = *(input_ptr + x);
- break;
- }
- case ReductionOperation::PROD:
- {
- res_value = static_cast<T>(1.0f);
- break;
- }
- default:
- {
- res_value = static_cast<T>(0.0f);
- break;
- }
- }
- uint32_t res_idx = 0;
+ float res_value = 0.f;
+ int32_t res_value_q = 0;
- for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
- {
- const T *in_ptr = reinterpret_cast<T *>(input.ptr() + x + in_info.strides_in_bytes()[axis] * dim);
- switch(op)
+ switch (op)
{
- case ReductionOperation::SUM:
+ case ReductionOperation::ARG_IDX_MAX:
+ case ReductionOperation::ARG_IDX_MIN:
+ case ReductionOperation::MIN:
+ case ReductionOperation::MAX:
{
- res_value += *in_ptr;
+ res_value = *(input_ptr + x);
break;
}
- case ReductionOperation::MEAN_SUM:
+ case ReductionOperation::PROD:
{
- res_value_q += *in_ptr;
+ res_value = static_cast<T>(1.0f);
break;
}
- case ReductionOperation::SUM_SQUARE:
+ default:
{
- res_value += *in_ptr * *in_ptr;
+ res_value = static_cast<T>(0.0f);
break;
}
- case ReductionOperation::PROD:
+ }
+ uint32_t res_idx = 0;
+
+ for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
+ {
+ const T *in_ptr =
+ reinterpret_cast<T *>(input.ptr() + x + in_info.strides_in_bytes()[axis] * dim);
+ switch (op)
{
- //de-quantize input
- if(std::is_same<T, uint8_t>::value)
+ case ReductionOperation::SUM:
{
- res_value *= dequantize_qasymm8(*in_ptr, iq_info);
+ res_value += *in_ptr;
+ break;
}
- else
+ case ReductionOperation::MEAN_SUM:
{
- res_value *= dequantize_qasymm8_signed(*in_ptr, iq_info);
+ res_value_q += *in_ptr;
+ break;
}
- break;
- }
- case ReductionOperation::ARG_IDX_MIN:
- {
- if(*in_ptr < res_value)
+ case ReductionOperation::SUM_SQUARE:
{
- res_value = *in_ptr;
- res_idx = dim;
+ res_value += *in_ptr * *in_ptr;
+ break;
}
- break;
- }
- case ReductionOperation::ARG_IDX_MAX:
- {
- if(*in_ptr > res_value)
+ case ReductionOperation::PROD:
{
- res_value = *in_ptr;
- res_idx = dim;
+ //de-quantize input
+ if (std::is_same<T, uint8_t>::value)
+ {
+ res_value *= dequantize_qasymm8(*in_ptr, iq_info);
+ }
+ else
+ {
+ res_value *= dequantize_qasymm8_signed(*in_ptr, iq_info);
+ }
+ break;
}
- break;
+ case ReductionOperation::ARG_IDX_MIN:
+ {
+ if (*in_ptr < res_value)
+ {
+ res_value = *in_ptr;
+ res_idx = dim;
+ }
+ break;
+ }
+ case ReductionOperation::ARG_IDX_MAX:
+ {
+ if (*in_ptr > res_value)
+ {
+ res_value = *in_ptr;
+ res_idx = dim;
+ }
+ break;
+ }
+ case ReductionOperation::MIN:
+ {
+ res_value = *in_ptr < res_value ? *in_ptr : res_value;
+ break;
+ }
+ case ReductionOperation::MAX:
+ {
+ res_value = *in_ptr > res_value ? *in_ptr : res_value;
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
}
- case ReductionOperation::MIN:
+ }
+
+ switch (op)
+ {
+ case ReductionOperation::MEAN_SUM:
{
- res_value = *in_ptr < res_value ? *in_ptr : res_value;
+ // Apply previously calculated coefficients (with rounding on aarch64)
+#ifdef __aarch64__
+ const int32_t res =
+ arm_compute::support::cpp11::round(A * (static_cast<float>(res_value_q)) + B);
+#else // defined(__aarch64__)
+ const int32_t res = A * (static_cast<float>(res_value_q)) + B;
+#endif // __aarch64__
+ *reinterpret_cast<T *>(output.ptr() + x) = utils::cast::saturate_cast<T>(res);
break;
}
- case ReductionOperation::MAX:
+ case ReductionOperation::SUM:
{
- res_value = *in_ptr > res_value ? *in_ptr : res_value;
+ // Subtract accumulated offsets
+ res_value -= (in_info.dimension(axis) - 1) * iq_info.offset;
+ *reinterpret_cast<T *>(output.ptr() + x) = utils::cast::saturate_cast<T>(res_value);
break;
}
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- }
-
- switch(op)
- {
- case ReductionOperation::MEAN_SUM:
- {
- // Apply previously calculated coefficients (with rounding on aarch64)
-#ifdef __aarch64__
- const int32_t res = arm_compute::support::cpp11::round(A * (static_cast<float>(res_value_q)) + B);
-#else // defined(__aarch64__)
- const int32_t res = A * (static_cast<float>(res_value_q)) + B;
-#endif // __aarch64__
- *reinterpret_cast<T *>(output.ptr() + x) = utils::cast::saturate_cast<T>(res);
- break;
- }
- case ReductionOperation::SUM:
- {
- // Subtract accumulated offsets
- res_value -= (in_info.dimension(axis) - 1) * iq_info.offset;
- *reinterpret_cast<T *>(output.ptr() + x) = utils::cast::saturate_cast<T>(res_value);
- break;
- }
- case ReductionOperation::PROD:
- {
- //re-quantize result
- T res = 0;
- if(std::is_same<T, uint8_t>::value)
+ case ReductionOperation::PROD:
{
- res = quantize_qasymm8(res_value, iq_info);
+ //re-quantize result
+ T res = 0;
+ if (std::is_same<T, uint8_t>::value)
+ {
+ res = quantize_qasymm8(res_value, iq_info);
+ }
+ else
+ {
+ res = quantize_qasymm8_signed(res_value, iq_info);
+ }
+ *(reinterpret_cast<T *>(output.ptr() + x)) = res;
+ break;
}
- else
+ case ReductionOperation::ARG_IDX_MIN:
+ case ReductionOperation::ARG_IDX_MAX:
{
- res = quantize_qasymm8_signed(res_value, iq_info);
+ *(reinterpret_cast<uint32_t *>(output.ptr() + x * 4)) = res_idx;
+ break;
}
- *(reinterpret_cast<T *>(output.ptr() + x)) = res;
- break;
- }
- case ReductionOperation::ARG_IDX_MIN:
- case ReductionOperation::ARG_IDX_MAX:
- {
- *(reinterpret_cast<uint32_t *>(output.ptr() + x * 4)) = res_idx;
- break;
+ default:
+ *(reinterpret_cast<T *>(output.ptr() + x)) = res_value;
}
- default:
- *(reinterpret_cast<T *>(output.ptr() + x)) = res_value;
}
- }
- },
- input, output);
+ },
+ input, output);
}
};
-void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsigned int axis, const ReductionOperation op)
+void reduce_op(
+ const Window &window, const ITensor *input, ITensor *output, unsigned int axis, const ReductionOperation op)
{
const bool is_complex = (input->info()->num_channels() == 2);
- if(is_complex)
+ if (is_complex)
{
- switch(axis)
+ switch (axis)
{
case 2:
- switch(input->info()->data_type())
+ switch (input->info()->data_type())
{
case DataType::F32:
- switch(op)
+ switch (op)
{
case ReductionOperation::SUM:
- return Reducer<RedOpYZW_complex<float, 4, 2, ReductionOperation::SUM>>::reduceZ(window, input, output, RedOpYZW_complex<float, 4, 2, ReductionOperation::SUM>(), op);
+ return Reducer<RedOpYZW_complex<float, 4, 2, ReductionOperation::SUM>>::reduceZ(
+ window, input, output, RedOpYZW_complex<float, 4, 2, ReductionOperation::SUM>(),
+ op);
default:
ARM_COMPUTE_ERROR("Not supported");
}
@@ -1602,19 +1669,21 @@ void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsi
return;
}
- switch(axis)
+ switch (axis)
{
case 0:
{
- switch(input->info()->data_type())
+ switch (input->info()->data_type())
{
case DataType::QASYMM8:
{
- return Reducer<RedOpX_quantized<uint8_t>>::reduceX(window, input, output, RedOpX_quantized<uint8_t>(), op);
+ return Reducer<RedOpX_quantized<uint8_t>>::reduceX(window, input, output,
+ RedOpX_quantized<uint8_t>(), op);
}
case DataType::QASYMM8_SIGNED:
{
- return Reducer<RedOpX_quantized<int8_t>>::reduceX(window, input, output, RedOpX_quantized<int8_t>(), op);
+ return Reducer<RedOpX_quantized<int8_t>>::reduceX(window, input, output, RedOpX_quantized<int8_t>(),
+ op);
}
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
@@ -1635,19 +1704,22 @@ void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsi
}
}
case 1:
- switch(input->info()->data_type())
+ switch (input->info()->data_type())
{
case DataType::QASYMM8:
{
- return Reducer<RedOpYZW_quantized<uint8_t>>::reduceY(window, input, output, RedOpYZW_quantized<uint8_t>(), op);
+ return Reducer<RedOpYZW_quantized<uint8_t>>::reduceY(window, input, output,
+ RedOpYZW_quantized<uint8_t>(), op);
}
case DataType::QASYMM8_SIGNED:
{
- return Reducer<RedOpYZW_quantized<int8_t>>::reduceY(window, input, output, RedOpYZW_quantized<int8_t>(), op);
+ return Reducer<RedOpYZW_quantized<int8_t>>::reduceY(window, input, output,
+ RedOpYZW_quantized<int8_t>(), op);
}
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
- return Reducer<RedOpYZW<float16_t, 8>>::reduceY(window, input, output, RedOpYZW<float16_t, 8>(), op);
+ return Reducer<RedOpYZW<float16_t, 8>>::reduceY(window, input, output, RedOpYZW<float16_t, 8>(),
+ op);
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F32:
return Reducer<RedOpYZW<float, 4>>::reduceY(window, input, output, RedOpYZW<float, 4>(), op);
@@ -1657,15 +1729,18 @@ void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsi
ARM_COMPUTE_ERROR("Not supported");
}
case 2:
- switch(input->info()->data_type())
+ switch (input->info()->data_type())
{
case DataType::QASYMM8:
- return Reducer<RedOpYZW_quantized<uint8_t>>::reduceZ(window, input, output, RedOpYZW_quantized<uint8_t>(), op);
+ return Reducer<RedOpYZW_quantized<uint8_t>>::reduceZ(window, input, output,
+ RedOpYZW_quantized<uint8_t>(), op);
case DataType::QASYMM8_SIGNED:
- return Reducer<RedOpYZW_quantized<int8_t>>::reduceZ(window, input, output, RedOpYZW_quantized<int8_t>(), op);
+ return Reducer<RedOpYZW_quantized<int8_t>>::reduceZ(window, input, output,
+ RedOpYZW_quantized<int8_t>(), op);
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
- return Reducer<RedOpYZW<float16_t, 8>>::reduceZ(window, input, output, RedOpYZW<float16_t, 8>(), op);
+ return Reducer<RedOpYZW<float16_t, 8>>::reduceZ(window, input, output, RedOpYZW<float16_t, 8>(),
+ op);
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F32:
return Reducer<RedOpYZW<float, 4>>::reduceZ(window, input, output, RedOpYZW<float, 4>(), op);
@@ -1675,15 +1750,18 @@ void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsi
ARM_COMPUTE_ERROR("Not supported");
}
case 3:
- switch(input->info()->data_type())
+ switch (input->info()->data_type())
{
case DataType::QASYMM8:
- return Reducer<RedOpYZW_quantized<uint8_t>>::reduceW(window, input, output, RedOpYZW_quantized<uint8_t>(), op);
+ return Reducer<RedOpYZW_quantized<uint8_t>>::reduceW(window, input, output,
+ RedOpYZW_quantized<uint8_t>(), op);
case DataType::QASYMM8_SIGNED:
- return Reducer<RedOpYZW_quantized<int8_t>>::reduceW(window, input, output, RedOpYZW_quantized<int8_t>(), op);
+ return Reducer<RedOpYZW_quantized<int8_t>>::reduceW(window, input, output,
+ RedOpYZW_quantized<int8_t>(), op);
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
- return Reducer<RedOpYZW<float16_t, 8>>::reduceW(window, input, output, RedOpYZW<float16_t, 8>(), op);
+ return Reducer<RedOpYZW<float16_t, 8>>::reduceW(window, input, output, RedOpYZW<float16_t, 8>(),
+ op);
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F32:
return Reducer<RedOpYZW<float, 4>>::reduceW(window, input, output, RedOpYZW<float, 4>(), op);
@@ -1704,9 +1782,10 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
- if(input->num_channels() == 1)
+ if (input->num_channels() == 1)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8,
+ DataType::S32, DataType::F16, DataType::F32);
}
else
{
@@ -1715,13 +1794,14 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
ARM_COMPUTE_RETURN_ERROR_ON(axis != 2);
}
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions,
+ "Reduction axis greater than max number of dimensions");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
bool is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN);
- if(!is_arg_min_max)
+ if (!is_arg_min_max)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != output->num_channels());
@@ -1731,8 +1811,9 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32);
}
- const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis);
- const TensorInfo tensor_info_reshaped = input->clone()->set_tensor_shape(output_shape);
+ const TensorShape output_shape =
+ arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis);
+ const TensorInfo tensor_info_reshaped = input->clone()->set_tensor_shape(output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_reshaped);
}
@@ -1745,7 +1826,10 @@ NEReductionOperationKernel::NEReductionOperationKernel()
{
}
-void NEReductionOperationKernel::configure(const ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op)
+void NEReductionOperationKernel::configure(const ITensor *input,
+ ITensor *output,
+ unsigned int axis,
+ ReductionOperation op)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
@@ -1761,14 +1845,23 @@ void NEReductionOperationKernel::configure(const ITensor *input, ITensor *output
INEKernel::configure(win);
// Calculate output shape and set if empty
- const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis);
+ const TensorShape output_shape =
+ arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis);
// Output auto initialization if not yet initialized
const bool is_arg_min_max = (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX);
DataType output_data_type = is_arg_min_max ? DataType::S32 : input->info()->data_type();
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true));
+ auto_init_if_empty(*output->info(), input->info()
+ ->clone()
+ ->set_tensor_shape(output_shape)
+ .set_data_type(output_data_type)
+ .reset_padding()
+ .set_is_resizable(true));
}
-Status NEReductionOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
+Status NEReductionOperationKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ unsigned int axis,
+ ReductionOperation op)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.h b/src/core/NEON/kernels/NEReductionOperationKernel.h
index 08e654fd21..78bec62c14 100644
--- a/src/core/NEON/kernels/NEReductionOperationKernel.h
+++ b/src/core/NEON/kernels/NEReductionOperationKernel.h
@@ -77,7 +77,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
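
The reflowed validate()/configure() pair above follows the usual Compute Library contract: check the ITensorInfo objects first, then configure the kernel on real tensors (the output is auto-initialised if left empty). A hypothetical usage sketch — shape, axis and operation are illustrative, not taken from this patch:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/Tensor.h"

#include "src/core/NEON/kernels/NEReductionOperationKernel.h"

using namespace arm_compute;

int main()
{
    // 8x4 F32 input reduced along axis 0; SUM keeps the input data type.
    Tensor input, output;
    input.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));

    const Status s =
        NEReductionOperationKernel::validate(input.info(), output.info(), 0, ReductionOperation::SUM);
    if (!bool(s))
    {
        return 1; // configuration would not be valid
    }

    NEReductionOperationKernel kernel;
    kernel.configure(&input, &output, 0, ReductionOperation::SUM); // auto-inits the empty output
    input.allocator()->allocate();
    output.allocator()->allocate();
    // The kernel would then be scheduled over its window, e.g. via NEScheduler.
    return 0;
}
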
diff --git a/src/core/NEON/kernels/NEReorderKernel.cpp b/src/core/NEON/kernels/NEReorderKernel.cpp
index 1a7f58bb08..f92a4c87da 100644
--- a/src/core/NEON/kernels/NEReorderKernel.cpp
+++ b/src/core/NEON/kernels/NEReorderKernel.cpp
@@ -24,11 +24,13 @@
#if defined(__aarch64__)
#include "src/core/NEON/kernels/NEReorderKernel.h"
-#include "src/common/utils/Log.h"
-#include "src/core/NEON/kernels/arm_gemm/transform.hpp"
+
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Validate.h"
+#include "src/common/utils/Log.h"
+#include "src/core/NEON/kernels/arm_gemm/transform.hpp"
+
namespace arm_compute
{
@@ -37,29 +39,32 @@ void NEReorderKernel::run(const Window &window, const ThreadInfo &info)
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- switch(_input->info()->data_type())
+ switch (_input->info()->data_type())
{
case DataType::F32:
{
const int ksize_rows_elements = _xmax * _ksize;
- const int jump_rows = ksize_rows_elements * window.x().start();
- const int k_start = window.x().start() * _ksize;
- const int k_end = std::min(window.x().end() * _ksize, _kmax);
- const int stride = _kmax;
- if(k_start < k_end)
+ const int jump_rows = ksize_rows_elements * window.x().start();
+ const int k_start = window.x().start() * _ksize;
+ const int k_end = std::min(window.x().end() * _ksize, _kmax);
+ const int stride = _kmax;
+ if (k_start < k_end)
{
-
- switch(_output_wf)
+ switch (_output_wf)
{
case WeightFormat::OHWIo4:
{
- arm_gemm::Transform<4, 1, true, arm_gemm::VLType::None>(reinterpret_cast<float *>(_output->buffer()) + jump_rows, reinterpret_cast<float *>(_input->buffer()), stride, k_start, k_end, 0, _xmax);
+ arm_gemm::Transform<4, 1, true, arm_gemm::VLType::None>(
+ reinterpret_cast<float *>(_output->buffer()) + jump_rows,
+ reinterpret_cast<float *>(_input->buffer()), stride, k_start, k_end, 0, _xmax);
break;
}
#if defined(ARM_COMPUTE_ENABLE_SVE)
case WeightFormat::OHWIo8:
{
- arm_gemm::Transform<1, 1, true, arm_gemm::VLType::SVE>(reinterpret_cast<float *>(_output->buffer()) + jump_rows, reinterpret_cast<float *>(_input->buffer()), stride, k_start, k_end, 0, _xmax);
+ arm_gemm::Transform<1, 1, true, arm_gemm::VLType::SVE>(
+ reinterpret_cast<float *>(_output->buffer()) + jump_rows,
+ reinterpret_cast<float *>(_input->buffer()), stride, k_start, k_end, 0, _xmax);
break;
}
#endif /* ARM_COMPUTE_ENABLE_SVE */
@@ -78,11 +83,20 @@ void NEReorderKernel::run(const Window &window, const ThreadInfo &info)
}
NEReorderKernel::NEReorderKernel()
- : _input(nullptr), _output(nullptr), _ksize(0), _kmax(0), _xmax(0), _input_wf(WeightFormat::ANY), _output_wf(WeightFormat::ANY)
+ : _input(nullptr),
+ _output(nullptr),
+ _ksize(0),
+ _kmax(0),
+ _xmax(0),
+ _input_wf(WeightFormat::ANY),
+ _output_wf(WeightFormat::ANY)
{
}
-void NEReorderKernel::configure(const ITensor *input, ITensor *output, arm_compute::WeightFormat input_wf, arm_compute::WeightFormat output_wf)
+void NEReorderKernel::configure(const ITensor *input,
+ ITensor *output,
+ arm_compute::WeightFormat input_wf,
+ arm_compute::WeightFormat output_wf)
{
ARM_COMPUTE_LOG_PARAMS(input, output, input_wf, output_wf);
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
@@ -96,7 +110,7 @@ void NEReorderKernel::configure(const ITensor *input, ITensor *output, arm_compu
// Setting parameters for transform
auto dims = input->info()->num_dimensions();
- switch(dims)
+ switch (dims)
{
case 2:
{
@@ -120,7 +134,7 @@ void NEReorderKernel::configure(const ITensor *input, ITensor *output, arm_compu
// Window size is set by rows / _ksize
Window win;
int window_size = 0;
- switch(_output_wf)
+ switch (_output_wf)
{
#if defined(ARM_COMPUTE_ENABLE_SVE)
case WeightFormat::OHWIo8:
@@ -142,7 +156,7 @@ void NEReorderKernel::configure(const ITensor *input, ITensor *output, arm_compu
break;
}
}
- if(_kmax % _ksize != 0)
+ if (_kmax % _ksize != 0)
{
window_size += 1;
}
@@ -152,11 +166,14 @@ void NEReorderKernel::configure(const ITensor *input, ITensor *output, arm_compu
INEKernel::configure(win);
}
-Status NEReorderKernel::validate(const ITensorInfo *input, const ITensorInfo *output, arm_compute::WeightFormat input_wf, arm_compute::WeightFormat output_wf)
+Status NEReorderKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ arm_compute::WeightFormat input_wf,
+ arm_compute::WeightFormat output_wf)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
- if(output->tensor_shape().total_size() != 0)
+ if (output->tensor_shape().total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
@@ -167,20 +184,20 @@ Status NEReorderKernel::validate(const ITensorInfo *input, const ITensorInfo *ou
int output_x_dim;
int output_k_dim;
auto dims = output->num_dimensions();
- switch(dims)
+ switch (dims)
{
case 2:
{
- input_x_dim = input->dimension(0); // Number of columns in input matrix
- input_k_dim = input->dimension(1); // Number of rows in input matrix
+ input_x_dim = input->dimension(0); // Number of columns in input matrix
+ input_k_dim = input->dimension(1); // Number of rows in input matrix
output_x_dim = output->dimension(0); // Number of columns in output matrix
output_k_dim = output->dimension(1); // Number of rows in output matrix
break;
}
case 4:
{
- input_x_dim = input->dimension(2); // Number of columns in input matrix
- input_k_dim = input->dimension(3); // Number of rows in input matrix
+ input_x_dim = input->dimension(2); // Number of columns in input matrix
+ input_k_dim = input->dimension(3); // Number of rows in input matrix
output_x_dim = output->dimension(2); // Number of columns in output matrix
output_k_dim = output->dimension(3); // Number of rows in output matrix
break;
@@ -192,7 +209,7 @@ Status NEReorderKernel::validate(const ITensorInfo *input, const ITensorInfo *ou
}
int ksize;
- switch(output_wf)
+ switch (output_wf)
{
case WeightFormat::OHWIo8:
{
@@ -216,11 +233,10 @@ Status NEReorderKernel::validate(const ITensorInfo *input, const ITensorInfo *ou
ARM_COMPUTE_RETURN_ERROR_ON(rnd_up_input_kdim != output_k_dim);
// output x_dim needs to be same as input
ARM_COMPUTE_RETURN_ERROR_ON(input_x_dim != output_x_dim);
-
}
return Status{};
}
} // namespace arm_compute
-#endif // defined(__aarch64__)
\ No newline at end of file
+#endif // defined(__aarch64__)
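
The Transform<4, 1, true> call above interleaves rows of the weight matrix in groups of four for the OHWIo4 layout. As a rough illustration of that blocking — not the arm_gemm implementation, which also handles strides and SVE block sizes — a plain packing routine with zero padding for the ragged tail (this padding is why configure() adds one extra window step when _kmax % _ksize != 0):

#include <cstdio>
#include <vector>

// Pack a row-major K x X matrix into groups of 4 consecutive rows stored
// column-interleaved, zero-padding the final group when K % 4 != 0.
std::vector<float> pack_rows_by4(const std::vector<float> &src, int K, int X)
{
    const int Kr = (K + 3) / 4 * 4; // K rounded up to the block size
    std::vector<float> dst(static_cast<std::size_t>(Kr) * X, 0.f);
    for (int k = 0; k < K; ++k)
        for (int x = 0; x < X; ++x)
            dst[(k / 4) * (4 * X) + x * 4 + (k % 4)] = src[k * X + x];
    return dst;
}

int main()
{
    const std::vector<float> m{1, 2, 3, 4, 5, 6}; // K = 3, X = 2
    for (float f : pack_rows_by4(m, 3, 2))
        std::printf("%g ", f); // 1 3 5 0 2 4 6 0
    std::printf("\n");
    return 0;
}
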
diff --git a/src/core/NEON/kernels/NEReorderKernel.h b/src/core/NEON/kernels/NEReorderKernel.h
index 07908890f4..4528b25245 100644
--- a/src/core/NEON/kernels/NEReorderKernel.h
+++ b/src/core/NEON/kernels/NEReorderKernel.h
@@ -26,9 +26,10 @@
#ifndef ACL_SRC_CORE_NEON_KERNELS_NEREORDERKERNEL
#define ACL_SRC_CORE_NEON_KERNELS_NEREORDERKERNEL
-#include "src/core/NEON/INEKernel.h"
#include "arm_compute/core/Types.h"
+#include "src/core/NEON/INEKernel.h"
+
namespace arm_compute
{
@@ -36,7 +37,6 @@ namespace arm_compute
class NEReorderKernel : public INEKernel
{
public:
-
const char *name() const override
{
return "NEReorderKernel";
@@ -62,7 +62,10 @@ public:
* @param[in] input_wf WeightFormat of input.
* @param[in] output_wf WeightFormat of output.
*/
- void configure(const ITensor *input, ITensor *output, arm_compute::WeightFormat input_wf, arm_compute::WeightFormat output_wf);
+ void configure(const ITensor *input,
+ ITensor *output,
+ arm_compute::WeightFormat input_wf,
+ arm_compute::WeightFormat output_wf);
/** Static function to check if given info will lead to a valid configuration of @ref NEReorderKernel
*
@@ -73,25 +76,27 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, arm_compute::WeightFormat input_wf, arm_compute::WeightFormat output_wf);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ arm_compute::WeightFormat input_wf,
+ arm_compute::WeightFormat output_wf);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
-
-/*****************************************************************************/
+ /*****************************************************************************/
private:
- const ITensor *_input{nullptr}; // Input tensor
- ITensor *_output{nullptr}; // Output tensor
- int32_t _ksize{0}; // Blocking parameter, how many rows kernel reorders on each call
- int32_t _kmax{0}; // Rows in input tensor
- int32_t _xmax{0}; // Columns in input tensor
- WeightFormat _input_wf{WeightFormat::UNSPECIFIED}; // WeightFormat of input tensor
- WeightFormat _output_wf{WeightFormat::UNSPECIFIED}; // WeightFormat of output tensor
+ const ITensor *_input{nullptr}; // Input tensor
+ ITensor *_output{nullptr}; // Output tensor
+ int32_t _ksize{0}; // Blocking parameter, how many rows kernel reorders on each call
+ int32_t _kmax{0}; // Rows in input tensor
+ int32_t _xmax{0}; // Columns in input tensor
+ WeightFormat _input_wf{WeightFormat::UNSPECIFIED}; // WeightFormat of input tensor
+ WeightFormat _output_wf{WeightFormat::UNSPECIFIED}; // WeightFormat of output tensor
};
} // namespace arm_compute
#endif /* ACL_SRC_CORE_NEON_KERNELS_NEREORDERKERNEL */
-#endif // defined(__aarch64__)
\ No newline at end of file
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/NEReorgLayerKernel.cpp b/src/core/NEON/kernels/NEReorgLayerKernel.cpp
index a7b830c066..227570405c 100644
--- a/src/core/NEON/kernels/NEReorgLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEReorgLayerKernel.cpp
@@ -28,8 +28,9 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -50,13 +51,16 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
ARM_COMPUTE_RETURN_ERROR_ON(stride <= 0);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_width] % stride) != 0, "The width of the input tensor must be a multiple of stride");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_height] % stride) != 0, "The height of the input tensor must be a multiple of stride");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_width] % stride) != 0,
+ "The width of the input tensor must be a multiple of stride");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_height] % stride) != 0,
+ "The height of the input tensor must be a multiple of stride");
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input, stride));
+ const TensorInfo tensor_info_output =
+ output->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input, stride));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
@@ -65,8 +69,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
}
} // namespace
-NEReorgLayerKernel::NEReorgLayerKernel()
- : _input(nullptr), _output(nullptr), _stride(1)
+NEReorgLayerKernel::NEReorgLayerKernel() : _input(nullptr), _output(nullptr), _stride(1)
{
}
@@ -121,23 +124,26 @@ void NEReorgLayerKernel::run(const Window &window, const ThreadInfo &info)
Iterator out(_output, collapsed_window);
// Perform reorg
- execute_window_loop(collapsed_window, [&](const Coordinates & id)
- {
- // Get spatial coords and channels
- const unsigned int w = id[idx_w];
- const unsigned int h = id[idx_h];
- const unsigned int c = id[idx_c];
-
- // Calculate mapping
- const unsigned int offset = c / out_c;
- Coordinates map_coords = id;
- map_coords.set(idx_w, w * stride + offset % stride);
- map_coords.set(idx_h, h * stride + offset / stride);
- map_coords.set(idx_c, c % out_c);
-
- // Perform mapping
- std::memcpy(out.ptr(), in_ptr + _input->info()->offset_element_in_bytes(map_coords), _input->info()->element_size());
- },
- out);
+ execute_window_loop(
+ collapsed_window,
+ [&](const Coordinates &id)
+ {
+ // Get spatial coords and channels
+ const unsigned int w = id[idx_w];
+ const unsigned int h = id[idx_h];
+ const unsigned int c = id[idx_c];
+
+ // Calculate mapping
+ const unsigned int offset = c / out_c;
+ Coordinates map_coords = id;
+ map_coords.set(idx_w, w * stride + offset % stride);
+ map_coords.set(idx_h, h * stride + offset / stride);
+ map_coords.set(idx_c, c % out_c);
+
+ // Perform mapping
+ std::memcpy(out.ptr(), in_ptr + _input->info()->offset_element_in_bytes(map_coords),
+ _input->info()->element_size());
+ },
+ out);
}
} // namespace arm_compute
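
The gather in the reflowed loop above is pure index arithmetic: each output coordinate (w, h, c) pulls exactly one input element, with out_c being the output channel count divided by stride squared. A self-contained check of that mapping with hypothetical sizes (stride 2, four input channels, so sixteen output channels):

#include <cstdio>

int main()
{
    const unsigned stride = 2, out_c = 4; // out_c = output_channels / (stride * stride)
    const unsigned w = 1, h = 0, c = 9;   // an output coordinate
    const unsigned offset = c / out_c;    // which of the stride*stride sub-positions
    std::printf("reads input (w=%u, h=%u, c=%u)\n",
                w * stride + offset % stride,  // 2
                h * stride + offset / stride,  // 1
                c % out_c);                    // 1
    return 0;
}
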
diff --git a/src/core/NEON/kernels/NEReverseKernel.cpp b/src/core/NEON/kernels/NEReverseKernel.cpp
index ca6c117882..d2437eecd0 100644
--- a/src/core/NEON/kernels/NEReverseKernel.cpp
+++ b/src/core/NEON/kernels/NEReverseKernel.cpp
@@ -26,15 +26,17 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis)
+Status
+validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis)
{
ARM_COMPUTE_UNUSED(use_inverted_axis);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, axis);
@@ -42,11 +44,12 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(axis, 1, DataType::U32, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis->num_dimensions() > 1, "Axis must be a 1D tensor");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 4, "Current implementation only supports up to 4 dimensions.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 4,
+ "Current implementation only supports up to 4 dimensions.");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis->dimension(0) > 4, "Only up to 4 dimensions can be reversed");
// Checks performed when output is configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -57,8 +60,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
}
} // namespace
-NEReverseKernel::NEReverseKernel()
- : _input(nullptr), _output(nullptr), _axis(nullptr), _use_inverted_axis(false)
+NEReverseKernel::NEReverseKernel() : _input(nullptr), _output(nullptr), _axis(nullptr), _use_inverted_axis(false)
{
}
@@ -80,7 +82,10 @@ void NEReverseKernel::configure(const ITensor *input, ITensor *output, const ITe
INEKernel::configure(calculate_max_window(*output->info()));
}
-Status NEReverseKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis)
+Status NEReverseKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *axis,
+ bool use_inverted_axis)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, use_inverted_axis));
@@ -88,29 +93,30 @@ Status NEReverseKernel::validate(const ITensorInfo *input, const ITensorInfo *ou
}
template <typename T>
-void run_reverse(const Window &window, const ITensor *input, const ITensor *axis, ITensor *output, bool use_inverted_axis)
+void run_reverse(
+ const Window &window, const ITensor *input, const ITensor *axis, ITensor *output, bool use_inverted_axis)
{
unsigned int axis_bit = 0;
const int rank = input->info()->num_dimensions();
- for(unsigned int i = 0; i < axis->info()->dimension(0); ++i)
+ for (unsigned int i = 0; i < axis->info()->dimension(0); ++i)
{
int axis_i = *(reinterpret_cast<const int *>(axis->buffer()) + i);
// The values of the axis tensor must be between [-rank, rank-1].
- if((axis_i < -rank) || (axis_i >= rank))
+ if ((axis_i < -rank) || (axis_i >= rank))
{
ARM_COMPUTE_ERROR("the valuses of the axis tensor must be within [-rank, rank-1].");
}
// In case of negative axis value i.e targeted axis(i) = rank + axis(i)
- if(axis_i < 0)
+ if (axis_i < 0)
{
axis_i = rank + axis_i;
}
// Reverse ACL axis indices convention i.e. (inverted)axis = (tensor_rank - 1) - axis
- if(use_inverted_axis)
+ if (use_inverted_axis)
{
axis_i = (rank - 1) - axis_i;
}
@@ -127,43 +133,47 @@ void run_reverse(const Window &window, const ITensor *input, const ITensor *axis
win.set(Window::DimX, Window::Dimension(0, 1, 1));
Iterator input_it(input, win);
- execute_window_loop(win, [&](const Coordinates & id)
- {
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win,
+ [&](const Coordinates &id)
{
- auto in = wrapper::vloadq(reinterpret_cast<T *>(input_it.ptr()) + x);
-
- // Reverse 0 axis
- if(axis_bit & 0x1)
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- in = wrapper::vrev64(in);
- in = wrapper::vcombine(wrapper::vgethigh(in), wrapper::vgetlow(in));
+ auto in = wrapper::vloadq(reinterpret_cast<T *>(input_it.ptr()) + x);
+
+ // Reverse 0 axis
+ if (axis_bit & 0x1)
+ {
+ in = wrapper::vrev64(in);
+ in = wrapper::vcombine(wrapper::vgethigh(in), wrapper::vgetlow(in));
+ }
+
+ const int offset_x = (axis_bit & 0x1) ? output->info()->dimension(0) - x - window_step_x : x;
+ const int offset_y = (axis_bit & 0x2) ? output->info()->dimension(1) - id.y() - 1 : id.y();
+ const int offset_z = (axis_bit & 0x4) ? output->info()->dimension(2) - id.z() - 1 : id.z();
+ const int offset_w = (axis_bit & 0x8) ? output->info()->dimension(3) - id[3] - 1 : id[3];
+
+ auto out_ptr =
+ reinterpret_cast<T *>(output->ptr_to_element(Coordinates(offset_x, offset_y, offset_z, offset_w)));
+ wrapper::vstore(out_ptr, in);
}
- const int offset_x = (axis_bit & 0x1) ? output->info()->dimension(0) - x - window_step_x : x;
- const int offset_y = (axis_bit & 0x2) ? output->info()->dimension(1) - id.y() - 1 : id.y();
- const int offset_z = (axis_bit & 0x4) ? output->info()->dimension(2) - id.z() - 1 : id.z();
- const int offset_w = (axis_bit & 0x8) ? output->info()->dimension(3) - id[3] - 1 : id[3];
-
- auto out_ptr = reinterpret_cast<T *>(output->ptr_to_element(Coordinates(offset_x, offset_y, offset_z, offset_w)));
- wrapper::vstore(out_ptr, in);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- const auto in = *(reinterpret_cast<T *>(input_it.ptr()) + x);
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
+ {
+ const auto in = *(reinterpret_cast<T *>(input_it.ptr()) + x);
- const int offset_x = (axis_bit & 0x1) ? output->info()->dimension(0) - x - 1 : x;
- const int offset_y = (axis_bit & 0x2) ? output->info()->dimension(1) - id.y() - 1 : id.y();
- const int offset_z = (axis_bit & 0x4) ? output->info()->dimension(2) - id.z() - 1 : id.z();
- const int offset_w = (axis_bit & 0x8) ? output->info()->dimension(3) - id[3] - 1 : id[3];
+ const int offset_x = (axis_bit & 0x1) ? output->info()->dimension(0) - x - 1 : x;
+ const int offset_y = (axis_bit & 0x2) ? output->info()->dimension(1) - id.y() - 1 : id.y();
+ const int offset_z = (axis_bit & 0x4) ? output->info()->dimension(2) - id.z() - 1 : id.z();
+ const int offset_w = (axis_bit & 0x8) ? output->info()->dimension(3) - id[3] - 1 : id[3];
- *reinterpret_cast<T *>(output->ptr_to_element(Coordinates(offset_x, offset_y, offset_z, offset_w))) = in;
- }
- },
- input_it);
+ *reinterpret_cast<T *>(output->ptr_to_element(Coordinates(offset_x, offset_y, offset_z, offset_w))) =
+ in;
+ }
+ },
+ input_it);
}
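For reference: the loop above normalises each requested axis (negative values become rank + axis, and the inverted convention flips to (rank - 1) - axis) and ORs it into axis_bit; on store, every coordinate whose bit is set is mirrored. A minimal 1-D sketch of that mirroring rule, using a hypothetical helper that is not part of the library:

#include <cstddef>
#include <vector>

std::vector<int> reverse_axis0(const std::vector<int> &in)
{
    const std::size_t dim0 = in.size();
    std::vector<int> out(dim0);
    for (std::size_t x = 0; x < dim0; ++x)
    {
        out[dim0 - x - 1] = in[x]; // same offset rule as the scalar tail: dim0 - x - 1
    }
    return out;
}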
void NEReverseKernel::run(const Window &window, const ThreadInfo &info)
@@ -172,7 +182,7 @@ void NEReverseKernel::run(const Window &window, const ThreadInfo &info)
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- switch(_input->info()->element_size())
+ switch (_input->info()->element_size())
{
case 4:
run_reverse<uint32_t>(window, _input, _axis, _output, _use_inverted_axis);
diff --git a/src/core/NEON/kernels/NEReverseKernel.h b/src/core/NEON/kernels/NEReverseKernel.h
index 7d9ec4691c..92261887f4 100644
--- a/src/core/NEON/kernels/NEReverseKernel.h
+++ b/src/core/NEON/kernels/NEReverseKernel.h
@@ -68,7 +68,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
diff --git a/src/core/NEON/kernels/NESelectKernel.cpp b/src/core/NEON/kernels/NESelectKernel.cpp
index b8c9b244ee..7789b828ea 100644
--- a/src/core/NEON/kernels/NESelectKernel.cpp
+++ b/src/core/NEON/kernels/NESelectKernel.cpp
@@ -29,13 +29,12 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+
+#include "src/core/common/Registrars.h"
#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/wrapper/wrapper.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
-
-#include "src/core/common/Registrars.h"
-
+#include "src/core/NEON/wrapper/wrapper.h"
#include "src/cpu/kernels/select/list.h"
#include <arm_neon.h>
@@ -54,7 +53,8 @@ struct SelectKernelSelectorData
};
using SelectorPtr = std::add_pointer<bool(const SelectKernelSelectorData &data)>::type;
-using KernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, const ITensor *, ITensor *, const Window &)>::type;
+using KernelPtr =
+ std::add_pointer<void(const ITensor *, const ITensor *, const ITensor *, ITensor *, const Window &)>::type;
struct SelectKernelSelector
{
@@ -63,95 +63,62 @@ struct SelectKernelSelector
KernelPtr ukernel;
};
-static const SelectKernelSelector available_kernels[] =
-{
- {
- "neon_s8_same_rank",
- [](const SelectKernelSelectorData & data) { return data.dt == DataType::S8 && data.is_same_rank == true; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s8_select_same_rank)
- },
- {
- "neon_s16_same_rank",
- [](const SelectKernelSelectorData & data) { return data.dt == DataType::S16 && data.is_same_rank == true; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s16_select_same_rank)
- },
- {
- "neon_s32_same_rank",
- [](const SelectKernelSelectorData & data) { return data.dt == DataType::S32 && data.is_same_rank == true; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s32_select_same_rank)
- },
- {
- "neon_u8_same_rank",
- [](const SelectKernelSelectorData & data) { return data.dt == DataType::U8 && data.is_same_rank == true; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u8_select_same_rank)
- },
- {
- "neon_u16_same_rank",
- [](const SelectKernelSelectorData & data) { return data.dt == DataType::U16 && data.is_same_rank == true; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u16_select_same_rank)
- },
- {
- "neon_u32_same_rank",
- [](const SelectKernelSelectorData & data) { return data.dt == DataType::U32 && data.is_same_rank == true; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u32_select_same_rank)
- },
- {
- "neon_s8_not_same_rank",
- [](const SelectKernelSelectorData & data) { return data.dt == DataType::S8 && data.is_same_rank == false; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s8_select_not_same_rank)
- },
- {
- "neon_s16_not_same_rank",
- [](const SelectKernelSelectorData & data) { return data.dt == DataType::S16 && data.is_same_rank == false; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s16_select_not_same_rank)
- },
- {
- "neon_s32_not_same_rank",
- [](const SelectKernelSelectorData & data) { return data.dt == DataType::S32 && data.is_same_rank == false; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s32_select_not_same_rank)
- },
- {
- "neon_u8_not_same_rank",
- [](const SelectKernelSelectorData & data) { return data.dt == DataType::U8 && data.is_same_rank == false; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u8_select_not_same_rank)
- },
- {
- "neon_u16_not_same_rank",
- [](const SelectKernelSelectorData & data) { return data.dt == DataType::U16 && data.is_same_rank == false; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u16_select_not_same_rank)
- },
- {
- "neon_u32_not_same_rank",
- [](const SelectKernelSelectorData & data) { return data.dt == DataType::U32 && data.is_same_rank == false; },
- REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u32_select_not_same_rank)
- },
- {
- "neon_f16_same_rank",
- [](const SelectKernelSelectorData & data) { return data.dt == DataType::F16 && data.is_same_rank == true; },
- REGISTER_FP16_NEON(arm_compute::cpu::neon_f16_select_same_rank)
- },
- {
- "neon_f16_not_same_rank",
- [](const SelectKernelSelectorData & data) { return data.dt == DataType::F16 && data.is_same_rank == false; },
- REGISTER_FP16_NEON(arm_compute::cpu::neon_f16_select_not_same_rank)
- },
- {
- "neon_f32_same_rank",
- [](const SelectKernelSelectorData & data) { return data.dt == DataType::F32 && data.is_same_rank == true; },
- REGISTER_FP32_NEON(arm_compute::cpu::neon_f32_select_same_rank)
- },
- {
- "neon_f32_not_same_rank",
- [](const SelectKernelSelectorData & data) { return data.dt == DataType::F32 && data.is_same_rank == false; },
- REGISTER_FP32_NEON(arm_compute::cpu::neon_f32_select_not_same_rank)
- },
+static const SelectKernelSelector available_kernels[] = {
+ {"neon_s8_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::S8 && data.is_same_rank == true; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s8_select_same_rank)},
+ {"neon_s16_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::S16 && data.is_same_rank == true; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s16_select_same_rank)},
+ {"neon_s32_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::S32 && data.is_same_rank == true; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s32_select_same_rank)},
+ {"neon_u8_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::U8 && data.is_same_rank == true; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u8_select_same_rank)},
+ {"neon_u16_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::U16 && data.is_same_rank == true; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u16_select_same_rank)},
+ {"neon_u32_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::U32 && data.is_same_rank == true; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u32_select_same_rank)},
+ {"neon_s8_not_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::S8 && data.is_same_rank == false; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s8_select_not_same_rank)},
+ {"neon_s16_not_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::S16 && data.is_same_rank == false; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s16_select_not_same_rank)},
+ {"neon_s32_not_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::S32 && data.is_same_rank == false; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s32_select_not_same_rank)},
+ {"neon_u8_not_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::U8 && data.is_same_rank == false; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u8_select_not_same_rank)},
+ {"neon_u16_not_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::U16 && data.is_same_rank == false; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u16_select_not_same_rank)},
+ {"neon_u32_not_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::U32 && data.is_same_rank == false; },
+ REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u32_select_not_same_rank)},
+ {"neon_f16_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::F16 && data.is_same_rank == true; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_f16_select_same_rank)},
+ {"neon_f16_not_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::F16 && data.is_same_rank == false; },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_f16_select_not_same_rank)},
+ {"neon_f32_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::F32 && data.is_same_rank == true; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_f32_select_same_rank)},
+ {"neon_f32_not_same_rank",
+ [](const SelectKernelSelectorData &data) { return data.dt == DataType::F32 && data.is_same_rank == false; },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_f32_select_not_same_rank)},
};
const SelectKernelSelector *get_implementation(const SelectKernelSelectorData &data)
{
- for(const auto &uk : available_kernels)
+ for (const auto &uk : available_kernels)
{
- if(uk.is_selected(data))
+ if (uk.is_selected(data))
{
return &uk;
}
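The table above implements first-match dispatch: each entry pairs a human-readable name, a predicate over the runtime data, and a micro-kernel pointer, and get_implementation returns the first entry whose predicate accepts. A minimal sketch of the same pattern with hypothetical names (capture-less lambdas decay to function pointers):

struct Data
{
    int  dt;
    bool same_rank;
};
using Pred = bool (*)(const Data &);
using Fn   = void (*)();

struct Entry
{
    const char *name;
    Pred        is_selected;
    Fn          ukernel;
};

static void run_a() {}
static void run_b() {}

static const Entry table[] = {
    {"a", [](const Data &d) { return d.dt == 0 && d.same_rank; }, run_a},
    {"b", [](const Data &d) { return d.dt == 1; }, run_b},
};

const Entry *pick(const Data &d)
{
    for (const auto &e : table)
    {
        if (e.is_selected(d))
        {
            return &e; // first match wins, so table order encodes priority
        }
    }
    return nullptr;
}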
@@ -184,7 +151,8 @@ void NESelectKernel::configure(const ITensor *c, const ITensor *x, const ITensor
INEKernel::configure(win);
}
-Status NESelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output)
+Status
+NESelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(c, x, y);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(x);
@@ -195,9 +163,11 @@ Status NESelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, cons
const bool is_same_rank = (c->tensor_shape().num_dimensions() == x->tensor_shape().num_dimensions());
ARM_COMPUTE_RETURN_ERROR_ON(is_same_rank && (x->tensor_shape() != c->tensor_shape()));
- ARM_COMPUTE_RETURN_ERROR_ON(!is_same_rank && ((c->tensor_shape().num_dimensions() > 1) || (c->tensor_shape().x() != x->tensor_shape()[x->tensor_shape().num_dimensions() - 1])));
+ ARM_COMPUTE_RETURN_ERROR_ON(!is_same_rank &&
+ ((c->tensor_shape().num_dimensions() > 1) ||
+ (c->tensor_shape().x() != x->tensor_shape()[x->tensor_shape().num_dimensions() - 1])));
- if(output != nullptr && output->total_size() != 0)
+ if (output != nullptr && output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(x, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(x, output);
@@ -214,7 +184,7 @@ void NESelectKernel::run(const Window &window, const ThreadInfo &info)
ARM_COMPUTE_ERROR_ON(_output == nullptr);
ARM_COMPUTE_ERROR_ON(_output->info() == nullptr);
- const auto *uk = get_implementation(SelectKernelSelectorData{ _output->info()->data_type(), _has_same_rank });
+ const auto *uk = get_implementation(SelectKernelSelectorData{_output->info()->data_type(), _has_same_rank});
ARM_COMPUTE_ERROR_ON(uk == nullptr);
ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
uk->ukernel(_c, _x, _y, _output, window);
diff --git a/src/core/NEON/kernels/NESelectKernel.h b/src/core/NEON/kernels/NESelectKernel.h
index e82105a68e..4fec42b536 100644
--- a/src/core/NEON/kernels/NESelectKernel.h
+++ b/src/core/NEON/kernels/NESelectKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NESELECTKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/NEON/INEKernel.h"
namespace arm_compute
@@ -82,7 +83,6 @@ public:
void run(const Window &window, const ThreadInfo &info) override;
private:
-
const ITensor *_c; /**< Condition tensor */
const ITensor *_x; /**< Source tensor 1 */
const ITensor *_y; /**< Source tensor 2 */
diff --git a/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp
index 673eace3c1..da023aeb96 100644
--- a/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp
+++ b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp
@@ -26,11 +26,12 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include <arm_neon.h>
#include <cstdint>
@@ -41,19 +42,22 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_info, const ITensorInfo *paddings, const ITensorInfo *output)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *block_info,
+ const ITensorInfo *paddings,
+ const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_info, paddings, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(block_info, 1, DataType::S32);
ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
ARM_COMPUTE_RETURN_ERROR_ON(block_info->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(block_info->tensor_shape(), TensorShape{ 2 });
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(block_info->tensor_shape(), TensorShape{2});
ARM_COMPUTE_RETURN_ERROR_ON(paddings->num_dimensions() > 2);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(paddings->tensor_shape(), TensorShape{ 2, 2 });
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(paddings->tensor_shape(), TensorShape{2, 2});
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
const DataLayout data_layout = input->data_layout();
const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
@@ -64,7 +68,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf
return Status{};
}
-Status validate_arguments_static(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+Status validate_arguments_static(const ITensorInfo *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
@@ -73,9 +81,10 @@ Status validate_arguments_static(const ITensorInfo *input, const int block_shape
ARM_COMPUTE_RETURN_ERROR_ON(block_shape_x < 1 || block_shape_y < 1);
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- TensorShape expected_output_shape = misc::shape_calculator::compute_space_to_batch_shape(input, block_shape_x, block_shape_y, padding_left, padding_right);
+ TensorShape expected_output_shape = misc::shape_calculator::compute_space_to_batch_shape(
+ input, block_shape_x, block_shape_y, padding_left, padding_right);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), expected_output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
@@ -86,14 +95,25 @@ Status validate_arguments_static(const ITensorInfo *input, const int block_shape
} // namespace
NESpaceToBatchLayerKernel::NESpaceToBatchLayerKernel()
- : _input(nullptr), _block_shape(nullptr), _paddings(nullptr), _output(nullptr), _data_layout(DataLayout::UNKNOWN), _padding_left(), _block_shape_x(), _block_shape_y()
+ : _input(nullptr),
+ _block_shape(nullptr),
+ _paddings(nullptr),
+ _output(nullptr),
+ _data_layout(DataLayout::UNKNOWN),
+ _padding_left(),
+ _block_shape_x(),
+ _block_shape_y()
{
}
-void NESpaceToBatchLayerKernel::configure(const ITensor *input, const ITensor *block_shape, const ITensor *paddings, ITensor *output)
+void NESpaceToBatchLayerKernel::configure(const ITensor *input,
+ const ITensor *block_shape,
+ const ITensor *paddings,
+ ITensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), block_shape->info(), paddings->info(), output->info()));
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input->info(), block_shape->info(), paddings->info(), output->info()));
_input = input;
_block_shape = block_shape;
@@ -106,15 +126,22 @@ void NESpaceToBatchLayerKernel::configure(const ITensor *input, const ITensor *b
ICPPKernel::configure(win);
}
-void NESpaceToBatchLayerKernel::configure(const ITensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
- ITensor *output)
+void NESpaceToBatchLayerKernel::configure(const ITensor *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ ITensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- TensorShape output_shape = misc::shape_calculator::compute_space_to_batch_shape(input->info(), block_shape_x, block_shape_y, padding_left, padding_right);
- auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->quantization_info());
+ TensorShape output_shape = misc::shape_calculator::compute_space_to_batch_shape(
+ input->info(), block_shape_x, block_shape_y, padding_left, padding_right);
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(),
+ input->info()->quantization_info());
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, padding_left, padding_right, output->info()));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, padding_left,
+ padding_right, output->info()));
_input = input;
_output = output;
@@ -128,15 +155,23 @@ void NESpaceToBatchLayerKernel::configure(const ITensor *input, const int block_
INEKernel::configure(win);
}
-Status NESpaceToBatchLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output)
+Status NESpaceToBatchLayerKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *block_shape,
+ const ITensorInfo *paddings,
+ const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, block_shape, paddings, output));
return Status{};
}
-Status NESpaceToBatchLayerKernel::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+Status NESpaceToBatchLayerKernel::validate(const ITensorInfo *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_static(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_arguments_static(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
return Status{};
}
@@ -146,17 +181,17 @@ void NESpaceToBatchLayerKernel::run(const Window &window, const ThreadInfo &info
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
- if(_block_shape != nullptr)
+ if (_block_shape != nullptr)
{
// Retrieve the block shapes dynamically
_block_shape_x = *(reinterpret_cast<const int *>(_block_shape->ptr_to_element(0)));
_block_shape_y = *(reinterpret_cast<const int *>(_block_shape->ptr_to_element(1)));
}
- if(_paddings != nullptr)
+ if (_paddings != nullptr)
{
- const size_t pad_left_x = *reinterpret_cast<const size_t *>(_paddings->ptr_to_element({ 0, 0 }));
- const size_t pad_left_y = *reinterpret_cast<const size_t *>(_paddings->ptr_to_element({ 1, 0 }));
+ const size_t pad_left_x = *reinterpret_cast<const size_t *>(_paddings->ptr_to_element({0, 0}));
+ const size_t pad_left_y = *reinterpret_cast<const size_t *>(_paddings->ptr_to_element({1, 0}));
_padding_left = Size2D(pad_left_x, pad_left_y);
}
const int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
@@ -173,57 +208,61 @@ void NESpaceToBatchLayerKernel::run(const Window &window, const ThreadInfo &info
int batch_id = 0;
// Main loop for NCHW and NHWC
- if(_data_layout == DataLayout::NCHW)
+ if (_data_layout == DataLayout::NCHW)
{
do
{
Iterator out(_output, slice_out);
- execute_window_loop(slice_out, [&](const Coordinates & id)
- {
- const size_t out_x = id.x();
- const size_t out_y = id.y();
- const size_t z = id.z();
- const size_t pos_x = out_x * _block_shape_x + (batch_id / batch_size) % _block_shape_x;
- const size_t pos_y = out_y * _block_shape_y + (batch_id / batch_size) / _block_shape_x;
- if(pos_y >= _padding_left.y() && pos_y < _padding_left.y() + height && pos_x >= _padding_left.x() && pos_x < _padding_left.x() + width)
+ execute_window_loop(
+ slice_out,
+ [&](const Coordinates &id)
{
- const int w = batch_id % batch_size;
- const int in_x = pos_x - _padding_left.x();
- const int in_y = pos_y - _padding_left.y();
- Coordinates input_coords{ in_x, in_y, z, w };
- memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
- }
- },
- out);
+ const size_t out_x = id.x();
+ const size_t out_y = id.y();
+ const size_t z = id.z();
+ const size_t pos_x = out_x * _block_shape_x + (batch_id / batch_size) % _block_shape_x;
+ const size_t pos_y = out_y * _block_shape_y + (batch_id / batch_size) / _block_shape_x;
+ if (pos_y >= _padding_left.y() && pos_y < _padding_left.y() + height &&
+ pos_x >= _padding_left.x() && pos_x < _padding_left.x() + width)
+ {
+ const int w = batch_id % batch_size;
+ const int in_x = pos_x - _padding_left.x();
+ const int in_y = pos_y - _padding_left.y();
+ Coordinates input_coords{in_x, in_y, z, w};
+ memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
+ }
+ },
+ out);
++batch_id;
- }
- while(window.slide_window_slice_3D(slice_out));
+ } while (window.slide_window_slice_3D(slice_out));
}
else
{
do
{
Iterator out(_output, slice_out);
- execute_window_loop(slice_out, [&](const Coordinates & id)
- {
- const size_t out_x = id.y();
- const size_t out_y = id.z();
- const size_t z = id.x();
- const size_t pos_x = out_x * _block_shape_x + (batch_id / batch_size) % _block_shape_x;
- const size_t pos_y = out_y * _block_shape_y + (batch_id / batch_size) / _block_shape_x;
- if(pos_y >= _padding_left.y() && pos_y < _padding_left.y() + height && pos_x >= _padding_left.x() && pos_x < _padding_left.x() + width)
+ execute_window_loop(
+ slice_out,
+ [&](const Coordinates &id)
{
- const int w = batch_id % batch_size;
- const int in_x = pos_x - _padding_left.x();
- const int in_y = pos_y - _padding_left.y();
- Coordinates input_coords{ z, in_x, in_y, w };
- memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
- }
- },
- out);
+ const size_t out_x = id.y();
+ const size_t out_y = id.z();
+ const size_t z = id.x();
+ const size_t pos_x = out_x * _block_shape_x + (batch_id / batch_size) % _block_shape_x;
+ const size_t pos_y = out_y * _block_shape_y + (batch_id / batch_size) / _block_shape_x;
+ if (pos_y >= _padding_left.y() && pos_y < _padding_left.y() + height &&
+ pos_x >= _padding_left.x() && pos_x < _padding_left.x() + width)
+ {
+ const int w = batch_id % batch_size;
+ const int in_x = pos_x - _padding_left.x();
+ const int in_y = pos_y - _padding_left.y();
+ Coordinates input_coords{z, in_x, in_y, w};
+ memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
+ }
+ },
+ out);
++batch_id;
- }
- while(window.slide_window_slice_3D(slice_out));
+ } while (window.slide_window_slice_3D(slice_out));
}
}
} // namespace arm_compute
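Both branches of NESpaceToBatchLayerKernel::run map an output element back to an input coordinate via a block decomposition of the batch index. A scalar sketch of the NCHW rule, assuming batch_size is the input batch count (hypothetical helper, not part of the library):

struct SrcCoord
{
    long x;
    long y;
    bool in_bounds; // false for elements that fall in the zero padding
};

SrcCoord space_to_batch_src(long out_x, long out_y, long batch_id, long batch_size,
                            long block_x, long block_y, long pad_x, long pad_y,
                            long in_w, long in_h)
{
    const long block_id = batch_id / batch_size;
    const long pos_x    = out_x * block_x + block_id % block_x;
    const long pos_y    = out_y * block_y + block_id / block_x;
    const bool inside   = pos_x >= pad_x && pos_x < pad_x + in_w &&
                          pos_y >= pad_y && pos_y < pad_y + in_h;
    return {pos_x - pad_x, pos_y - pad_y, inside};
}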
diff --git a/src/core/NEON/kernels/NESpaceToBatchLayerKernel.h b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.h
index 44b8cbb514..6292c07136 100644
--- a/src/core/NEON/kernels/NESpaceToBatchLayerKernel.h
+++ b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NESPACETOBATCHLAYERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/NEON/INEKernel.h"
namespace arm_compute
@@ -69,7 +70,12 @@ public:
* @param[in] padding_right The padding at the end of every dimension of the output tensor.
* @param[out] output Tensor output. Data types supported: same as @p input
*/
- void configure(const ITensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ITensor *output);
+ void configure(const ITensor *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ ITensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref NESpaceToBatchLayerKernel
*
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
@@ -79,7 +85,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *block_shape,
+ const ITensorInfo *paddings,
+ const ITensorInfo *output);
/** Static function to check if given info will lead to a valid configuration of @ref NESpaceToBatchLayerKernel (Static block shape and paddings)
*
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
@@ -91,7 +100,12 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ const ITensorInfo *output);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
diff --git a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp
index 7687c50c40..b49c5ee344 100644
--- a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp
+++ b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp
@@ -26,11 +26,12 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include <arm_neon.h>
#include <cstdint>
@@ -50,7 +51,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
ARM_COMPUTE_RETURN_ERROR_ON(block_shape < 1);
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
const DataLayout data_layout = input->data_layout();
const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
@@ -115,43 +116,45 @@ void NESpaceToDepthLayerKernel::run(const Window &window, const ThreadInfo &info
int batch_id = 0;
// Main loop for NCHW and NHWC
- if(_data_layout == DataLayout::NCHW)
+ if (_data_layout == DataLayout::NCHW)
{
do
{
Iterator out(_output, slice_out);
- execute_window_loop(slice_out, [&](const Coordinates & id)
- {
- const size_t channel_id = id.z();
- const size_t in_x = id.x() * _block_shape + (channel_id / channel_size) % _block_shape;
- const size_t in_y = id.y() * _block_shape + (channel_id / channel_size) / _block_shape;
- const int z = channel_id % channel_size;
- Coordinates input_coords{ in_x, in_y, z, batch_id };
- memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
- },
- out);
+ execute_window_loop(
+ slice_out,
+ [&](const Coordinates &id)
+ {
+ const size_t channel_id = id.z();
+ const size_t in_x = id.x() * _block_shape + (channel_id / channel_size) % _block_shape;
+ const size_t in_y = id.y() * _block_shape + (channel_id / channel_size) / _block_shape;
+ const int z = channel_id % channel_size;
+ Coordinates input_coords{in_x, in_y, z, batch_id};
+ memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
+ },
+ out);
++batch_id;
- }
- while(window.slide_window_slice_3D(slice_out));
+ } while (window.slide_window_slice_3D(slice_out));
}
else
{
do
{
Iterator out(_output, slice_out);
- execute_window_loop(slice_out, [&](const Coordinates & id)
- {
- const size_t channel_id = id.x();
- const size_t in_x = id.y() * _block_shape + (channel_id / channel_size) % _block_shape;
- const size_t in_y = id.z() * _block_shape + (channel_id / channel_size) / _block_shape;
- const int z = channel_id % channel_size;
- Coordinates input_coords{ z, in_x, in_y, batch_id };
- memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
- },
- out);
+ execute_window_loop(
+ slice_out,
+ [&](const Coordinates &id)
+ {
+ const size_t channel_id = id.x();
+ const size_t in_x = id.y() * _block_shape + (channel_id / channel_size) % _block_shape;
+ const size_t in_y = id.z() * _block_shape + (channel_id / channel_size) / _block_shape;
+ const int z = channel_id % channel_size;
+ Coordinates input_coords{z, in_x, in_y, batch_id};
+ memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
+ },
+ out);
++batch_id;
- }
- while(window.slide_window_slice_3D(slice_out));
+ } while (window.slide_window_slice_3D(slice_out));
}
}
} // namespace arm_compute
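The space-to-depth loops use the same block decomposition, but on the channel index instead of the batch index. A scalar sketch of the coordinate rule, assuming channel_size is the input channel count (hypothetical helper):

struct DepthSrc
{
    long x;
    long y;
    long c;
};

DepthSrc space_to_depth_src(long out_x, long out_y, long channel_id,
                            long channel_size, long block)
{
    const long block_id = channel_id / channel_size;
    return {out_x * block + block_id % block, // in_x
            out_y * block + block_id / block, // in_y
            channel_id % channel_size};       // z
}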
diff --git a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.h b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.h
index 953b68a401..7d147c5b94 100644
--- a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.h
+++ b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NESPACETODEPTHLAYERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/NEON/INEKernel.h"
namespace arm_compute
diff --git a/src/core/NEON/kernels/NEStackLayerKernel.cpp b/src/core/NEON/kernels/NEStackLayerKernel.cpp
index 93080e2ac7..e23b40a9aa 100644
--- a/src/core/NEON/kernels/NEStackLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEStackLayerKernel.cpp
@@ -25,13 +25,13 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -41,7 +41,11 @@ using namespace arm_compute::misc::shape_calculator;
namespace
{
-Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output)
+Status validate_arguments(const ITensorInfo *input,
+ unsigned int axis,
+ unsigned int idx_input,
+ unsigned int num_tensors,
+ const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
// Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions.
@@ -50,9 +54,10 @@ Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned
ARM_COMPUTE_RETURN_ERROR_ON(axis > input->num_dimensions());
ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_stack_shape(*input, axis, num_tensors));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(),
+ compute_stack_shape(*input, axis, num_tensors));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
}
@@ -60,7 +65,8 @@ Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, unsigned int axis, unsigned int num_tensors, ITensorInfo *output)
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *input, unsigned int axis, unsigned int num_tensors, ITensorInfo *output)
{
// Output auto initialization if not yet initialized
auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_stack_shape(*input, axis, num_tensors)));
@@ -71,11 +77,12 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, unsi
return std::make_pair(Status{}, win);
}
-inline Coordinates shift_from_axis_and_replace_coordinate(const Coordinates &id, unsigned int axis, unsigned int idx_input)
+inline Coordinates
+shift_from_axis_and_replace_coordinate(const Coordinates &id, unsigned int axis, unsigned int idx_input)
{
constexpr int max_out_coord = 5; // Input shape is at most 4D, the output at most 5D
Coordinates id_out = id;
- for(unsigned int i = max_out_coord - 1; i > axis; --i)
+ for (unsigned int i = max_out_coord - 1; i > axis; --i)
{
id_out.set(i, id[i - 1]);
}
@@ -84,12 +91,12 @@ inline Coordinates shift_from_axis_and_replace_coordinate(const Coordinates &id,
}
} // namespace
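shift_from_axis_and_replace_coordinate opens a slot at the stack axis by shifting every higher dimension up by one; the elided tail of the function presumably writes idx_input into that slot. A standalone restatement under that assumption:

#include <array>

std::array<int, 5> shift_coord(const std::array<int, 4> &id, unsigned axis, unsigned idx_input)
{
    std::array<int, 5> out{};
    for (unsigned i = 0; i < 5; ++i)
    {
        if (i < axis)
            out[i] = id[i]; // dimensions below the stack axis are untouched
        else if (i == axis)
            out[i] = static_cast<int>(idx_input); // this input's position along the stack axis
        else
            out[i] = id[i - 1]; // dimensions above shift up by one
    }
    return out;
}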
-NEStackLayerKernel::NEStackLayerKernel()
- : _input(nullptr), _output(nullptr), _axis(), _idx_input()
+NEStackLayerKernel::NEStackLayerKernel() : _input(nullptr), _output(nullptr), _axis(), _idx_input()
{
}
-void NEStackLayerKernel::configure(const ITensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ITensor *output)
+void NEStackLayerKernel::configure(
+ const ITensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ITensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), axis, idx_input, num_tensors, output->info()));
@@ -106,10 +113,15 @@ void NEStackLayerKernel::configure(const ITensor *input, unsigned int axis, unsi
INEKernel::configure(win_config.second);
}
-Status NEStackLayerKernel::validate(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output)
+Status NEStackLayerKernel::validate(const ITensorInfo *input,
+ unsigned int axis,
+ unsigned int idx_input,
+ unsigned int num_tensors,
+ const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, axis, idx_input, num_tensors, output));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), axis, num_tensors, output->clone().get()).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(input->clone().get(), axis, num_tensors, output->clone().get()).first);
return Status{};
}
@@ -131,12 +143,15 @@ void NEStackLayerKernel::run(const Window &window, const ThreadInfo &info)
const int stride_w = _output->info()->num_dimensions() >= 3 ? _output->info()->strides_in_bytes()[3] : 0;
const int stride_k = _output->info()->num_dimensions() >= 4 ? _output->info()->strides_in_bytes()[4] : 0;
- execute_window_loop(window, [&](const Coordinates & id)
- {
- Coordinates id_out = shift_from_axis_and_replace_coordinate(id, _axis, _idx_input);
- const int idx = id_out[0] * stride_x + id_out[1] * stride_y + id_out[2] * stride_z + id_out[3] * stride_w + id_out[4] * stride_k;
- std::memcpy(output.ptr() + idx, input.ptr(), _input->info()->element_size());
- },
- input);
+ execute_window_loop(
+ window,
+ [&](const Coordinates &id)
+ {
+ Coordinates id_out = shift_from_axis_and_replace_coordinate(id, _axis, _idx_input);
+ const int idx = id_out[0] * stride_x + id_out[1] * stride_y + id_out[2] * stride_z + id_out[3] * stride_w +
+ id_out[4] * stride_k;
+ std::memcpy(output.ptr() + idx, input.ptr(), _input->info()->element_size());
+ },
+ input);
}
} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEStackLayerKernel.h b/src/core/NEON/kernels/NEStackLayerKernel.h
index 9b36518e4d..685812b56d 100644
--- a/src/core/NEON/kernels/NEStackLayerKernel.h
+++ b/src/core/NEON/kernels/NEStackLayerKernel.h
@@ -26,6 +26,7 @@
#define ARM_COMPUTE_NESTACKLAYERKERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/NEON/INEKernel.h"
namespace arm_compute
@@ -64,7 +65,8 @@ public:
* @param[out] output Output tensor. Data types supported: Same as @p input.
*
*/
- void configure(const ITensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ITensor *output);
+ void configure(
+ const ITensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ITensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref NEStackLayerKernel
*
* @note Supported input tensor rank: up to 4
@@ -78,7 +80,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *input,
+ unsigned int axis,
+ unsigned int idx_input,
+ unsigned int num_tensors,
+ const ITensorInfo *output);
// Inherited methods overridden
void run(const Window &window, const ThreadInfo &info) override;
diff --git a/src/core/NEON/kernels/NEStridedSliceKernel.cpp b/src/core/NEON/kernels/NEStridedSliceKernel.cpp
index 2b406a8b8b..efff51be9d 100644
--- a/src/core/NEON/kernels/NEStridedSliceKernel.cpp
+++ b/src/core/NEON/kernels/NEStridedSliceKernel.cpp
@@ -26,9 +26,10 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Window.h"
#include "arm_compute/core/utils/helpers/tensor_transform.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Window.h"
+
#include "src/core/CPP/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -38,9 +39,14 @@ namespace arm_compute
{
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+Status validate_arguments(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
@@ -49,19 +55,16 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
ARM_COMPUTE_RETURN_ERROR_ON(starts.num_dimensions() > input->num_dimensions());
ARM_COMPUTE_RETURN_ERROR_ON(ends.num_dimensions() > input->num_dimensions());
ARM_COMPUTE_RETURN_ERROR_ON(strides.num_dimensions() > input->num_dimensions());
- ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(strides.cbegin(), strides.cbegin() + strides.num_dimensions(), [](int i)
- {
- return i == 0;
- }));
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ std::any_of(strides.cbegin(), strides.cbegin() + strides.num_dimensions(), [](int i) { return i == 0; }));
// Get expected output shape
- const TensorShape exp_output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(*input,
- starts, ends, strides,
- begin_mask, end_mask, shrink_axis_mask);
+ const TensorShape exp_output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(
+ *input, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
ARM_COMPUTE_RETURN_ERROR_ON(exp_output_shape.total_size() == 0);
// Checks output if configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
const TensorInfo exp_output_info = output->clone()->set_tensor_shape(exp_output_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &exp_output_info);
@@ -71,14 +74,18 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input, ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input,
+ ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
// Output tensor auto initialization if not yet initialized
- const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(*input,
- starts, ends, strides,
- begin_mask, end_mask, shrink_axis_mask);
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(
+ *input, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape));
// Create window
@@ -88,38 +95,49 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input
}
} // namespace
-NEStridedSliceKernel::NEStridedSliceKernel()
- : _starts_abs(), _final_strides(), _shrink_mask()
+NEStridedSliceKernel::NEStridedSliceKernel() : _starts_abs(), _final_strides(), _shrink_mask()
{
}
-void NEStridedSliceKernel::configure(const ITensorInfo *input, ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+void NEStridedSliceKernel::configure(const ITensorInfo *input,
+ ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
_shrink_mask = shrink_axis_mask;
const TensorShape &input_shape = input->tensor_shape();
Coordinates ends_abs;
- std::tie(_starts_abs, ends_abs, _final_strides) = arm_compute::helpers::tensor_transform::calculate_strided_slice_coords(
- input_shape,
- starts, ends, strides,
- begin_mask, end_mask, shrink_axis_mask);
+ std::tie(_starts_abs, ends_abs, _final_strides) =
+ arm_compute::helpers::tensor_transform::calculate_strided_slice_coords(input_shape, starts, ends, strides,
+ begin_mask, end_mask, shrink_axis_mask);
// Configure kernel window
- auto win_config = validate_and_configure_window(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+ auto win_config =
+ validate_and_configure_window(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
INEKernel::configure(win_config.second);
}
-Status NEStridedSliceKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+Status NEStridedSliceKernel::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(),
- starts, ends, strides, begin_mask, end_mask, shrink_axis_mask)
- .first);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), starts, ends,
+ strides, begin_mask, end_mask, shrink_axis_mask)
+ .first);
return Status{};
}
@@ -156,7 +174,7 @@ void NEStridedSliceKernel::run_op(ITensorPack &tensors, const Window &window, co
size_t length_x = win.shape()[0];
- if(_final_strides[0] == 1 && !is_shrink_x)
+ if (_final_strides[0] == 1 && !is_shrink_x)
{
win.set(Window::DimX, Window::Dimension(0, 1, 1));
width_size = width_size * length_x;
@@ -183,16 +201,17 @@ void NEStridedSliceKernel::run_op(ITensorPack &tensors, const Window &window, co
uint8_t *cur_ptr;
execute_window_loop(
- win, [&](const Coordinates & id)
- {
- cur_ptr = input_base;
- cur_ptr += (start_0 + (id[idx_x] * shrinked_stride_0)) * byte_increment_0;
- cur_ptr += (start_1 + (id[idx_y] * shrinked_stride_1)) * byte_increment_1;
- cur_ptr += (start_2 + (id[idx_z] * shrinked_stride_2)) * byte_increment_2;
- cur_ptr += (start_3 + (id[idx_w] * shrinked_stride_3)) * byte_increment_3;
-
- std::copy_n(cur_ptr, width_size, output_it.ptr());
- },
- output_it);
+ win,
+ [&](const Coordinates &id)
+ {
+ cur_ptr = input_base;
+ cur_ptr += (start_0 + (id[idx_x] * shrinked_stride_0)) * byte_increment_0;
+ cur_ptr += (start_1 + (id[idx_y] * shrinked_stride_1)) * byte_increment_1;
+ cur_ptr += (start_2 + (id[idx_z] * shrinked_stride_2)) * byte_increment_2;
+ cur_ptr += (start_3 + (id[idx_w] * shrinked_stride_3)) * byte_increment_3;
+
+ std::copy_n(cur_ptr, width_size, output_it.ptr());
+ },
+ output_it);
}
} // namespace arm_compute
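After coordinate normalisation, each output element of the strided slice reads input[start + i * stride] per dimension, which is what the pointer walk in run_op does four dimensions at a time. A 1-D sketch of that gather, assuming strides have already been made positive (the kernel also handles negative strides, which this sketch omits):

#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<int32_t> strided_slice_1d(const std::vector<int32_t> &in,
                                      long start, long stride, long out_len)
{
    std::vector<int32_t> out(static_cast<std::size_t>(out_len));
    for (long i = 0; i < out_len; ++i)
    {
        out[static_cast<std::size_t>(i)] = in[static_cast<std::size_t>(start + i * stride)];
    }
    return out;
}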
diff --git a/src/core/NEON/kernels/NEStridedSliceKernel.h b/src/core/NEON/kernels/NEStridedSliceKernel.h
index 9ce517417d..a475f09a17 100644
--- a/src/core/NEON/kernels/NEStridedSliceKernel.h
+++ b/src/core/NEON/kernels/NEStridedSliceKernel.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NE_STRIDED_SLICE_KERNEL_H
#include "arm_compute/core/Types.h"
+
#include "src/core/NEON/INEKernel.h"
#include <cstdint>
@@ -68,9 +69,14 @@ public:
* @param[in] shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
* A slice of size 1 starting from starts[i] in the dimension must be preserved.
*/
- void configure(const ITensorInfo *input, ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask);
+ void configure(const ITensorInfo *input,
+ ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask);
/** Static function to check if given info will lead to a valid configuration of @ref NEStridedSliceKernel
*
@@ -86,9 +92,14 @@ public:
* @param[in] shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
* A slice of size 1 starting from starts[i] in the dimension must be preserved.
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
diff --git a/src/core/NEON/kernels/NETileKernel.cpp b/src/core/NEON/kernels/NETileKernel.cpp
index 94256dc12d..577ce5b69e 100644
--- a/src/core/NEON/kernels/NETileKernel.cpp
+++ b/src/core/NEON/kernels/NETileKernel.cpp
@@ -27,9 +27,10 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
@@ -43,15 +44,13 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON(multiples.size() > 4);
ARM_COMPUTE_RETURN_ERROR_ON(multiples.empty());
- ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(multiples.begin(), multiples.end(), [](uint32_t e)
- {
- return e == 0;
- }));
+ ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(multiples.begin(), multiples.end(), [](uint32_t e) { return e == 0; }));
// Validate output if initialized
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(misc::shape_calculator::compute_tiled_shape(input->tensor_shape(), multiples), output->tensor_shape());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+ misc::shape_calculator::compute_tiled_shape(input->tensor_shape(), multiples), output->tensor_shape());
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
@@ -59,8 +58,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
}
} // namespace
-NETileKernel::NETileKernel()
- : _input(nullptr), _output(nullptr)
+NETileKernel::NETileKernel() : _input(nullptr), _output(nullptr)
{
}
@@ -95,8 +93,9 @@ void NETileKernel::run(const Window &window, const ThreadInfo &info)
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- Window output_window{ window };
- output_window.set(Window::DimX, Window::Dimension(output_window.x().start(), output_window.x().end(), _input->info()->dimension(0)));
+ Window output_window{window};
+ output_window.set(Window::DimX, Window::Dimension(output_window.x().start(), output_window.x().end(),
+ _input->info()->dimension(0)));
Window out_slice = output_window.first_slice_window_1D();
const auto src_shape = _input->info()->tensor_shape();
@@ -104,17 +103,19 @@ void NETileKernel::run(const Window &window, const ThreadInfo &info)
{
Iterator output_it(_output, out_slice);
- execute_window_loop(out_slice, [&](const Coordinates & id)
- {
- const size_t x = id.x();
- const size_t y = id.y();
- const size_t z = id.z();
- const size_t w = id[3];
- Coordinates input_coords{ x % src_shape[0], y % src_shape[1], z % src_shape[2], w % src_shape[3] };
- memcpy(output_it.ptr(), _input->ptr_to_element(input_coords), _input->info()->dimension(0) * _input->info()->element_size());
- },
- output_it);
- }
- while(output_window.slide_window_slice_1D(out_slice));
+ execute_window_loop(
+ out_slice,
+ [&](const Coordinates &id)
+ {
+ const size_t x = id.x();
+ const size_t y = id.y();
+ const size_t z = id.z();
+ const size_t w = id[3];
+ Coordinates input_coords{x % src_shape[0], y % src_shape[1], z % src_shape[2], w % src_shape[3]};
+ memcpy(output_it.ptr(), _input->ptr_to_element(input_coords),
+ _input->info()->dimension(0) * _input->info()->element_size());
+ },
+ output_it);
+ } while (output_window.slide_window_slice_1D(out_slice));
}
} // namespace arm_compute
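The tile loop wraps every output coordinate back into the source shape with a modulo before copying a full row. A 1-D sketch of the wrap rule (hypothetical helper):

#include <cstddef>
#include <vector>

std::vector<float> tile_1d(const std::vector<float> &src, std::size_t multiple)
{
    std::vector<float> dst(src.size() * multiple);
    for (std::size_t i = 0; i < dst.size(); ++i)
    {
        dst[i] = src[i % src.size()]; // input coordinate = output coordinate mod src shape
    }
    return dst;
}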
diff --git a/src/core/NEON/kernels/assembly/depthwise.hpp b/src/core/NEON/kernels/assembly/depthwise.hpp
index dbd47ccfa9..13c2d314e4 100644
--- a/src/core/NEON/kernels/assembly/depthwise.hpp
+++ b/src/core/NEON/kernels/assembly/depthwise.hpp
@@ -38,9 +38,8 @@ struct DepthwiseConfig
DepthwiseMethod method = DepthwiseMethod::DEFAULT;
std::string filter = "";
- DepthwiseConfig(DepthwiseMethod method)
- : method(method) {};
- DepthwiseConfig() {};
+ DepthwiseConfig(DepthwiseMethod method) : method(method){};
+ DepthwiseConfig(){};
};
struct DepthwiseArgs
@@ -63,18 +62,24 @@ struct DepthwiseArgs
bool fast_mode = false;
- DepthwiseArgs(
- const CPUInfo *cpu_info,
- unsigned int kernel_rows, unsigned int kernel_cols,
- unsigned int stride_rows, unsigned int stride_cols,
- unsigned int dilation_rows, unsigned int dilation_cols,
- unsigned int n_batches, unsigned int input_rows, unsigned int input_cols,
- unsigned int input_channels,
- unsigned int output_rows, unsigned int output_cols,
- unsigned int channel_multiplier,
- PaddingValues padding, arm_gemm::Activation activation,
-
- const DepthwiseConfig *config)
+ DepthwiseArgs(const CPUInfo *cpu_info,
+ unsigned int kernel_rows,
+ unsigned int kernel_cols,
+ unsigned int stride_rows,
+ unsigned int stride_cols,
+ unsigned int dilation_rows,
+ unsigned int dilation_cols,
+ unsigned int n_batches,
+ unsigned int input_rows,
+ unsigned int input_cols,
+ unsigned int input_channels,
+ unsigned int output_rows,
+ unsigned int output_cols,
+ unsigned int channel_multiplier,
+ PaddingValues padding,
+ arm_gemm::Activation activation,
+
+ const DepthwiseConfig *config)
: cpu_info(cpu_info),
kernel_rows(kernel_rows),
kernel_cols(kernel_cols),
@@ -95,20 +100,38 @@ struct DepthwiseArgs
{
}
- DepthwiseArgs(
- const CPUInfo *cpu_info,
- unsigned int kernel_rows, unsigned int kernel_cols,
- unsigned int stride_rows, unsigned int stride_cols,
- unsigned int n_batches, unsigned int input_rows, unsigned int input_cols,
- unsigned int input_channels,
- unsigned int output_rows, unsigned int output_cols,
- unsigned int channel_multiplier,
- PaddingValues padding, arm_gemm::Activation activation,
- const DepthwiseConfig *config)
- : DepthwiseArgs(cpu_info, kernel_rows, kernel_cols, stride_rows,
- stride_cols, 1, 1, n_batches, input_rows, input_cols,
- input_channels, output_rows, output_cols,
- channel_multiplier, padding, activation, config)
+ DepthwiseArgs(const CPUInfo *cpu_info,
+ unsigned int kernel_rows,
+ unsigned int kernel_cols,
+ unsigned int stride_rows,
+ unsigned int stride_cols,
+ unsigned int n_batches,
+ unsigned int input_rows,
+ unsigned int input_cols,
+ unsigned int input_channels,
+ unsigned int output_rows,
+ unsigned int output_cols,
+ unsigned int channel_multiplier,
+ PaddingValues padding,
+ arm_gemm::Activation activation,
+ const DepthwiseConfig *config)
+ : DepthwiseArgs(cpu_info,
+ kernel_rows,
+ kernel_cols,
+ stride_rows,
+ stride_cols,
+ 1,
+ 1,
+ n_batches,
+ input_rows,
+ input_cols,
+ input_channels,
+ output_rows,
+ output_cols,
+ channel_multiplier,
+ padding,
+ activation,
+ config)
{
}
};
@@ -127,17 +150,18 @@ struct Tile
{
}
- Tile()
- : Tile(nullptr, 0, 0, 0)
+ Tile() : Tile(nullptr, 0, 0, 0)
{
}
- void load_from(
- const TInput *input,
- const unsigned int ld_row, const unsigned int ld_col,
- const unsigned int n_rows, const unsigned int n_cols,
- const int input_i, const int input_j,
- const unsigned int channel_multiplier) const
+ void load_from(const TInput *input,
+ const unsigned int ld_row,
+ const unsigned int ld_col,
+ const unsigned int n_rows,
+ const unsigned int n_cols,
+ const int input_i,
+ const int input_j,
+ const unsigned int channel_multiplier) const
{
const auto pad_top = input_i < 0 ? -input_i : 0;
const auto pad_left = input_j < 0 ? -input_j : 0;
@@ -145,18 +169,15 @@ struct Tile
const auto padded_rows = std::min(n_rows - input_i, tile_rows) - pad_top;
const auto padded_cols = std::min(n_cols - input_j, tile_cols) - pad_left;
- if(padded_rows < tile_rows || padded_cols < tile_cols)
+ if (padded_rows < tile_rows || padded_cols < tile_cols)
{
memset(array, 0, tile_rows * tile_cols * tile_channels * sizeof(TInput));
}
- do_premultiply<TInput>(
- (TInput *)input + std::max(input_i, 0) * ld_row + std::max(input_j, 0) * ld_col,
- ld_row, ld_col,
- array + pad_top * tile_cols * tile_channels + pad_left * tile_channels,
- tile_cols * tile_channels, tile_channels,
- padded_rows, padded_cols, tile_channels / channel_multiplier,
- channel_multiplier);
+ do_premultiply<TInput>((TInput *)input + std::max(input_i, 0) * ld_row + std::max(input_j, 0) * ld_col, ld_row,
+ ld_col, array + pad_top * tile_cols * tile_channels + pad_left * tile_channels,
+ tile_cols * tile_channels, tile_channels, padded_rows, padded_cols,
+ tile_channels / channel_multiplier, channel_multiplier);
}
};
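The padding arithmetic in Tile::load_from can be checked by hand. A small standalone sketch (plain int types for brevity; the real code mixes unsigned and signed as shown above) that reproduces the pad_top/padded_rows computation for two sample offsets:

#include <algorithm>
#include <initializer_list>
#include <iostream>

int main()
{
    const int tile_rows = 4; // rows in the tile
    const int n_rows    = 5; // valid rows in the input
    for (int input_i : {-1, 3})
    {
        const int pad_top     = input_i < 0 ? -input_i : 0;
        const int padded_rows = std::min(n_rows - input_i, tile_rows) - pad_top;
        // input_i = -1 -> pad_top = 1, padded_rows = 3 (one zeroed row on top)
        // input_i =  3 -> pad_top = 0, padded_rows = 2 (two zeroed rows stay at the bottom)
        std::cout << "input_i=" << input_i << " pad_top=" << pad_top
                  << " padded_rows=" << padded_rows << "\n";
    }
}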
@@ -168,9 +189,8 @@ protected:
std::string m_name{};
public:
- DepthwiseCommon(const DepthwiseArgs &args)
- : m_args(args) {};
- DepthwiseCommon(DepthwiseCommon &) = delete;
+ DepthwiseCommon(const DepthwiseArgs &args) : m_args(args){};
+ DepthwiseCommon(DepthwiseCommon &) = delete;
DepthwiseCommon &operator=(DepthwiseCommon &) = delete;
std::string name() const override
@@ -181,19 +201,18 @@ public:
void set_name(std::string name)
{
// Only allow the name to be set once
- if(m_name.empty())
+ if (m_name.empty())
{
m_name = name;
}
}
- void execute(
- const void *const input,
- const void *const parameters,
- void *const output,
- void *const working_space,
- const unsigned int thread_id,
- const unsigned int n_threads) const override final
+ void execute(const void *const input,
+ const void *const parameters,
+ void *const output,
+ void *const working_space,
+ const unsigned int thread_id,
+ const unsigned int n_threads) const override final
{
const size_t ld_input_col = m_args.input_channels;
const size_t ld_input_row = ld_input_col * m_args.input_cols;
@@ -202,56 +221,47 @@ public:
const size_t ld_output_row = ld_output_col * m_args.output_cols;
const size_t ld_output_batch = ld_output_row * m_args.output_rows;
- execute(
- input, ld_input_col, ld_input_row, ld_input_batch,
- parameters, output, ld_output_col, ld_output_row, ld_output_batch,
- working_space, thread_id, n_threads);
+ execute(input, ld_input_col, ld_input_row, ld_input_batch, parameters, output, ld_output_col, ld_output_row,
+ ld_output_batch, working_space, thread_id, n_threads);
}
- void execute(
- const void *const input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- const void *const parameters,
- void *const output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *const working_space,
- const unsigned int thread_id,
- const unsigned int n_threads) const override final
+ void execute(const void *const input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const void *const parameters,
+ void *const output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *const working_space,
+ const unsigned int thread_id,
+ const unsigned int n_threads) const override final
{
- execute(
- m_args.n_batches, m_args.input_rows, m_args.input_cols,
- m_args.input_channels, m_args.padding,
- input, ld_input_col, ld_input_row, ld_input_batch,
- parameters,
- m_args.output_rows, m_args.output_cols,
- output, ld_output_col, ld_output_row, ld_output_batch,
- working_space, thread_id, n_threads);
+ execute(m_args.n_batches, m_args.input_rows, m_args.input_cols, m_args.input_channels, m_args.padding, input,
+ ld_input_col, ld_input_row, ld_input_batch, parameters, m_args.output_rows, m_args.output_cols, output,
+ ld_output_col, ld_output_row, ld_output_batch, working_space, thread_id, n_threads);
}
- void execute(
- unsigned int batches,
- unsigned int input_height,
- unsigned int input_width,
- unsigned int channels,
- const PaddingValues &padding,
- const void *input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- const void *parameters,
- unsigned int output_height,
- unsigned int output_width,
- void *output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *working_space,
- unsigned int thread_id,
- unsigned int n_threads) const override final
+ void execute(unsigned int batches,
+ unsigned int input_height,
+ unsigned int input_width,
+ unsigned int channels,
+ const PaddingValues &padding,
+ const void *input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const void *parameters,
+ unsigned int output_height,
+ unsigned int output_width,
+ void *output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const override final
{
// Construct a new set of arguments to reflect that we might have been
// passed different input/output tensors. Dilation is handled at this
@@ -271,38 +281,33 @@ public:
auto ld_output_col_d = ld_output_col * m_args.dilation_cols;
auto ld_output_row_d = ld_output_row * m_args.dilation_rows;
- for(size_t drow = 0; drow < m_args.dilation_rows; drow++)
+ for (size_t drow = 0; drow < m_args.dilation_rows; drow++)
{
size_t start_i;
- std::tie(args.output_rows, args.input_rows, start_i,
- args.padding.top, args.padding.bottom) =
- get_reduced_view_for_dilation(
- output_height, input_height, drow, m_args.dilation_rows,
- m_args.kernel_rows, m_args.stride_rows, padding.top);
+ std::tie(args.output_rows, args.input_rows, start_i, args.padding.top, args.padding.bottom) =
+ get_reduced_view_for_dilation(output_height, input_height, drow, m_args.dilation_rows,
+ m_args.kernel_rows, m_args.stride_rows, padding.top);
auto input_row = static_cast<const TInput *>(input) + start_i * ld_input_row;
auto output_row = static_cast<TOutput *>(output) + drow * ld_output_row;
- if(args.output_rows)
+ if (args.output_rows)
{
- for(size_t dcol = 0; dcol < m_args.dilation_cols; dcol++)
+ for (size_t dcol = 0; dcol < m_args.dilation_cols; dcol++)
{
size_t start_j;
- std::tie(args.output_cols, args.input_cols, start_j,
- args.padding.left, args.padding.right) =
- get_reduced_view_for_dilation(
- output_width, input_width, dcol, m_args.dilation_cols,
- m_args.kernel_cols, m_args.stride_cols, padding.left);
+ std::tie(args.output_cols, args.input_cols, start_j, args.padding.left, args.padding.right) =
+ get_reduced_view_for_dilation(output_width, input_width, dcol, m_args.dilation_cols,
+ m_args.kernel_cols, m_args.stride_cols, padding.left);
const TInput *input_col = input_row + start_j * ld_input_col;
TOutput *output_col = output_row + dcol * ld_output_col;
- if(args.output_cols)
+ if (args.output_cols)
{
- this->execute_internal(
- args, input_col, ld_input_col_d, ld_input_row_d, ld_input_batch, parameters,
- output_col, ld_output_col_d, ld_output_row_d, ld_output_batch,
- working_space, thread_id, n_threads);
+ this->execute_internal(args, input_col, ld_input_col_d, ld_input_row_d, ld_input_batch,
+ parameters, output_col, ld_output_col_d, ld_output_row_d,
+ ld_output_batch, working_space, thread_id, n_threads);
}
}
}
@@ -310,20 +315,19 @@ public:
}
protected:
- virtual void execute_internal(
- const DepthwiseArgs &instance_args,
- const void *input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- const void *parameters,
- void *output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *working_space,
- unsigned int thread_id,
- unsigned int n_threads) const = 0;
+ virtual void execute_internal(const DepthwiseArgs &instance_args,
+ const void *input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const void *parameters,
+ void *output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const = 0;
virtual bool uses_premultiply() const
{
diff --git a/src/core/NEON/kernels/assembly/depthwise_common.hpp b/src/core/NEON/kernels/assembly/depthwise_common.hpp
index a5db793b3d..5ff848e281 100644
--- a/src/core/NEON/kernels/assembly/depthwise_common.hpp
+++ b/src/core/NEON/kernels/assembly/depthwise_common.hpp
@@ -49,11 +49,7 @@ struct KernelDescription
bool is_default = false;
uint64_t cycle_estimate = 0;
- KernelDescription(
- DepthwiseMethod method,
- std::string name,
- bool is_default,
- uint64_t cycle_estimate)
+ KernelDescription(DepthwiseMethod method, std::string name, bool is_default, uint64_t cycle_estimate)
: method(method), name(name), is_default(is_default), cycle_estimate(cycle_estimate)
{
}
@@ -78,58 +74,51 @@ public:
// pointer to the bias vector (which may be nullptr in the case of no bias) and
// a pointer to the array of weights (stored in HWIO order).
virtual void pack_parameters(
- void *buffer,
- const void *biases,
- const void *weights,
- size_t ld_weight_col = 0,
- size_t ld_weight_row = 0) = 0;
+ void *buffer, const void *biases, const void *weights, size_t ld_weight_col = 0, size_t ld_weight_row = 0) = 0;
// Determine the amount of working space required
virtual size_t get_working_size(unsigned int n_threads) const = 0;
// Execute the convolution over the specified area of memory.
- virtual void execute(
- const void *input, // Pointer to input tensor
- const void *parameters, // Packed parameters buffer
- void *output,
- void *working_space,
- unsigned int thread_id,
- unsigned int n_threads) const = 0;
-
- virtual void execute(
- const void *input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- const void *parameters,
- void *output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *working_space,
- unsigned int thread_id,
- unsigned int n_threads) const = 0;
-
- virtual void execute(
- unsigned int batches,
- unsigned int input_height,
- unsigned int input_width,
- unsigned int channels,
- const PaddingValues &,
- const void *input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- const void *parameters,
- unsigned int output_height,
- unsigned int output_width,
- void *output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *working_space,
- unsigned int thread_id,
- unsigned int n_threads) const = 0;
+ virtual void execute(const void *input, // Pointer to input tensor
+ const void *parameters, // Packed parameters buffer
+ void *output,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const = 0;
+
+ virtual void execute(const void *input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const void *parameters,
+ void *output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const = 0;
+
+ virtual void execute(unsigned int batches,
+ unsigned int input_height,
+ unsigned int input_width,
+ unsigned int channels,
+ const PaddingValues &,
+ const void *input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const void *parameters,
+ unsigned int output_height,
+ unsigned int output_width,
+ void *output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const = 0;
};
// To handle a dilation factor of D execute the kernel once for each d in
@@ -145,12 +134,13 @@ public:
// - Number of valid input pixels corresponding to `d`
// - Offset of the first pixel corresponding to `d`
// - Amount of padding in the view for `d`
-std::tuple<size_t, size_t, size_t, size_t, size_t>
-get_reduced_view_for_dilation(
- size_t out_size, size_t in_size,
- size_t d, size_t dilation_factor,
- size_t kernel_size, size_t stride,
- size_t pad_before);
+std::tuple<size_t, size_t, size_t, size_t, size_t> get_reduced_view_for_dilation(size_t out_size,
+ size_t in_size,
+ size_t d,
+ size_t dilation_factor,
+ size_t kernel_size,
+ size_t stride,
+ size_t pad_before);
} // namespace depthwise
} // namespace arm_conv
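As the comment above explains, a dilation factor of D is handled by running the underlying kernel once for each phase d in [0, D), each time over a reduced view of the tensors. The following is illustrative counting arithmetic only, not the library's get_reduced_view_for_dilation implementation: phase d sees every D-th row starting at row d, so the D sub-view sizes partition the input:

#include <iostream>

int main()
{
    const unsigned in_size = 11, D = 3;
    unsigned total = 0;
    for (unsigned d = 0; d < D; d++)
    {
        // Rows i with i % D == d and i < in_size
        const unsigned view_size = (in_size > d) ? (in_size - d + D - 1) / D : 0;
        total += view_size;
        std::cout << "phase " << d << ": " << view_size << " rows\n";
    }
    std::cout << "sum = " << total << " (== in_size)\n"; // 4 + 4 + 3 == 11
}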
diff --git a/src/core/NEON/kernels/assembly/pool_common.hpp b/src/core/NEON/kernels/assembly/pool_common.hpp
index f1f70cf1d6..045f9f95d3 100644
--- a/src/core/NEON/kernels/assembly/pool_common.hpp
+++ b/src/core/NEON/kernels/assembly/pool_common.hpp
@@ -68,45 +68,42 @@ public:
virtual size_t get_working_size(unsigned int num_threads) const = 0;
// Execute pooling over the specified area of memory.
- virtual void execute(
- const void *const input,
- void *const output,
- void *working_space,
- unsigned int thread_id,
- unsigned int num_threads) const = 0;
+ virtual void execute(const void *const input,
+ void *const output,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int num_threads) const = 0;
- virtual void execute(
- const void *const input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- void *const output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *working_space,
- unsigned int thread_id,
- unsigned int num_threads) const = 0;
+ virtual void execute(const void *const input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ void *const output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int num_threads) const = 0;
- virtual void execute(
- unsigned int batches,
- unsigned int height,
- unsigned int width,
- unsigned int channels,
- const void *const input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- const PaddingValues &,
- unsigned int output_height,
- unsigned int output_width,
- void *const output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *working_space,
- unsigned int thread_id,
- unsigned int num_threads) const = 0;
+ virtual void execute(unsigned int batches,
+ unsigned int height,
+ unsigned int width,
+ unsigned int channels,
+ const void *const input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const PaddingValues &,
+ unsigned int output_height,
+ unsigned int output_width,
+ void *const output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int num_threads) const = 0;
};
} // namespace pooling
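The ld_input_* / ld_output_* parameters in these interfaces are leading dimensions for an NHWC-style layout, i.e. offset(b, r, c, ch) = b * ld_batch + r * ld_row + c * ld_col + ch. A minimal sketch assuming a densely packed tensor, which matches the defaults PoolingCommon derives in pooling.hpp below:

#include <cstddef>
#include <iostream>

int main()
{
    const std::size_t channels = 8, width = 16, height = 12;
    const std::size_t ld_col   = channels;       // step between columns
    const std::size_t ld_row   = ld_col * width; // step between rows
    const std::size_t ld_batch = ld_row * height;

    // Element (batch 1, row 2, col 3, channel 4):
    const std::size_t offset = 1 * ld_batch + 2 * ld_row + 3 * ld_col + 4;
    std::cout << offset << "\n"; // 1536 + 256 + 24 + 4 = 1820
}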
diff --git a/src/core/NEON/kernels/assembly/pooling.hpp b/src/core/NEON/kernels/assembly/pooling.hpp
index e8db35c593..89d594298e 100644
--- a/src/core/NEON/kernels/assembly/pooling.hpp
+++ b/src/core/NEON/kernels/assembly/pooling.hpp
@@ -36,9 +36,8 @@ struct PoolingConfig
PoolingMethod method = PoolingMethod::DEFAULT;
std::string filter = "";
- PoolingConfig(PoolingMethod method)
- : method(method) {};
- PoolingConfig() {};
+ PoolingConfig(PoolingMethod method) : method(method){};
+ PoolingConfig(){};
};
struct PoolingArgs
@@ -57,30 +56,40 @@ struct PoolingArgs
const PoolingConfig *config;
- PoolingArgs(
- const CPUInfo *cpu_info,
- PoolingType pool_type,
- const PoolingWindow &window,
- const PoolingStride &stride,
- bool exclude_padding,
- unsigned int n_batches,
- unsigned int input_rows,
- unsigned int input_cols,
- unsigned int n_channels,
- unsigned int output_rows,
- unsigned int output_cols,
- const PaddingValues &padding,
- const PoolingConfig *cfg)
- : cpu_info(cpu_info), pool_type(pool_type), pool_window(window), pool_stride(stride), exclude_padding(exclude_padding), n_batches(n_batches), input_rows(input_rows), input_cols(input_cols),
- n_channels(n_channels), output_rows(output_rows), output_cols(output_cols), padding(padding), config(cfg)
+ PoolingArgs(const CPUInfo *cpu_info,
+ PoolingType pool_type,
+ const PoolingWindow &window,
+ const PoolingStride &stride,
+ bool exclude_padding,
+ unsigned int n_batches,
+ unsigned int input_rows,
+ unsigned int input_cols,
+ unsigned int n_channels,
+ unsigned int output_rows,
+ unsigned int output_cols,
+ const PaddingValues &padding,
+ const PoolingConfig *cfg)
+ : cpu_info(cpu_info),
+ pool_type(pool_type),
+ pool_window(window),
+ pool_stride(stride),
+ exclude_padding(exclude_padding),
+ n_batches(n_batches),
+ input_rows(input_rows),
+ input_cols(input_cols),
+ n_channels(n_channels),
+ output_rows(output_rows),
+ output_cols(output_cols),
+ padding(padding),
+ config(cfg)
{
        // If either of the pooling window dimensions is set to zero, meaning
// "pool everything", then replace with the corresponding input dimension.
- if(pool_window.rows == 0)
+ if (pool_window.rows == 0)
{
pool_window.rows = input_rows;
}
- if(pool_window.cols == 0)
+ if (pool_window.cols == 0)
{
pool_window.cols = input_cols;
}
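A window of 0x0 therefore requests global pooling; a minimal sketch of the substitution the constructor performs (illustrative variables only):

#include <iostream>

int main()
{
    const unsigned input_rows = 7, input_cols = 5;
    unsigned pool_rows = 0, pool_cols = 0; // 0 means "pool everything"

    if (pool_rows == 0) pool_rows = input_rows;
    if (pool_cols == 0) pool_cols = input_cols;

    std::cout << pool_rows << "x" << pool_cols << "\n"; // 7x5: one output per channel
}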
@@ -100,10 +109,16 @@ struct Requantize32
int32_t per_layer_right_shift = 0;
int32_t per_layer_mul = 0;
- Requantize32(int32_t input_offset, int32_t output_offset,
- int32_t per_layer_left_shift, int32_t per_layer_right_shift,
+ Requantize32(int32_t input_offset,
+ int32_t output_offset,
+ int32_t per_layer_left_shift,
+ int32_t per_layer_right_shift,
int32_t per_layer_mul)
- : input_offset(input_offset), output_offset(output_offset), per_layer_left_shift(per_layer_left_shift), per_layer_right_shift(per_layer_right_shift), per_layer_mul(per_layer_mul)
+ : input_offset(input_offset),
+ output_offset(output_offset),
+ per_layer_left_shift(per_layer_left_shift),
+ per_layer_right_shift(per_layer_right_shift),
+ per_layer_mul(per_layer_mul)
{
}
};
@@ -115,105 +130,88 @@ protected:
const PoolingArgs m_args;
public:
- PoolingCommon(const PoolingArgs &args)
- : m_args(args)
+ PoolingCommon(const PoolingArgs &args) : m_args(args)
{
}
- PoolingCommon(PoolingCommon &) = delete;
+ PoolingCommon(PoolingCommon &) = delete;
PoolingCommon &operator=(PoolingCommon &) = delete;
size_t get_working_size(unsigned int) const override = 0;
// Execute pooling over the specified area of memory.
- void execute(
- const void *const input,
- void *const output,
- void *working_space,
- unsigned int thread_id,
- unsigned int num_threads) const override
+ void execute(const void *const input,
+ void *const output,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int num_threads) const override
{
- this->execute(
- input,
- m_args.n_channels,
- m_args.n_channels * m_args.input_cols,
- m_args.n_channels * m_args.input_cols * m_args.input_rows,
- output,
- m_args.n_channels,
- m_args.n_channels * m_args.output_cols,
- m_args.n_channels * m_args.output_cols * m_args.output_rows,
- working_space,
- thread_id, num_threads);
+ this->execute(input, m_args.n_channels, m_args.n_channels * m_args.input_cols,
+ m_args.n_channels * m_args.input_cols * m_args.input_rows, output, m_args.n_channels,
+ m_args.n_channels * m_args.output_cols,
+ m_args.n_channels * m_args.output_cols * m_args.output_rows, working_space, thread_id,
+ num_threads);
}
- void execute(
- const void *const input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- void *const output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *working_space,
- unsigned int thread_id,
- unsigned int num_threads) const override
+ void execute(const void *const input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ void *const output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int num_threads) const override
{
- this->execute(
- m_args.n_batches, m_args.input_rows, m_args.input_cols, m_args.n_channels,
- input, ld_input_col, ld_input_row, ld_input_batch,
- m_args.padding, m_args.output_rows, m_args.output_cols,
- output, ld_output_col, ld_output_row, ld_output_batch,
- working_space, thread_id, num_threads);
+ this->execute(m_args.n_batches, m_args.input_rows, m_args.input_cols, m_args.n_channels, input, ld_input_col,
+ ld_input_row, ld_input_batch, m_args.padding, m_args.output_rows, m_args.output_cols, output,
+ ld_output_col, ld_output_row, ld_output_batch, working_space, thread_id, num_threads);
}
- void execute(
- unsigned int batches,
- unsigned int height,
- unsigned int width,
- unsigned int channels,
- const void *const input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- const PaddingValues &padding,
- unsigned int output_height,
- unsigned int output_width,
- void *const output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *working_space,
- unsigned int thread_id,
- unsigned int num_threads) const override
+ void execute(unsigned int batches,
+ unsigned int height,
+ unsigned int width,
+ unsigned int channels,
+ const void *const input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const PaddingValues &padding,
+ unsigned int output_height,
+ unsigned int output_width,
+ void *const output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int num_threads) const override
{
- this->execute_internal(
- batches, height, width, channels, padding,
- input, ld_input_col, ld_input_row, ld_input_batch,
- output_height, output_width,
- output, ld_output_col, ld_output_row, ld_output_batch,
- working_space, thread_id, num_threads);
+ this->execute_internal(batches, height, width, channels, padding, input, ld_input_col, ld_input_row,
+ ld_input_batch, output_height, output_width, output, ld_output_col, ld_output_row,
+ ld_output_batch, working_space, thread_id, num_threads);
}
protected:
- virtual void execute_internal(
- unsigned int batches,
- unsigned int height,
- unsigned int width,
- unsigned int channels,
- const PaddingValues &,
- const void *const input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- unsigned int output_height,
- unsigned int output_width,
- void *const output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *working_space,
- unsigned int thread_id,
- unsigned int num_threads) const = 0;
+ virtual void execute_internal(unsigned int batches,
+ unsigned int height,
+ unsigned int width,
+ unsigned int channels,
+ const PaddingValues &,
+ const void *const input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ unsigned int output_height,
+ unsigned int output_width,
+ void *const output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int num_threads) const = 0;
};
template <typename TInput, typename TOutput>
diff --git a/src/core/NEON/kernels/assembly/premultiply.hpp b/src/core/NEON/kernels/assembly/premultiply.hpp
index 16f26de38a..fb97cf8baf 100644
--- a/src/core/NEON/kernels/assembly/premultiply.hpp
+++ b/src/core/NEON/kernels/assembly/premultiply.hpp
@@ -44,30 +44,27 @@ void do_premultiply(const T *in_ptr,
const unsigned input_channels,
const unsigned int channel_multiplier)
{
- if(sizeof(T) == 4 && channel_multiplier == 6)
+ if (sizeof(T) == 4 && channel_multiplier == 6)
{
- do_premultiply_float_6(
- (const float *)in_ptr, ld_row, ld_col,
- (float *)out_ptr, out_ld_row, out_ld_col,
- tile_rows, tile_cols,
- input_channels);
+ do_premultiply_float_6((const float *)in_ptr, ld_row, ld_col, (float *)out_ptr, out_ld_row, out_ld_col,
+ tile_rows, tile_cols, input_channels);
}
else
{
- for(unsigned int i = 0; i < tile_rows; i++)
+ for (unsigned int i = 0; i < tile_rows; i++)
{
const T *ip2 = in_ptr + i * ld_row;
T *op2 = out_ptr + i * out_ld_row;
- for(unsigned int j = 0; j < tile_cols; j++)
+ for (unsigned int j = 0; j < tile_cols; j++)
{
const T *ip = ip2;
T *op = op2;
- for(unsigned int c = 0; c < input_channels; c++)
+ for (unsigned int c = 0; c < input_channels; c++)
{
T val = *ip;
ip++;
- for(unsigned int r = 0; r < channel_multiplier; r++)
+ for (unsigned int r = 0; r < channel_multiplier; r++)
{
op[r] = val;
}
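The scalar fallback above replicates each input channel channel_multiplier times, expanding C input channels into C * channel_multiplier output channels. A self-contained sketch of that replication for a single 3-channel pixel:

#include <iostream>
#include <vector>

int main()
{
    const unsigned input_channels = 3, channel_multiplier = 2;
    const std::vector<float> in = {1.f, 2.f, 3.f};
    std::vector<float> out(input_channels * channel_multiplier);

    float *op = out.data();
    for (unsigned c = 0; c < input_channels; c++)
    {
        const float val = in[c];
        for (unsigned r = 0; r < channel_multiplier; r++)
        {
            *op++ = val; // each channel value is written channel_multiplier times
        }
    }
    for (float v : out)
    {
        std::cout << v << " "; // 1 1 2 2 3 3
    }
    std::cout << "\n";
}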
diff --git a/src/core/NEON/kernels/assembly/winograd.hpp b/src/core/NEON/kernels/assembly/winograd.hpp
index 50290757ec..dbf95d23cd 100644
--- a/src/core/NEON/kernels/assembly/winograd.hpp
+++ b/src/core/NEON/kernels/assembly/winograd.hpp
@@ -45,17 +45,24 @@ struct ConvolutionArgs
Shape2D kernel_shape;
arm_gemm::Activation activation;
- ConvolutionArgs(
- unsigned int n_batches,
- const Shape2D &input_shape,
- unsigned int n_input_channels,
- unsigned int pad_top, unsigned int pad_left,
- const Shape2D &output_shape,
- unsigned int n_output_channels,
- const Shape2D kernel_shape,
- const arm_gemm::Activation &activation = {})
- : n_batches(n_batches), input_shape(input_shape), n_input_channels(n_input_channels), pad_top(pad_top), pad_left(pad_left), output_shape(output_shape), n_output_channels(n_output_channels),
- kernel_shape(kernel_shape), activation(activation)
+ ConvolutionArgs(unsigned int n_batches,
+ const Shape2D &input_shape,
+ unsigned int n_input_channels,
+ unsigned int pad_top,
+ unsigned int pad_left,
+ const Shape2D &output_shape,
+ unsigned int n_output_channels,
+ const Shape2D kernel_shape,
+ const arm_gemm::Activation &activation = {})
+ : n_batches(n_batches),
+ input_shape(input_shape),
+ n_input_channels(n_input_channels),
+ pad_top(pad_top),
+ pad_left(pad_left),
+ output_shape(output_shape),
+ n_output_channels(n_output_channels),
+ kernel_shape(kernel_shape),
+ activation(activation)
{
}
};
@@ -105,23 +112,30 @@ public:
virtual unsigned int get_transformed_tile_rows(void) const = 0;
virtual unsigned int get_transformed_tile_cols(void) const = 0;
- void execute(
- const ConvolutionArgs &args,
- const void *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_input_channel,
- void *outptr, const WinogradDomainSpec &wds,
- unsigned int thread_id, unsigned int n_threads) const
+ void execute(const ConvolutionArgs &args,
+ const void *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_input_channel,
+ void *outptr,
+ const WinogradDomainSpec &wds,
+ unsigned int thread_id,
+ unsigned int n_threads) const
{
- this->execute(
- args, inptr, ld_in_row, ld_in_col, ld_input_channel,
- outptr, wds.weight_ld_matrix, wds.weight_ld_row,
- thread_id, n_threads);
+ this->execute(args, inptr, ld_in_row, ld_in_col, ld_input_channel, outptr, wds.weight_ld_matrix,
+ wds.weight_ld_row, thread_id, n_threads);
}
- virtual void execute(
- const ConvolutionArgs &args,
- const void *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_input_channel,
- void *outptr, size_t ld_out_matrix, size_t ld_out_row,
- unsigned int thread_id, unsigned int n_threads) const = 0;
+ virtual void execute(const ConvolutionArgs &args,
+ const void *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_input_channel,
+ void *outptr,
+ size_t ld_out_matrix,
+ size_t ld_out_row,
+ unsigned int thread_id,
+ unsigned int n_threads) const = 0;
};
} // namespace weight_transform
@@ -136,27 +150,35 @@ public:
virtual unsigned int get_input_rows(void) const = 0;
virtual unsigned int get_input_cols(void) const = 0;
- virtual size_t get_working_space_size(
- const ConvolutionArgs &args,
- unsigned int n_threads) const = 0;
-
- void execute(
- const ConvolutionArgs &args,
- const void *inptr, size_t ld_in_batch, size_t ld_in_row, size_t ld_in_col,
- void *outptr, const WinogradDomainSpec &wds,
- void *working_space, unsigned int thread_id, unsigned int n_threads) const
+ virtual size_t get_working_space_size(const ConvolutionArgs &args, unsigned int n_threads) const = 0;
+
+ void execute(const ConvolutionArgs &args,
+ const void *inptr,
+ size_t ld_in_batch,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ void *outptr,
+ const WinogradDomainSpec &wds,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const
{
- this->execute(
- args, inptr, ld_in_batch, ld_in_row, ld_in_col,
- outptr, wds.input_ld_batch, wds.input_ld_matrix, wds.input_ld_row,
- working_space, thread_id, n_threads);
+ this->execute(args, inptr, ld_in_batch, ld_in_row, ld_in_col, outptr, wds.input_ld_batch, wds.input_ld_matrix,
+ wds.input_ld_row, working_space, thread_id, n_threads);
}
- virtual void execute(
- const ConvolutionArgs &args,
- const void *inptr, size_t ld_in_batch, size_t ld_in_row, size_t ld_in_col,
- void *outptr, size_t ld_out_batch, size_t ld_out_matrix, size_t ld_out_row,
- void *working_space, unsigned int thread_id, unsigned int n_threads) const = 0;
+ virtual void execute(const ConvolutionArgs &args,
+ const void *inptr,
+ size_t ld_in_batch,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ void *outptr,
+ size_t ld_out_batch,
+ size_t ld_out_matrix,
+ size_t ld_out_row,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const = 0;
};
} // namespace input_transform
@@ -177,31 +199,37 @@ public:
virtual unsigned int get_kernel_rows(void) const = 0;
virtual unsigned int get_kernel_cols(void) const = 0;
- virtual size_t get_working_space_size(
- const ConvolutionArgs &args,
- unsigned int n_threads) const = 0;
-
- void execute(
- const ConvolutionArgs &args,
- const void *inptr, const WinogradDomainSpec &wds,
- const void *bias,
- void *outptr, size_t ld_out_batch, size_t ld_out_row, size_t ld_out_col,
- void *working_space, unsigned int thread_id, unsigned int n_threads) const
+ virtual size_t get_working_space_size(const ConvolutionArgs &args, unsigned int n_threads) const = 0;
+
+ void execute(const ConvolutionArgs &args,
+ const void *inptr,
+ const WinogradDomainSpec &wds,
+ const void *bias,
+ void *outptr,
+ size_t ld_out_batch,
+ size_t ld_out_row,
+ size_t ld_out_col,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const
{
- this->execute(
- args,
- inptr, wds.output_ld_batch, wds.output_ld_matrix, wds.output_ld_row,
- bias,
- outptr, ld_out_batch, ld_out_row, ld_out_col,
- working_space, thread_id, n_threads);
+ this->execute(args, inptr, wds.output_ld_batch, wds.output_ld_matrix, wds.output_ld_row, bias, outptr,
+ ld_out_batch, ld_out_row, ld_out_col, working_space, thread_id, n_threads);
}
- virtual void execute(
- const ConvolutionArgs &args,
- const void *inptr, size_t ld_in_batch, size_t ld_in_matrix, size_t ld_in_row,
- const void *bias,
- void *outptr, size_t ld_out_batch, size_t ld_out_row, size_t ld_out_col,
- void *working_space, unsigned int thread_id, unsigned int n_threads) const = 0;
+ virtual void execute(const ConvolutionArgs &args,
+ const void *inptr,
+ size_t ld_in_batch,
+ size_t ld_in_matrix,
+ size_t ld_in_row,
+ const void *bias,
+ void *outptr,
+ size_t ld_out_batch,
+ size_t ld_out_row,
+ size_t ld_out_col,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const = 0;
};
} // namespace output_transform
@@ -210,7 +238,7 @@ struct WinogradImpl
{
const output_transform::ITransform *output_transform = nullptr;
const weight_transform::ITransform *weight_transform = nullptr;
- const input_transform::ITransform *input_transform = nullptr;
+ const input_transform::ITransform *input_transform = nullptr;
std::unique_ptr<arm_gemm::GemmArgs> gemm_args;
WinogradDomainSpec winograd_spec;
};
@@ -220,15 +248,18 @@ struct WinogradImpl
* Assigns to the pointers in the `dest` struct and returns true or false to
* indicate whether the given problem can be executed or not.
*/
-template <typename TIn, typename TWeight = TIn, typename TOut = TIn, typename TWinogradIn = TIn, typename TWinogradOut = TOut>
-bool get_implementation(
- WinogradImpl &dest, // Destination for the selected implementation
- const CPUInfo *,
- const ConvolutionArgs &,
- int max_threads,
- bool fast_mode,
- const WinogradConfig *,
- const arm_gemm::GemmConfig *);
+template <typename TIn,
+ typename TWeight = TIn,
+ typename TOut = TIn,
+ typename TWinogradIn = TIn,
+ typename TWinogradOut = TOut>
+bool get_implementation(WinogradImpl &dest, // Destination for the selected implementation
+ const CPUInfo *,
+ const ConvolutionArgs &,
+ int max_threads,
+ bool fast_mode,
+ const WinogradConfig *,
+ const arm_gemm::GemmConfig *);
} // namespace winograd
} // namespace arm_conv
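For background only (standard Winograd tile arithmetic, not anything specific to this header): producing an m x m output tile with a k x k kernel consumes an (m + k - 1) x (m + k - 1) input tile, which is the kind of size the get_input_rows() / get_transformed_tile_rows() queries above report per transform:

#include <iostream>

int main()
{
    const unsigned output_tile = 4, kernel = 3;
    const unsigned input_tile  = output_tile + kernel - 1; // 6 for F(4x4, 3x3)
    std::cout << "F(" << output_tile << "," << kernel << ") input tile: "
              << input_tile << "x" << input_tile << "\n";
}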
diff --git a/src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp b/src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp
index ed5254a0a4..e3d9b670b3 100644
--- a/src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp
+++ b/src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp
@@ -24,8 +24,9 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEMath.h"
+
#include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h"
+#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
#include <arm_neon.h>
@@ -37,12 +38,26 @@ namespace arm_compute
{
namespace
{
-using BatchNomalizationPtr = void (*)(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma,
- float epsilon, ActivationLayerInfo &act_info, const Window &window);
+using BatchNomalizationPtr = void (*)(ITensor *src,
+ ITensor *dst,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta,
+ const ITensor *gamma,
+ float epsilon,
+ ActivationLayerInfo &act_info,
+ const Window &window);
template <typename T>
-void batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma,
- float epsilon, ActivationLayerInfo &act_info, const Window &window)
+void batch_normalization(ITensor *src,
+ ITensor *dst,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta,
+ const ITensor *gamma,
+ float epsilon,
+ ActivationLayerInfo &act_info,
+ const Window &window)
{
/** SIMD vector tag type. */
using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<float16_t, wrapper::traits::BitWidth::W128>;
@@ -57,86 +72,99 @@ void batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const
Iterator input(src, win_collapsed);
Iterator output(dst, win_collapsed);
- const auto input_mean = reinterpret_cast<const float16_t *>(mean->ptr_to_element(Coordinates(0, 0)));
- const auto input_var = reinterpret_cast<const float16_t *>(var->ptr_to_element(Coordinates(0, 0)));
- const auto input_gamma = (gamma != nullptr) ? reinterpret_cast<const float16_t *>(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
- const auto input_beta = (beta != nullptr) ? reinterpret_cast<const float16_t *>(beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_mean = reinterpret_cast<const float16_t *>(mean->ptr_to_element(Coordinates(0, 0)));
+ const auto input_var = reinterpret_cast<const float16_t *>(var->ptr_to_element(Coordinates(0, 0)));
+ const auto input_gamma =
+ (gamma != nullptr) ? reinterpret_cast<const float16_t *>(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_beta =
+ (beta != nullptr) ? reinterpret_cast<const float16_t *>(beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
T activation_functor(act_info);
const auto epsilon_vec = wrapper::vdup_n(static_cast<float16_t>(epsilon), ExactTagType{});
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const float16_t *>(input.ptr());
- const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
-
- // Perform core calculations using vector operations
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
{
- // Conctruct vectors
- const auto mean_vec = wrapper::vloadq(input_mean + x);
- const auto var_vec = wrapper::vloadq(input_var + x);
- const auto gamma_vec = (input_gamma != nullptr) ? wrapper::vloadq(input_gamma + x) : wrapper::vdup_n(static_cast<float16_t>(1.f), ExactTagType{});
- const auto beta_vec = (input_beta != nullptr) ? wrapper::vloadq(input_beta + x) : wrapper::vdup_n(static_cast<float16_t>(0.f), ExactTagType{});
-
- // Calculate denominator
- const auto denominator = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
-
- // Calculate x bar
- const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec);
- const auto x_bar = wrapper::vmul(numerator, denominator);
- auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec);
-
- // Perform fused activation
- if(act_info.enabled())
+ const auto input_ptr = reinterpret_cast<const float16_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
+
+ // Perform core calculations using vector operations
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- activation_functor(res);
+            // Construct vectors
+ const auto mean_vec = wrapper::vloadq(input_mean + x);
+ const auto var_vec = wrapper::vloadq(input_var + x);
+ const auto gamma_vec = (input_gamma != nullptr)
+ ? wrapper::vloadq(input_gamma + x)
+ : wrapper::vdup_n(static_cast<float16_t>(1.f), ExactTagType{});
+ const auto beta_vec = (input_beta != nullptr)
+ ? wrapper::vloadq(input_beta + x)
+ : wrapper::vdup_n(static_cast<float16_t>(0.f), ExactTagType{});
+
+ // Calculate denominator
+ const auto denominator = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
+
+ // Calculate x bar
+ const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec);
+ const auto x_bar = wrapper::vmul(numerator, denominator);
+ auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec);
+
+ // Perform fused activation
+ if (act_info.enabled())
+ {
+ activation_functor(res);
+ }
+
+ // Store results
+ wrapper::vstore(output_ptr + x, res);
}
- // Store results
- wrapper::vstore(output_ptr + x, res);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- // Conctruct vectors
- const float16_t gamma = (input_gamma != nullptr) ? input_gamma[x] : 1.f;
- const float16_t beta = (input_beta != nullptr) ? input_beta[x] : 0.f;
-
- const float16_t denominator = sqrt(input_var[x] + epsilon);
- const float16_t numerator = input_ptr[x] - input_mean[x];
- const float16_t x_bar = numerator / denominator;
- float16_t res = beta + x_bar * gamma;
-
- // Perform fused activation
- if(act_info.enabled())
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- activation_functor(res);
+            // Construct vectors
+ const float16_t gamma = (input_gamma != nullptr) ? input_gamma[x] : 1.f;
+ const float16_t beta = (input_beta != nullptr) ? input_beta[x] : 0.f;
+
+ const float16_t denominator = sqrt(input_var[x] + epsilon);
+ const float16_t numerator = input_ptr[x] - input_mean[x];
+ const float16_t x_bar = numerator / denominator;
+ float16_t res = beta + x_bar * gamma;
+
+ // Perform fused activation
+ if (act_info.enabled())
+ {
+ activation_functor(res);
+ }
+
+ // Store results
+ *reinterpret_cast<float16_t *>(output_ptr + x) = res;
}
-
- // Store results
- *reinterpret_cast<float16_t *>(output_ptr + x) = res;
- }
- },
- input, output);
+ },
+ input, output);
}
// Fused Batched Normalization with activation functions
-static std::map<ActivationLayerInfo::ActivationFunction, BatchNomalizationPtr> fused_map =
-{
- { ActivationLayerInfo::ActivationFunction::RELU, &batch_normalization<detail::relu<float16_t, 8>> },
- { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &batch_normalization<detail::brelu<float16_t, 8>> },
- { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &batch_normalization<detail::lubrelu<float16_t, 8>> }
-};
-}
+static std::map<ActivationLayerInfo::ActivationFunction, BatchNomalizationPtr> fused_map = {
+ {ActivationLayerInfo::ActivationFunction::RELU, &batch_normalization<detail::relu<float16_t, 8>>},
+ {ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &batch_normalization<detail::brelu<float16_t, 8>>},
+ {ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &batch_normalization<detail::lubrelu<float16_t, 8>>}};
+} // namespace
namespace cpu
{
-void fp16_neon_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma,
- float epsilon, ActivationLayerInfo &act_info, const Window &window)
+void fp16_neon_batch_normalization(ITensor *src,
+ ITensor *dst,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta,
+ const ITensor *gamma,
+ float epsilon,
+ ActivationLayerInfo &act_info,
+ const Window &window)
{
- if(act_info.enabled())
+ if (act_info.enabled())
{
fused_map[act_info.activation()](src, dst, mean, var, beta, gamma, epsilon, act_info, window);
}
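Both the vectorised loop and the leftover-elements loop above implement the same scalar formula. A standalone reference with illustrative values, assuming a fused BOUNDED_RELU with a = 6 for the clamp:

#include <algorithm>
#include <cmath>
#include <iostream>

int main()
{
    const float x = 2.f, mean = 1.f, var = 4.f, eps = 1e-5f;
    const float gamma = 0.5f, beta = 0.25f;

    // x_bar = (x - mean) / sqrt(var + eps); res = beta + x_bar * gamma
    const float x_bar = (x - mean) / std::sqrt(var + eps);
    float res = beta + x_bar * gamma; // ~0.25 + 0.5 * 0.5 = ~0.5

    // A fused BOUNDED_RELU with a = 6 clamps the result to [0, 6].
    const float a = 6.f;
    res = std::min(a, std::max(0.f, res));
    std::cout << res << "\n";
}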
diff --git a/src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp b/src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp
index d6e22e1843..4e1654ee6b 100644
--- a/src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp
+++ b/src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp
@@ -24,8 +24,9 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEMath.h"
+
#include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h"
+#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
#include <arm_neon.h>
@@ -36,12 +37,26 @@ namespace arm_compute
{
namespace
{
-using BatchNomalizationPtr = void (*)(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma,
- float epsilon, ActivationLayerInfo &act_info, const Window &window);
+using BatchNomalizationPtr = void (*)(ITensor *src,
+ ITensor *dst,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta,
+ const ITensor *gamma,
+ float epsilon,
+ ActivationLayerInfo &act_info,
+ const Window &window);
template <typename T>
-void batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma,
- float epsilon, ActivationLayerInfo &act_info, const Window &window)
+void batch_normalization(ITensor *src,
+ ITensor *dst,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta,
+ const ITensor *gamma,
+ float epsilon,
+ ActivationLayerInfo &act_info,
+ const Window &window)
{
/** SIMD vector tag type. */
using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<float, wrapper::traits::BitWidth::W128>;
@@ -56,86 +71,99 @@ void batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const
Iterator input(src, win_collapsed);
Iterator output(dst, win_collapsed);
- const auto input_mean = reinterpret_cast<const float *>(mean->ptr_to_element(Coordinates(0, 0)));
- const auto input_var = reinterpret_cast<const float *>(var->ptr_to_element(Coordinates(0, 0)));
- const auto input_gamma = (gamma != nullptr) ? reinterpret_cast<const float *>(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
- const auto input_beta = (beta != nullptr) ? reinterpret_cast<const float *>(beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_mean = reinterpret_cast<const float *>(mean->ptr_to_element(Coordinates(0, 0)));
+ const auto input_var = reinterpret_cast<const float *>(var->ptr_to_element(Coordinates(0, 0)));
+ const auto input_gamma =
+ (gamma != nullptr) ? reinterpret_cast<const float *>(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_beta =
+ (beta != nullptr) ? reinterpret_cast<const float *>(beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
T activation_functor(act_info);
const auto epsilon_vec = wrapper::vdup_n(static_cast<float>(epsilon), ExactTagType{});
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const float *>(input.ptr());
- const auto output_ptr = reinterpret_cast<float *>(output.ptr());
-
- // Perform core calculations using vector operations
- int x = window_start_x;
- for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
{
- // Conctruct vectors
- const auto mean_vec = wrapper::vloadq(input_mean + x);
- const auto var_vec = wrapper::vloadq(input_var + x);
- const auto gamma_vec = (input_gamma != nullptr) ? wrapper::vloadq(input_gamma + x) : wrapper::vdup_n(static_cast<float>(1.f), ExactTagType{});
- const auto beta_vec = (input_beta != nullptr) ? wrapper::vloadq(input_beta + x) : wrapper::vdup_n(static_cast<float>(0.f), ExactTagType{});
-
- // Calculate denominator
- const auto denominator = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
-
- // Calculate x bar
- const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec);
- const auto x_bar = wrapper::vmul(numerator, denominator);
- auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec);
-
- // Perform fused activation
- if(act_info.enabled())
+ const auto input_ptr = reinterpret_cast<const float *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<float *>(output.ptr());
+
+ // Perform core calculations using vector operations
+ int x = window_start_x;
+ for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
- activation_functor(res);
+            // Construct vectors
+ const auto mean_vec = wrapper::vloadq(input_mean + x);
+ const auto var_vec = wrapper::vloadq(input_var + x);
+ const auto gamma_vec = (input_gamma != nullptr)
+ ? wrapper::vloadq(input_gamma + x)
+ : wrapper::vdup_n(static_cast<float>(1.f), ExactTagType{});
+ const auto beta_vec = (input_beta != nullptr)
+ ? wrapper::vloadq(input_beta + x)
+ : wrapper::vdup_n(static_cast<float>(0.f), ExactTagType{});
+
+ // Calculate denominator
+ const auto denominator = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
+
+ // Calculate x bar
+ const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec);
+ const auto x_bar = wrapper::vmul(numerator, denominator);
+ auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec);
+
+ // Perform fused activation
+ if (act_info.enabled())
+ {
+ activation_functor(res);
+ }
+
+ // Store results
+ wrapper::vstore(output_ptr + x, res);
}
- // Store results
- wrapper::vstore(output_ptr + x, res);
- }
-
- // Compute left-over elements
- for(; x < window_end_x; ++x)
- {
- // Conctruct vectors
- const float gamma = (input_gamma != nullptr) ? input_gamma[x] : 1.f;
- const float beta = (input_beta != nullptr) ? input_beta[x] : 0.f;
-
- const float denominator = sqrt(input_var[x] + epsilon);
- const float numerator = input_ptr[x] - input_mean[x];
- const float x_bar = numerator / denominator;
- float res = beta + x_bar * gamma;
-
- // Perform fused activation
- if(act_info.enabled())
+ // Compute left-over elements
+ for (; x < window_end_x; ++x)
{
- activation_functor(res);
+            // Construct vectors
+ const float gamma = (input_gamma != nullptr) ? input_gamma[x] : 1.f;
+ const float beta = (input_beta != nullptr) ? input_beta[x] : 0.f;
+
+ const float denominator = sqrt(input_var[x] + epsilon);
+ const float numerator = input_ptr[x] - input_mean[x];
+ const float x_bar = numerator / denominator;
+ float res = beta + x_bar * gamma;
+
+ // Perform fused activation
+ if (act_info.enabled())
+ {
+ activation_functor(res);
+ }
+
+ // Store results
+ *reinterpret_cast<float *>(output_ptr + x) = res;
}
-
- // Store results
- *reinterpret_cast<float *>(output_ptr + x) = res;
- }
- },
- input, output);
+ },
+ input, output);
}
// Fused Batched Normalization with activation functions
-static std::map<ActivationLayerInfo::ActivationFunction, BatchNomalizationPtr> fused_map =
-{
- { ActivationLayerInfo::ActivationFunction::RELU, &batch_normalization<detail::relu<float, 4>> },
- { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &batch_normalization<detail::brelu<float, 4>> },
- { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &batch_normalization<detail::lubrelu<float, 4>> }
-};
-}
+static std::map<ActivationLayerInfo::ActivationFunction, BatchNomalizationPtr> fused_map = {
+ {ActivationLayerInfo::ActivationFunction::RELU, &batch_normalization<detail::relu<float, 4>>},
+ {ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &batch_normalization<detail::brelu<float, 4>>},
+ {ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &batch_normalization<detail::lubrelu<float, 4>>}};
+} // namespace
namespace cpu
{
-void fp32_neon_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma,
- float epsilon, ActivationLayerInfo &act_info, const Window &window)
+void fp32_neon_batch_normalization(ITensor *src,
+ ITensor *dst,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta,
+ const ITensor *gamma,
+ float epsilon,
+ ActivationLayerInfo &act_info,
+ const Window &window)
{
- if(act_info.enabled())
+ if (act_info.enabled())
{
fused_map[act_info.activation()](src, dst, mean, var, beta, gamma, epsilon, act_info, window);
}
diff --git a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp
index 98cd9aa7fe..48caaa3e63 100644
--- a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp
+++ b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/core/NEON/SVEMath.h"
#include <cmath>
@@ -37,8 +38,15 @@ namespace arm_compute
{
namespace cpu
{
-void fp16_sve_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma,
- float epsilon, ActivationLayerInfo &act_info, const Window &window)
+void fp16_sve_batch_normalization(ITensor *src,
+ ITensor *dst,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta,
+ const ITensor *gamma,
+ float epsilon,
+ ActivationLayerInfo &act_info,
+ const Window &window)
{
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
@@ -49,69 +57,74 @@ void fp16_sve_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mea
Iterator input(src, win_collapsed);
Iterator output(dst, win_collapsed);
- const auto input_mean = reinterpret_cast<const float16_t *>(mean->ptr_to_element(Coordinates(0, 0)));
- const auto input_var = reinterpret_cast<const float16_t *>(var->ptr_to_element(Coordinates(0, 0)));
- const auto input_gamma = (gamma != nullptr) ? reinterpret_cast<const float16_t *>(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
- const auto input_beta = (beta != nullptr) ? reinterpret_cast<const float16_t *>(beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_mean = reinterpret_cast<const float16_t *>(mean->ptr_to_element(Coordinates(0, 0)));
+ const auto input_var = reinterpret_cast<const float16_t *>(var->ptr_to_element(Coordinates(0, 0)));
+ const auto input_gamma =
+ (gamma != nullptr) ? reinterpret_cast<const float16_t *>(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_beta =
+ (beta != nullptr) ? reinterpret_cast<const float16_t *>(beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
const auto epsilon_vec = svdup_n_f16(epsilon);
const auto const_1 = svdup_n_f16(1.f);
const auto const_0 = svdup_n_f16(0.f);
const auto va = svdup_n_f16(act_info.a());
const auto vb = svdup_n_f16(act_info.b());
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const float16_t *>(input.ptr());
- const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- svbool_t pg = svwhilelt_b16(x, window_end_x);
- do
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
{
- // Conctruct vectors
- const auto mean_vec = svld1_f16(pg, input_mean + x);
- const auto var_vec = svld1_f16(pg, input_var + x);
- const auto gamma_vec = (input_gamma != nullptr) ? svld1_f16(pg, input_gamma + x) : const_1;
- const auto beta_vec = (input_beta != nullptr) ? svld1_f16(pg, input_beta + x) : const_0;
+ const auto input_ptr = reinterpret_cast<const float16_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
- // Calculate denominator
- const auto tmp = svadd_f16_z(pg, var_vec, epsilon_vec);
- auto denominator = svrsqrte_f16(tmp);
- denominator = svmul_f16_z(pg, svrsqrts_f16(svmul_f16_z(pg, tmp, denominator), denominator), denominator);
- denominator = svmul_f16_z(pg, svrsqrts_f16(svmul_f16_z(pg, tmp, denominator), denominator), denominator);
+ // Compute S elements per iteration
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b16(x, window_end_x);
+ do
+ {
+                // Construct vectors
+ const auto mean_vec = svld1_f16(pg, input_mean + x);
+ const auto var_vec = svld1_f16(pg, input_var + x);
+ const auto gamma_vec = (input_gamma != nullptr) ? svld1_f16(pg, input_gamma + x) : const_1;
+ const auto beta_vec = (input_beta != nullptr) ? svld1_f16(pg, input_beta + x) : const_0;
- // Calculate x bar
- const auto numerator = svsub_f16_z(pg, svld1_f16(pg, input_ptr + x), mean_vec);
- const auto x_bar = svmul_f16_z(pg, numerator, denominator);
- auto res = svmla_f16_z(pg, beta_vec, x_bar, gamma_vec);
+ // Calculate denominator
+ const auto tmp = svadd_f16_z(pg, var_vec, epsilon_vec);
+ auto denominator = svrsqrte_f16(tmp);
+ denominator =
+ svmul_f16_z(pg, svrsqrts_f16(svmul_f16_z(pg, tmp, denominator), denominator), denominator);
+ denominator =
+ svmul_f16_z(pg, svrsqrts_f16(svmul_f16_z(pg, tmp, denominator), denominator), denominator);
- // Perform fused activation
- if(act_info.enabled())
- {
- if(act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
- {
- res = svmax_f16_z(pg, const_0, res);
- }
- else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
- {
- res = svmin_f16_z(pg, va, svmax_f16_z(pg, const_0, res));
- }
- else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ // Calculate x bar
+ const auto numerator = svsub_f16_z(pg, svld1_f16(pg, input_ptr + x), mean_vec);
+ const auto x_bar = svmul_f16_z(pg, numerator, denominator);
+ auto res = svmla_f16_z(pg, beta_vec, x_bar, gamma_vec);
+
+ // Perform fused activation
+ if (act_info.enabled())
{
- res = svmin_f16_z(pg, va, svmax_f16_z(pg, vb, res));
+ if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
+ {
+ res = svmax_f16_z(pg, const_0, res);
+ }
+ else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+ {
+ res = svmin_f16_z(pg, va, svmax_f16_z(pg, const_0, res));
+ }
+ else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ {
+ res = svmin_f16_z(pg, va, svmax_f16_z(pg, vb, res));
+ }
}
- }
- // Store results
- svst1_f16(pg, output_ptr + x, res);
+ // Store results
+ svst1_f16(pg, output_ptr + x, res);
- x += svcntw();
- pg = svwhilelt_b16(x, window_end_x);
- }
- while(svptest_any(svptrue_b16(), pg));
- },
- input, output);
+ x += svcntw();
+ pg = svwhilelt_b16(x, window_end_x);
+ } while (svptest_any(svptrue_b16(), pg));
+ },
+ input, output);
}
} // namespace cpu
} // namespace arm_compute
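The SVE kernels above use the predicated-loop idiom: svwhilelt builds a predicate covering the elements that remain, so no scalar tail loop is needed. A minimal standalone sketch of the same idiom (requires an SVE-capable compiler and target, e.g. -march=armv8-a+sve; this is an illustration, not library code):

#include <arm_sve.h>

void add_one(const float *in, float *out, int n)
{
    int x = 0;
    svbool_t pg = svwhilelt_b32(x, n);
    const svfloat32_t one = svdup_n_f32(1.f);
    do
    {
        const svfloat32_t v = svld1_f32(pg, in + x);
        svst1_f32(pg, out + x, svadd_f32_z(pg, v, one));
        x += svcntw();            // number of 32-bit lanes per vector
        pg = svwhilelt_b32(x, n); // predicate for the elements that remain
    } while (svptest_any(svptrue_b32(), pg));
}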
diff --git a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp
index 952ab320bf..df4fbfe607 100644
--- a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp
+++ b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp
@@ -25,6 +25,7 @@
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/core/NEON/SVEMath.h"
#include <cmath>
@@ -37,8 +38,15 @@ namespace arm_compute
{
namespace cpu
{
-void fp32_sve_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma,
- float epsilon, ActivationLayerInfo &act_info, const Window &window)
+void fp32_sve_batch_normalization(ITensor *src,
+ ITensor *dst,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta,
+ const ITensor *gamma,
+ float epsilon,
+ ActivationLayerInfo &act_info,
+ const Window &window)
{
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x = static_cast<int>(window.x().end());
@@ -49,69 +57,74 @@ void fp32_sve_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mea
Iterator input(src, win_collapsed);
Iterator output(dst, win_collapsed);
- const auto input_mean = reinterpret_cast<const float *>(mean->ptr_to_element(Coordinates(0, 0)));
- const auto input_var = reinterpret_cast<const float *>(var->ptr_to_element(Coordinates(0, 0)));
- const auto input_gamma = (gamma != nullptr) ? reinterpret_cast<const float *>(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
- const auto input_beta = (beta != nullptr) ? reinterpret_cast<const float *>(beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_mean = reinterpret_cast<const float *>(mean->ptr_to_element(Coordinates(0, 0)));
+ const auto input_var = reinterpret_cast<const float *>(var->ptr_to_element(Coordinates(0, 0)));
+ const auto input_gamma =
+ (gamma != nullptr) ? reinterpret_cast<const float *>(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
+ const auto input_beta =
+ (beta != nullptr) ? reinterpret_cast<const float *>(beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
const auto epsilon_vec = svdup_n_f32(epsilon);
const auto const_1 = svdup_n_f32(1.f);
const auto const_0 = svdup_n_f32(0.f);
const auto va = svdup_n_f32(act_info.a());
const auto vb = svdup_n_f32(act_info.b());
- execute_window_loop(win_collapsed, [&](const Coordinates &)
- {
- const auto input_ptr = reinterpret_cast<const float *>(input.ptr());
- const auto output_ptr = reinterpret_cast<float *>(output.ptr());
-
- // Compute S elements per iteration
- int x = window_start_x;
- svbool_t pg = svwhilelt_b32(x, window_end_x);
- do
+ execute_window_loop(
+ win_collapsed,
+ [&](const Coordinates &)
{
-            // Construct vectors
- const auto mean_vec = svld1_f32(pg, input_mean + x);
- const auto var_vec = svld1_f32(pg, input_var + x);
- const auto gamma_vec = (input_gamma != nullptr) ? svld1_f32(pg, input_gamma + x) : const_1;
- const auto beta_vec = (input_beta != nullptr) ? svld1_f32(pg, input_beta + x) : const_0;
+ const auto input_ptr = reinterpret_cast<const float *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<float *>(output.ptr());
- // Calculate denominator
- const auto tmp = svadd_f32_z(pg, var_vec, epsilon_vec);
- auto denominator = svrsqrte_f32(tmp);
- denominator = svmul_f32_z(pg, svrsqrts_f32(svmul_f32_z(pg, tmp, denominator), denominator), denominator);
- denominator = svmul_f32_z(pg, svrsqrts_f32(svmul_f32_z(pg, tmp, denominator), denominator), denominator);
+ // Compute S elements per iteration
+ int x = window_start_x;
+ svbool_t pg = svwhilelt_b32(x, window_end_x);
+ do
+ {
+                // Construct vectors
+ const auto mean_vec = svld1_f32(pg, input_mean + x);
+ const auto var_vec = svld1_f32(pg, input_var + x);
+ const auto gamma_vec = (input_gamma != nullptr) ? svld1_f32(pg, input_gamma + x) : const_1;
+ const auto beta_vec = (input_beta != nullptr) ? svld1_f32(pg, input_beta + x) : const_0;
- // Calculate x bar
- const auto numerator = svsub_f32_z(pg, svld1_f32(pg, input_ptr + x), mean_vec);
- const auto x_bar = svmul_f32_z(pg, numerator, denominator);
- auto res = svmla_f32_z(pg, beta_vec, x_bar, gamma_vec);
+ // Calculate denominator
+ const auto tmp = svadd_f32_z(pg, var_vec, epsilon_vec);
+ auto denominator = svrsqrte_f32(tmp);
+ denominator =
+ svmul_f32_z(pg, svrsqrts_f32(svmul_f32_z(pg, tmp, denominator), denominator), denominator);
+ denominator =
+ svmul_f32_z(pg, svrsqrts_f32(svmul_f32_z(pg, tmp, denominator), denominator), denominator);
- // Perform fused activation
- if(act_info.enabled())
- {
- if(act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
- {
- res = svmax_f32_z(pg, const_0, res);
- }
- else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
- {
- res = svmin_f32_z(pg, va, svmax_f32_z(pg, const_0, res));
- }
- else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ // Calculate x bar
+ const auto numerator = svsub_f32_z(pg, svld1_f32(pg, input_ptr + x), mean_vec);
+ const auto x_bar = svmul_f32_z(pg, numerator, denominator);
+ auto res = svmla_f32_z(pg, beta_vec, x_bar, gamma_vec);
+
+ // Perform fused activation
+ if (act_info.enabled())
{
- res = svmin_f32_z(pg, va, svmax_f32_z(pg, vb, res));
+ if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
+ {
+ res = svmax_f32_z(pg, const_0, res);
+ }
+ else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+ {
+ res = svmin_f32_z(pg, va, svmax_f32_z(pg, const_0, res));
+ }
+ else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+ {
+ res = svmin_f32_z(pg, va, svmax_f32_z(pg, vb, res));
+ }
}
- }
- // Store results
- svst1_f32(pg, output_ptr + x, res);
+ // Store results
+ svst1_f32(pg, output_ptr + x, res);
- x += svcntw();
- pg = svwhilelt_b32(x, window_end_x);
- }
- while(svptest_any(svptrue_b32(), pg));
- },
- input, output);
+ x += svcntw();
+ pg = svwhilelt_b32(x, window_end_x);
+ } while (svptest_any(svptrue_b32(), pg));
+ },
+ input, output);
}
} // namespace cpu
} // namespace arm_compute
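[Editor's note] In both files the denominator 1/sqrt(var + epsilon) starts from a hardware estimate (svrsqrte) and is refined twice via svrsqrts, which returns (3 - a*b)/2 for operands a and b, i.e. the correction factor of a Newton-Raphson step. A scalar sketch of that refinement, under the standard FRSQRTS semantics:

    #include <cmath>

    // One Newton-Raphson step for 1/sqrt(a) is x' = x * (3 - a*x*x) / 2;
    // in the kernels above, svrsqrts(a*x, x) supplies the (3 - a*x*x)/2 factor.
    float rsqrt_refined(float a, float estimate /* e.g. from a FRSQRTE lookup */)
    {
        float x = estimate;
        for (int i = 0; i < 2; ++i) // two steps, matching the kernels
        {
            x *= (3.f - a * x * x) * 0.5f;
        }
        return x; // close to 1.0f / std::sqrt(a)
    }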
diff --git a/src/core/NEON/kernels/batchnormalization/impl/list.h b/src/core/NEON/kernels/batchnormalization/impl/list.h
index 8e0ea36f5a..cbf540bd71 100644
--- a/src/core/NEON/kernels/batchnormalization/impl/list.h
+++ b/src/core/NEON/kernels/batchnormalization/impl/list.h
@@ -28,9 +28,9 @@ namespace arm_compute
{
namespace cpu
{
-#define DECLARE_BATCH_NORMALIZATION_KERNEL(func_name) \
- void func_name(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, \
- float epsilon, ActivationLayerInfo &act_info, const Window &window)
+#define DECLARE_BATCH_NORMALIZATION_KERNEL(func_name) \
+ void func_name(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, \
+ const ITensor *gamma, float epsilon, ActivationLayerInfo &act_info, const Window &window)
DECLARE_BATCH_NORMALIZATION_KERNEL(fp16_neon_batch_normalization);
DECLARE_BATCH_NORMALIZATION_KERNEL(fp16_sve_batch_normalization);
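[Editor's note] The macro above keeps every per-ISA batch-normalization entry point on one shared signature, so adding a variant is a single declaration line. A small illustration of the pattern with hypothetical kernel names (not library code):

    #define DECLARE_UNARY_KERNEL(func_name) \
        void func_name(const float *src, float *dst, int n)

    DECLARE_UNARY_KERNEL(negate_neon); // hypothetical NEON variant
    DECLARE_UNARY_KERNEL(negate_sve);  // hypothetical SVE variant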
diff --git a/src/core/NEON/kernels/detail/NEActivationFunctionDetail.h b/src/core/NEON/kernels/detail/NEActivationFunctionDetail.h
index 3900ea62cd..95cdc8f2f9 100644
--- a/src/core/NEON/kernels/detail/NEActivationFunctionDetail.h
+++ b/src/core/NEON/kernels/detail/NEActivationFunctionDetail.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_DETAIL_NEACTIVATION_FUNCTION_DETAIL_H
#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
@@ -158,8 +159,7 @@ struct logistic
*
* @param[in] act_info Activation layer information.
*/
- explicit logistic(ActivationLayerInfo act_info)
- : vone(wrapper::vdup_n(static_cast<T>(1), ExactTagType{}))
+ explicit logistic(ActivationLayerInfo act_info) : vone(wrapper::vdup_n(static_cast<T>(1), ExactTagType{}))
{
ARM_COMPUTE_UNUSED(act_info);
}
@@ -198,8 +198,7 @@ struct relu
*
* @param[in] act_info Activation layer information.
*/
- explicit relu(ActivationLayerInfo act_info)
- : vzero(wrapper::vdup_n(static_cast<T>(0), ExactTagType{}))
+ explicit relu(ActivationLayerInfo act_info) : vzero(wrapper::vdup_n(static_cast<T>(0), ExactTagType{}))
{
ARM_COMPUTE_UNUSED(act_info);
}
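[Editor's note] The logistic and relu structs above follow a small functor pattern: the constants a kernel needs are broadcast once in the constructor, so the per-vector call operator stays branch-free inside hot loops. A plain-NEON sketch of the same idea (illustrative type name, not the wrapper API):

    #include <arm_neon.h>

    struct relu_f32
    {
        relu_f32() : vzero(vdupq_n_f32(0.f)) {} // broadcast the constant once
        void operator()(float32x4_t &vval) const
        {
            vval = vmaxq_f32(vzero, vval); // max(0, x), no per-lane branches
        }
        float32x4_t vzero;
    };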
diff --git a/src/core/NEON/kernels/detail/NEColorConvertHelper.inl b/src/core/NEON/kernels/detail/NEColorConvertHelper.inl
index ac196d9dbb..50fff04cad 100644
--- a/src/core/NEON/kernels/detail/NEColorConvertHelper.inl
+++ b/src/core/NEON/kernels/detail/NEColorConvertHelper.inl
@@ -25,6 +25,7 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IMultiImage.h"
#include "arm_compute/core/Utils.h"
+
#include "src/core/NEON/NEMath.h"
#include <arm_neon.h>
@@ -50,8 +51,12 @@ constexpr float rgb2u8_red_coef = 0.2126f;
constexpr float rgb2u8_green_coef = 0.7152f;
constexpr float rgb2u8_blue_coef = 0.0722f;
-inline float32x4_t rgb_to_greyscale_calculation(const float32x4_t &rcolor, const float32x4_t &gcolor, const float32x4_t &bcolor,
- const float rcoef, const float gcoef, const float bcoef)
+inline float32x4_t rgb_to_greyscale_calculation(const float32x4_t &rcolor,
+ const float32x4_t &gcolor,
+ const float32x4_t &bcolor,
+ const float rcoef,
+ const float gcoef,
+ const float bcoef)
{
float32x4_t greyscale = vmulq_n_f32(rcolor, rcoef);
greyscale = vmlaq_n_f32(greyscale, gcolor, gcoef);
@@ -86,8 +91,12 @@ inline void rgb_to_u8_conversion(const uint8x16x3_t &in, uint8x16_t &out)
arm_compute::convert_float32x4x4_to_uint8x16(out_float32, out);
}
-inline void rgb_to_yuv_calculation(const float32x4_t &rvec, const float32x4_t &gvec, const float32x4_t &bvec,
- float32x4_t &yvec, float32x4_t &uvec, float32x4_t &vvec)
+inline void rgb_to_yuv_calculation(const float32x4_t &rvec,
+ const float32x4_t &gvec,
+ const float32x4_t &bvec,
+ float32x4_t &yvec,
+ float32x4_t &uvec,
+ float32x4_t &vvec)
{
/*
Y'= 0.2126*R' + 0.7152*G' + 0.0722*B'
@@ -110,8 +119,12 @@ inline void rgb_to_yuv_calculation(const float32x4_t &rvec, const float32x4_t &g
vvec = vmlaq_n_f32(c128, vvec, rgb2yuv_bt709_cv);
}
-inline void yuyv_to_rgb_calculation(const float32x4_t &yvec_val, float32x4_t uvec_val, const float32x4_t &yyvec_val,
- float32x4_t vvec_val, unsigned char *output_ptr, const bool alpha)
+inline void yuyv_to_rgb_calculation(const float32x4_t &yvec_val,
+ float32x4_t uvec_val,
+ const float32x4_t &yyvec_val,
+ float32x4_t vvec_val,
+ unsigned char *output_ptr,
+ const bool alpha)
{
float32x4x3_t rgb1, rgb2;
@@ -126,8 +139,7 @@ inline void yuyv_to_rgb_calculation(const float32x4_t &yvec_val, float32x4_t uve
// b = 1.8556f*f_u + 0.0000f*f_v;
const auto red = vmulq_n_f32(vvec_val, red_coef_bt709);
const auto blue = vmulq_n_f32(uvec_val, blue_coef_bt709);
- const auto green = vaddq_f32(vmulq_n_f32(uvec_val, green_coef_bt709),
- vmulq_n_f32(vvec_val, green_coef2_bt709));
+ const auto green = vaddq_f32(vmulq_n_f32(uvec_val, green_coef_bt709), vmulq_n_f32(vvec_val, green_coef2_bt709));
// Compute the final r,g,b values using y1 for the first texel and y2 for the second one.
// the result is stored in two float32x4x3_t which then are converted to one uint8x8x3_t
@@ -144,7 +156,7 @@ inline void yuyv_to_rgb_calculation(const float32x4_t &yvec_val, float32x4_t uve
uint8x8x3_t u8_rgb;
arm_compute::convert_float32x4x3_to_uint8x8x3(rgb1, rgb2, u8_rgb);
- if(!alpha)
+ if (!alpha)
{
vst3_lane_u8(&output_ptr[0], u8_rgb, 0);
vst3_lane_u8(&output_ptr[3], u8_rgb, 4);
@@ -177,7 +189,7 @@ inline uint8x16x3_t load_rgb(const unsigned char *const ptr, const bool alpha)
{
uint8x16x3_t rgb;
- if(alpha)
+ if (alpha)
{
const auto tmp = vld4q_u8(ptr);
rgb.val[0] = tmp.val[0];
@@ -206,12 +218,12 @@ inline void rgb_to_yuv_conversion(uint8x16x3_t &vec_top, uint8x16x3_t &vec_botto
float32x4x4_t fyvec_top, fuvec_top, fvvec_top;
float32x4x4_t fyvec_bottom, fuvec_bottom, fvvec_bottom;
- for(auto i = 0; i < 4; ++i)
+ for (auto i = 0; i < 4; ++i)
{
- rgb_to_yuv_calculation(frvec_top.val[i], fgvec_top.val[i], fbvec_top.val[i],
- fyvec_top.val[i], fuvec_top.val[i], fvvec_top.val[i]);
- rgb_to_yuv_calculation(frvec_bottom.val[i], fgvec_bottom.val[i], fbvec_bottom.val[i],
- fyvec_bottom.val[i], fuvec_bottom.val[i], fvvec_bottom.val[i]);
+ rgb_to_yuv_calculation(frvec_top.val[i], fgvec_top.val[i], fbvec_top.val[i], fyvec_top.val[i], fuvec_top.val[i],
+ fvvec_top.val[i]);
+ rgb_to_yuv_calculation(frvec_bottom.val[i], fgvec_bottom.val[i], fbvec_bottom.val[i], fyvec_bottom.val[i],
+ fuvec_bottom.val[i], fvvec_bottom.val[i]);
}
arm_compute::convert_float32x4x4_to_uint8x16(fyvec_top, vec_top.val[0]);
@@ -222,9 +234,14 @@ inline void rgb_to_yuv_conversion(uint8x16x3_t &vec_top, uint8x16x3_t &vec_botto
arm_compute::convert_float32x4x4_to_uint8x16(fvvec_bottom, vec_bottom.val[2]);
}
-inline void store_rgb_to_nv12(const uint8x16_t &rvec_top, const uint8x16_t &gvec_top, const uint8x16_t &bvec_top,
- const uint8x16_t &rvec_bottom, const uint8x16_t &gvec_bottom, const uint8x16_t &bvec_bottom,
- unsigned char *const __restrict out_y_top, unsigned char *const __restrict out_y_bottom,
+inline void store_rgb_to_nv12(const uint8x16_t &rvec_top,
+ const uint8x16_t &gvec_top,
+ const uint8x16_t &bvec_top,
+ const uint8x16_t &rvec_bottom,
+ const uint8x16_t &gvec_bottom,
+ const uint8x16_t &bvec_bottom,
+ unsigned char *const __restrict out_y_top,
+ unsigned char *const __restrict out_y_bottom,
unsigned char *const __restrict out_uv)
{
uint8x16x3_t vec_top, vec_bottom;
@@ -252,9 +269,14 @@ inline void store_rgb_to_nv12(const uint8x16_t &rvec_top, const uint8x16_t &gvec
vst2_u8(out_uv, uvvec);
}
-inline void store_rgb_to_iyuv(const uint8x16_t &rvec_top, const uint8x16_t &gvec_top, const uint8x16_t &bvec_top,
- const uint8x16_t &rvec_bottom, const uint8x16_t &gvec_bottom, const uint8x16_t &bvec_bottom,
- unsigned char *const __restrict out_y_top, unsigned char *const __restrict out_y_bottom,
+inline void store_rgb_to_iyuv(const uint8x16_t &rvec_top,
+ const uint8x16_t &gvec_top,
+ const uint8x16_t &bvec_top,
+ const uint8x16_t &rvec_bottom,
+ const uint8x16_t &gvec_bottom,
+ const uint8x16_t &bvec_bottom,
+ unsigned char *const __restrict out_y_top,
+ unsigned char *const __restrict out_y_bottom,
unsigned char *const __restrict out_u,
unsigned char *const __restrict out_v)
{
@@ -273,14 +295,16 @@ inline void store_rgb_to_iyuv(const uint8x16_t &rvec_top, const uint8x16_t &gvec
const auto uvvec_top = vuzpq_u8(vec_top.val[1], vec_top.val[2]);
const auto uvvec_bottom = vuzpq_u8(vec_bottom.val[1], vec_bottom.val[2]);
- const auto uvvec = vhaddq_u8(vrhaddq_u8(uvvec_top.val[0], uvvec_top.val[1]),
- vrhaddq_u8(uvvec_bottom.val[0], uvvec_bottom.val[1]));
+ const auto uvvec =
+ vhaddq_u8(vrhaddq_u8(uvvec_top.val[0], uvvec_top.val[1]), vrhaddq_u8(uvvec_bottom.val[0], uvvec_bottom.val[1]));
vst1_u8(out_u, vget_low_u8(uvvec));
vst1_u8(out_v, vget_high_u8(uvvec));
}
-inline void store_rgb_to_yuv4(const uint8x16_t &rvec, const uint8x16_t &gvec, const uint8x16_t &bvec,
+inline void store_rgb_to_yuv4(const uint8x16_t &rvec,
+ const uint8x16_t &gvec,
+ const uint8x16_t &bvec,
unsigned char *const __restrict out_y,
unsigned char *const __restrict out_u,
unsigned char *const __restrict out_v)
@@ -291,10 +315,9 @@ inline void store_rgb_to_yuv4(const uint8x16_t &rvec, const uint8x16_t &gvec, co
const float32x4x4_t fbvec = arm_compute::convert_uint8x16_to_float32x4x4(bvec);
float32x4x4_t fyvec, fuvec, fvvec;
- for(auto i = 0; i < 4; ++i)
+ for (auto i = 0; i < 4; ++i)
{
- rgb_to_yuv_calculation(frvec.val[i], fgvec.val[i], fbvec.val[i],
- fyvec.val[i], fuvec.val[i], fvvec.val[i]);
+ rgb_to_yuv_calculation(frvec.val[i], fgvec.val[i], fbvec.val[i], fyvec.val[i], fuvec.val[i], fvvec.val[i]);
}
uint8x16_t yvec, uvec, vvec;
@@ -307,7 +330,7 @@ inline void store_rgb_to_yuv4(const uint8x16_t &rvec, const uint8x16_t &gvec, co
vst1q_u8(out_v, vvec);
}
#endif /* DOXYGEN_SKIP_THIS */
-}
+} // namespace
namespace arm_compute
{
@@ -329,17 +352,19 @@ void colorconvert_rgb_to_rgbx(const void *__restrict input, void *__restrict out
Iterator in(input_ptr, win);
Iterator out(output_ptr, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta1 = vld3q_u8(in.ptr());
- uint8x16x4_t ta2;
- ta2.val[0] = ta1.val[0];
- ta2.val[1] = ta1.val[1];
- ta2.val[2] = ta1.val[2];
- ta2.val[3] = vdupq_n_u8(255);
- vst4q_u8(out.ptr(), ta2);
- },
- in, out);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta1 = vld3q_u8(in.ptr());
+ uint8x16x4_t ta2;
+ ta2.val[0] = ta1.val[0];
+ ta2.val[1] = ta1.val[1];
+ ta2.val[2] = ta1.val[2];
+ ta2.val[3] = vdupq_n_u8(255);
+ vst4q_u8(out.ptr(), ta2);
+ },
+ in, out);
}
/** Convert RGB to U8.
@@ -360,14 +385,16 @@ void colorconvert_rgb_to_u8(const void *__restrict input, void *__restrict outpu
Iterator in(input_ptr, win);
Iterator out(output_ptr, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta1 = vld3q_u8(in.ptr());
- uint8x16_t ta2;
- rgb_to_u8_conversion(ta1, ta2);
- vst1q_u8(out.ptr(), ta2);
- },
- in, out);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta1 = vld3q_u8(in.ptr());
+ uint8x16_t ta2;
+ rgb_to_u8_conversion(ta1, ta2);
+ vst1q_u8(out.ptr(), ta2);
+ },
+ in, out);
}
/** Convert RGBX to RGB.
@@ -388,16 +415,18 @@ void colorconvert_rgbx_to_rgb(const void *input, void *output, const Window &win
Iterator in(input_ptr, win);
Iterator out(output_ptr, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta1 = vld4q_u8(in.ptr());
- uint8x16x3_t ta2;
- ta2.val[0] = ta1.val[0];
- ta2.val[1] = ta1.val[1];
- ta2.val[2] = ta1.val[2];
- vst3q_u8(out.ptr(), ta2);
- },
- in, out);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta1 = vld4q_u8(in.ptr());
+ uint8x16x3_t ta2;
+ ta2.val[0] = ta1.val[0];
+ ta2.val[1] = ta1.val[1];
+ ta2.val[2] = ta1.val[2];
+ vst3q_u8(out.ptr(), ta2);
+ },
+ in, out);
}
/** Convert YUYV to RGB.
@@ -422,26 +451,32 @@ void colorconvert_yuyv_to_rgb(const void *__restrict input, void *__restrict out
Iterator in(input_ptr, win);
Iterator out(output_ptr, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta = vld4q_u8(in.ptr());
- //ta.val[0] = Y0 Y2 Y4 Y6 ...
- //ta.val[1] = U0 U2 U4 U6 ...
- //ta.val[2] = Y1 Y3 Y5 Y7 ...
-        //ta.val[3] = V0 V2 V4 V6 ...
-
- // Convert the uint8x16x4_t to float32x4x4_t
- const float32x4x4_t yvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[0 + shift]);
- const float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[1 - shift]);
- const float32x4x4_t yyvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[2 + shift]);
- const float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[3 - shift]);
-
- yuyv_to_rgb_calculation(yvec.val[0], uvec.val[0], yyvec.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec.val[1], uvec.val[1], yyvec.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec.val[2], uvec.val[2], yyvec.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec.val[3], uvec.val[3], yyvec.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha);
- },
- in, out);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta = vld4q_u8(in.ptr());
+ //ta.val[0] = Y0 Y2 Y4 Y6 ...
+ //ta.val[1] = U0 U2 U4 U6 ...
+ //ta.val[2] = Y1 Y3 Y5 Y7 ...
+            //ta.val[3] = V0 V2 V4 V6 ...
+
+ // Convert the uint8x16x4_t to float32x4x4_t
+ const float32x4x4_t yvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[0 + shift]);
+ const float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[1 - shift]);
+ const float32x4x4_t yyvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[2 + shift]);
+ const float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[3 - shift]);
+
+ yuyv_to_rgb_calculation(yvec.val[0], uvec.val[0], yyvec.val[0], vvec.val[0], out.ptr() + 0 * element_size,
+ alpha);
+ yuyv_to_rgb_calculation(yvec.val[1], uvec.val[1], yyvec.val[1], vvec.val[1], out.ptr() + 1 * element_size,
+ alpha);
+ yuyv_to_rgb_calculation(yvec.val[2], uvec.val[2], yyvec.val[2], vvec.val[2], out.ptr() + 2 * element_size,
+ alpha);
+ yuyv_to_rgb_calculation(yvec.val[3], uvec.val[3], yyvec.val[3], vvec.val[3], out.ptr() + 3 * element_size,
+ alpha);
+ },
+ in, out);
}
/** Convert NV12 to RGB.
@@ -475,35 +510,45 @@ void colorconvert_nv12_to_rgb(const void *__restrict input, void *__restrict out
Iterator in_uv(input_ptr->plane(1), win_uv);
Iterator out(output_ptr, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta_y_top = vld2q_u8(in_y.ptr());
- const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
- const auto ta_uv = vld2q_u8(in_uv.ptr());
- //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
- //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
- //ta_uv.val[0] = U0 U2 U4 U6 ...
- //ta_uv.val[1] = V0 V2 V4 V6 ...
-
- // Convert the uint8x16x4_t to float32x4x4_t
- float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[0]);
- float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[1]);
- float32x4x4_t yvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]);
- float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]);
- float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_uv.val[0 + shift]);
- float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_uv.val[1 - shift]);
-
- yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha);
-
- yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0], out.ptr() + out_stride + 0 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], out.ptr() + out_stride + 1 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], out.ptr() + out_stride + 2 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], out.ptr() + out_stride + 3 * element_size, alpha);
- },
- in_y, in_uv, out);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta_y_top = vld2q_u8(in_y.ptr());
+ const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
+ const auto ta_uv = vld2q_u8(in_uv.ptr());
+ //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+ //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+ //ta_uv.val[0] = U0 U2 U4 U6 ...
+ //ta_uv.val[1] = V0 V2 V4 V6 ...
+
+ // Convert the uint8x16x4_t to float32x4x4_t
+ float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[0]);
+ float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[1]);
+ float32x4x4_t yvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]);
+ float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]);
+ float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_uv.val[0 + shift]);
+ float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_uv.val[1 - shift]);
+
+ yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0],
+ out.ptr() + 0 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1],
+ out.ptr() + 1 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2],
+ out.ptr() + 2 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3],
+ out.ptr() + 3 * element_size, alpha);
+
+ yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0],
+ out.ptr() + out_stride + 0 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1],
+ out.ptr() + out_stride + 1 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2],
+ out.ptr() + out_stride + 2 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3],
+ out.ptr() + out_stride + 3 * element_size, alpha);
+ },
+ in_y, in_uv, out);
}
/** Convert IYUV to RGB.
@@ -537,59 +582,71 @@ void colorconvert_iyuv_to_rgb(const void *__restrict input, void *__restrict out
Iterator in_v(input_ptr->plane(2), win_uv);
Iterator out(output_ptr, win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto *y_top_ptr = in_y.ptr();
- const auto *y_bottom_ptr = in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y();
- const auto *u_ptr = in_u.ptr();
- const auto *v_ptr = in_v.ptr();
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto *y_top_ptr = in_y.ptr();
+ const auto *y_bottom_ptr = in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y();
+ const auto *u_ptr = in_u.ptr();
+ const auto *v_ptr = in_v.ptr();
// Work-around for an issue in gcc (>= 9) where vld2q might cause issues with register allocation
#if defined(__aarch64__)
- const auto ta0_y_top = vld1q_u8(y_top_ptr);
- const auto ta1_y_top = vld1q_u8(y_top_ptr + 16);
- const auto ta0_y_bottom = vld1q_u8(y_bottom_ptr);
- const auto ta1_y_bottom = vld1q_u8(y_bottom_ptr + 16);
- const auto ta_u = vld1q_u8(u_ptr);
- const auto ta_v = vld1q_u8(v_ptr);
-
- // Convert the uint8x16x4_t to float32x4x4_t
- float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vuzp1q_u8(ta0_y_top, ta1_y_top));
- float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vuzp2q_u8(ta0_y_top, ta1_y_top));
- float32x4x4_t yvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(vuzp1q_u8(ta0_y_bottom, ta1_y_bottom));
- float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(vuzp2q_u8(ta0_y_bottom, ta1_y_bottom));
- float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_u);
- float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_v);
+ const auto ta0_y_top = vld1q_u8(y_top_ptr);
+ const auto ta1_y_top = vld1q_u8(y_top_ptr + 16);
+ const auto ta0_y_bottom = vld1q_u8(y_bottom_ptr);
+ const auto ta1_y_bottom = vld1q_u8(y_bottom_ptr + 16);
+ const auto ta_u = vld1q_u8(u_ptr);
+ const auto ta_v = vld1q_u8(v_ptr);
+
+ // Convert the uint8x16x4_t to float32x4x4_t
+ float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vuzp1q_u8(ta0_y_top, ta1_y_top));
+ float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vuzp2q_u8(ta0_y_top, ta1_y_top));
+ float32x4x4_t yvec_bottom =
+ arm_compute::convert_uint8x16_to_float32x4x4(vuzp1q_u8(ta0_y_bottom, ta1_y_bottom));
+ float32x4x4_t yyvec_bottom =
+ arm_compute::convert_uint8x16_to_float32x4x4(vuzp2q_u8(ta0_y_bottom, ta1_y_bottom));
+ float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_u);
+ float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_v);
#else /* defined(__aarch64__) */
- const auto ta_y_top = vld2q_u8(y_top_ptr);
- const auto ta_y_bottom = vld2q_u8(y_bottom_ptr);
- const auto ta_u = vld1q_u8(u_ptr);
- const auto ta_v = vld1q_u8(v_ptr);
- //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
- //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
- //ta_u.val[0] = U0 U2 U4 U6 ...
- //ta_v.val[0] = V0 V2 V4 V6 ...
-
- // Convert the uint8x16x4_t to float32x4x4_t
- float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[0]);
- float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[1]);
- float32x4x4_t yvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]);
- float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]);
- float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_u);
- float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_v);
+ const auto ta_y_top = vld2q_u8(y_top_ptr);
+ const auto ta_y_bottom = vld2q_u8(y_bottom_ptr);
+ const auto ta_u = vld1q_u8(u_ptr);
+ const auto ta_v = vld1q_u8(v_ptr);
+ //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+ //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+ //ta_u.val[0] = U0 U2 U4 U6 ...
+ //ta_v.val[0] = V0 V2 V4 V6 ...
+
+ // Convert the uint8x16x4_t to float32x4x4_t
+ float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[0]);
+ float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[1]);
+ float32x4x4_t yvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]);
+ float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]);
+ float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_u);
+ float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_v);
#endif /* defined(__aarch64__) */
- yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha);
-
- yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0], out.ptr() + out_stride + 0 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], out.ptr() + out_stride + 1 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], out.ptr() + out_stride + 2 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], out.ptr() + out_stride + 3 * element_size, alpha);
- },
- in_y, in_u, in_v, out);
+ yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0],
+ out.ptr() + 0 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1],
+ out.ptr() + 1 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2],
+ out.ptr() + 2 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3],
+ out.ptr() + 3 * element_size, alpha);
+
+ yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0],
+ out.ptr() + out_stride + 0 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1],
+ out.ptr() + out_stride + 1 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2],
+ out.ptr() + out_stride + 2 * element_size, alpha);
+ yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3],
+ out.ptr() + out_stride + 3 * element_size, alpha);
+ },
+ in_y, in_u, in_v, out);
}
/** Convert YUYV to NV12.
@@ -621,31 +678,33 @@ void colorconvert_yuyv_to_nv12(const void *__restrict input, void *__restrict ou
Iterator out_y(output_ptr->plane(0), win);
Iterator out_uv(output_ptr->plane(1), win_uv);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta_top = vld4q_u8(in.ptr());
- const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y());
- //ta.val[0] = Y0 Y2 Y4 Y6 ...
- //ta.val[1] = U0 U2 U4 U6 ...
- //ta.val[2] = Y1 Y3 Y5 Y7 ...
-        //ta.val[3] = V0 V2 V4 V6 ...
-
- uint8x16x2_t yvec;
- yvec.val[0] = ta_top.val[0 + shift];
- yvec.val[1] = ta_top.val[2 + shift];
- vst2q_u8(out_y.ptr(), yvec);
-
- uint8x16x2_t yyvec;
- yyvec.val[0] = ta_bottom.val[0 + shift];
- yyvec.val[1] = ta_bottom.val[2 + shift];
- vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec);
-
- uint8x16x2_t uvvec;
- uvvec.val[0] = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]);
- uvvec.val[1] = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]);
- vst2q_u8(out_uv.ptr(), uvvec);
- },
- in, out_y, out_uv);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta_top = vld4q_u8(in.ptr());
+ const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y());
+ //ta.val[0] = Y0 Y2 Y4 Y6 ...
+ //ta.val[1] = U0 U2 U4 U6 ...
+ //ta.val[2] = Y1 Y3 Y5 Y7 ...
+            //ta.val[3] = V0 V2 V4 V6 ...
+
+ uint8x16x2_t yvec;
+ yvec.val[0] = ta_top.val[0 + shift];
+ yvec.val[1] = ta_top.val[2 + shift];
+ vst2q_u8(out_y.ptr(), yvec);
+
+ uint8x16x2_t yyvec;
+ yyvec.val[0] = ta_bottom.val[0 + shift];
+ yyvec.val[1] = ta_bottom.val[2 + shift];
+ vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec);
+
+ uint8x16x2_t uvvec;
+ uvvec.val[0] = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]);
+ uvvec.val[1] = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]);
+ vst2q_u8(out_uv.ptr(), uvvec);
+ },
+ in, out_y, out_uv);
}
/** Convert IYUV to NV12.
@@ -676,23 +735,25 @@ void colorconvert_iyuv_to_nv12(const void *__restrict input, void *__restrict ou
Iterator out_y(output_ptr->plane(0), win);
Iterator out_uv(output_ptr->plane(1), win_uv);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta_y_top = vld2q_u8(in_y.ptr());
- const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
- uint8x16x2_t ta_uv;
- ta_uv.val[0] = vld1q_u8(in_u.ptr());
- ta_uv.val[1] = vld1q_u8(in_v.ptr());
- //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
- //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
- //ta_uv.val[0] = U0 U2 U4 U6 ...
- //ta_uv.val[1] = V0 V2 V4 V6 ...
-
- vst2q_u8(out_y.ptr(), ta_y_top);
- vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
- vst2q_u8(out_uv.ptr(), ta_uv);
- },
- in_y, in_u, in_v, out_y, out_uv);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta_y_top = vld2q_u8(in_y.ptr());
+ const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
+ uint8x16x2_t ta_uv;
+ ta_uv.val[0] = vld1q_u8(in_u.ptr());
+ ta_uv.val[1] = vld1q_u8(in_v.ptr());
+ //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+ //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+ //ta_uv.val[0] = U0 U2 U4 U6 ...
+ //ta_uv.val[1] = V0 V2 V4 V6 ...
+
+ vst2q_u8(out_y.ptr(), ta_y_top);
+ vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
+ vst2q_u8(out_uv.ptr(), ta_uv);
+ },
+ in_y, in_u, in_v, out_y, out_uv);
}
/** Convert NV12 to IYUV.
@@ -726,22 +787,24 @@ void colorconvert_nv12_to_iyuv(const void *__restrict input, void *__restrict ou
Iterator out_u(output_ptr->plane(1), win_uv);
Iterator out_v(output_ptr->plane(2), win_uv);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta_y_top = vld2q_u8(in_y.ptr());
- const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
- const auto ta_uv = vld2q_u8(in_uv.ptr());
- //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
- //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
- //ta_uv.val[0] = U0 U2 U4 U6 ...
- //ta_uv.val[1] = V0 V2 V4 V6 ...
-
- vst2q_u8(out_y.ptr(), ta_y_top);
- vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
- vst1q_u8(out_u.ptr(), ta_uv.val[0 + shift]);
- vst1q_u8(out_v.ptr(), ta_uv.val[1 - shift]);
- },
- in_y, in_uv, out_y, out_u, out_v);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta_y_top = vld2q_u8(in_y.ptr());
+ const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
+ const auto ta_uv = vld2q_u8(in_uv.ptr());
+ //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+ //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+ //ta_uv.val[0] = U0 U2 U4 U6 ...
+ //ta_uv.val[1] = V0 V2 V4 V6 ...
+
+ vst2q_u8(out_y.ptr(), ta_y_top);
+ vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
+ vst1q_u8(out_u.ptr(), ta_uv.val[0 + shift]);
+ vst1q_u8(out_v.ptr(), ta_uv.val[1 - shift]);
+ },
+ in_y, in_uv, out_y, out_u, out_v);
}
/** Convert YUYV to IYUV.
@@ -774,34 +837,36 @@ void colorconvert_yuyv_to_iyuv(const void *__restrict input, void *__restrict ou
Iterator out_u(output_ptr->plane(1), win_uv);
Iterator out_v(output_ptr->plane(2), win_uv);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta_top = vld4q_u8(in.ptr());
- const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y());
- //ta.val[0] = Y0 Y2 Y4 Y6 ...
- //ta.val[1] = U0 U2 U4 U6 ...
- //ta.val[2] = Y1 Y3 Y5 Y7 ...
-        //ta.val[3] = V0 V2 V4 V6 ...
-
- uint8x16x2_t yvec;
- yvec.val[0] = ta_top.val[0 + shift];
- yvec.val[1] = ta_top.val[2 + shift];
- vst2q_u8(out_y.ptr(), yvec);
-
- uint8x16x2_t yyvec;
- yyvec.val[0] = ta_bottom.val[0 + shift];
- yyvec.val[1] = ta_bottom.val[2 + shift];
- vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec);
-
- uint8x16_t uvec;
- uvec = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]);
- vst1q_u8(out_u.ptr(), uvec);
-
- uint8x16_t vvec;
- vvec = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]);
- vst1q_u8(out_v.ptr(), vvec);
- },
- in, out_y, out_u, out_v);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta_top = vld4q_u8(in.ptr());
+ const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y());
+ //ta.val[0] = Y0 Y2 Y4 Y6 ...
+ //ta.val[1] = U0 U2 U4 U6 ...
+ //ta.val[2] = Y1 Y3 Y5 Y7 ...
+            //ta.val[3] = V0 V2 V4 V6 ...
+
+ uint8x16x2_t yvec;
+ yvec.val[0] = ta_top.val[0 + shift];
+ yvec.val[1] = ta_top.val[2 + shift];
+ vst2q_u8(out_y.ptr(), yvec);
+
+ uint8x16x2_t yyvec;
+ yyvec.val[0] = ta_bottom.val[0 + shift];
+ yyvec.val[1] = ta_bottom.val[2 + shift];
+ vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec);
+
+ uint8x16_t uvec;
+ uvec = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]);
+ vst1q_u8(out_u.ptr(), uvec);
+
+ uint8x16_t vvec;
+ vvec = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]);
+ vst1q_u8(out_v.ptr(), vvec);
+ },
+ in, out_y, out_u, out_v);
}
/** Convert NV12 to YUV4.
@@ -835,32 +900,34 @@ void colorconvert_nv12_to_yuv4(const void *__restrict input, void *__restrict ou
Iterator out_u(output_ptr->plane(1), win);
Iterator out_v(output_ptr->plane(2), win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta_y_top = vld2q_u8(in_y.ptr());
- const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
- const auto ta_uv = vld2q_u8(in_uv.ptr());
- //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
- //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
- //ta_uv.val[0] = U0 U2 U4 U6 ...
- //ta_uv.val[1] = V0 V2 V4 V6 ...
-
- vst2q_u8(out_y.ptr(), ta_y_top);
- vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
-
- uint8x16x2_t uvec;
- uvec.val[0] = ta_uv.val[0 + shift];
- uvec.val[1] = ta_uv.val[0 + shift];
- vst2q_u8(out_u.ptr(), uvec);
- vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec);
-
- uint8x16x2_t vvec;
- vvec.val[0] = ta_uv.val[1 - shift];
- vvec.val[1] = ta_uv.val[1 - shift];
- vst2q_u8(out_v.ptr(), vvec);
- vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec);
- },
- in_y, in_uv, out_y, out_u, out_v);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta_y_top = vld2q_u8(in_y.ptr());
+ const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
+ const auto ta_uv = vld2q_u8(in_uv.ptr());
+ //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+ //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+ //ta_uv.val[0] = U0 U2 U4 U6 ...
+ //ta_uv.val[1] = V0 V2 V4 V6 ...
+
+ vst2q_u8(out_y.ptr(), ta_y_top);
+ vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
+
+ uint8x16x2_t uvec;
+ uvec.val[0] = ta_uv.val[0 + shift];
+ uvec.val[1] = ta_uv.val[0 + shift];
+ vst2q_u8(out_u.ptr(), uvec);
+ vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec);
+
+ uint8x16x2_t vvec;
+ vvec.val[0] = ta_uv.val[1 - shift];
+ vvec.val[1] = ta_uv.val[1 - shift];
+ vst2q_u8(out_v.ptr(), vvec);
+ vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec);
+ },
+ in_y, in_uv, out_y, out_u, out_v);
}
/** Convert IYUV to YUV4.
@@ -892,33 +959,35 @@ void colorconvert_iyuv_to_yuv4(const void *__restrict input, void *__restrict ou
Iterator out_u(output_ptr->plane(1), win);
Iterator out_v(output_ptr->plane(2), win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta_y_top = vld2q_u8(in_y.ptr());
- const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
- const auto ta_u = vld1q_u8(in_u.ptr());
- const auto ta_v = vld1q_u8(in_v.ptr());
- //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
- //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
- //ta_u = U0 U2 U4 U6 ...
- //ta_v = V0 V2 V4 V6 ...
-
- vst2q_u8(out_y.ptr(), ta_y_top);
- vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
-
- uint8x16x2_t uvec;
- uvec.val[0] = ta_u;
- uvec.val[1] = ta_u;
- vst2q_u8(out_u.ptr(), uvec);
- vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec);
-
- uint8x16x2_t vvec;
- vvec.val[0] = ta_v;
- vvec.val[1] = ta_v;
- vst2q_u8(out_v.ptr(), vvec);
- vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec);
- },
- in_y, in_u, in_v, out_y, out_u, out_v);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta_y_top = vld2q_u8(in_y.ptr());
+ const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
+ const auto ta_u = vld1q_u8(in_u.ptr());
+ const auto ta_v = vld1q_u8(in_v.ptr());
+ //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+ //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+ //ta_u = U0 U2 U4 U6 ...
+ //ta_v = V0 V2 V4 V6 ...
+
+ vst2q_u8(out_y.ptr(), ta_y_top);
+ vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
+
+ uint8x16x2_t uvec;
+ uvec.val[0] = ta_u;
+ uvec.val[1] = ta_u;
+ vst2q_u8(out_u.ptr(), uvec);
+ vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec);
+
+ uint8x16x2_t vvec;
+ vvec.val[0] = ta_v;
+ vvec.val[1] = ta_v;
+ vst2q_u8(out_v.ptr(), vvec);
+ vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec);
+ },
+ in_y, in_u, in_v, out_y, out_u, out_v);
}
/** Convert RGB to NV12.
@@ -948,20 +1017,21 @@ void colorconvert_rgb_to_nv12(const void *__restrict input, void *__restrict out
Iterator out_y(output_ptr->plane(0), win);
Iterator out_uv(output_ptr->plane(1), win_uv);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta_rgb_top = load_rgb(in.ptr(), alpha);
- const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha);
- //ta_rgb.val[0] = R0 R1 R2 R3 ...
- //ta_rgb.val[1] = G0 G1 G2 G3 ...
- //ta_rgb.val[2] = B0 B1 B2 B3 ...
-
- store_rgb_to_nv12(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2],
- ta_rgb_bottom.val[0], ta_rgb_bottom.val[1], ta_rgb_bottom.val[2],
- out_y.ptr(), out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(),
- out_uv.ptr());
- },
- in, out_y, out_uv);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta_rgb_top = load_rgb(in.ptr(), alpha);
+ const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha);
+ //ta_rgb.val[0] = R0 R1 R2 R3 ...
+ //ta_rgb.val[1] = G0 G1 G2 G3 ...
+ //ta_rgb.val[2] = B0 B1 B2 B3 ...
+
+ store_rgb_to_nv12(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2], ta_rgb_bottom.val[0],
+ ta_rgb_bottom.val[1], ta_rgb_bottom.val[2], out_y.ptr(),
+ out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), out_uv.ptr());
+ },
+ in, out_y, out_uv);
}
/** Convert RGB to IYUV.
@@ -992,20 +1062,22 @@ void colorconvert_rgb_to_iyuv(const void *__restrict input, void *__restrict out
Iterator out_u(output_ptr->plane(1), win_uv);
Iterator out_v(output_ptr->plane(2), win_uv);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta_rgb_top = load_rgb(in.ptr(), alpha);
- const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha);
- //ta_rgb.val[0] = R0 R1 R2 R3 ...
- //ta_rgb.val[1] = G0 G1 G2 G3 ...
- //ta_rgb.val[2] = B0 B1 B2 B3 ...
-
- store_rgb_to_iyuv(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2],
- ta_rgb_bottom.val[0], ta_rgb_bottom.val[1], ta_rgb_bottom.val[2],
- out_y.ptr(), out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(),
- out_u.ptr(), out_v.ptr());
- },
- in, out_y, out_u, out_v);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta_rgb_top = load_rgb(in.ptr(), alpha);
+ const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha);
+ //ta_rgb.val[0] = R0 R1 R2 R3 ...
+ //ta_rgb.val[1] = G0 G1 G2 G3 ...
+ //ta_rgb.val[2] = B0 B1 B2 B3 ...
+
+ store_rgb_to_iyuv(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2], ta_rgb_bottom.val[0],
+ ta_rgb_bottom.val[1], ta_rgb_bottom.val[2], out_y.ptr(),
+ out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), out_u.ptr(),
+ out_v.ptr());
+ },
+ in, out_y, out_u, out_v);
}
/** Convert RGB to YUV4.
@@ -1030,16 +1102,17 @@ void colorconvert_rgb_to_yuv4(const void *__restrict input, void *__restrict out
Iterator out_u(output_ptr->plane(1), win);
Iterator out_v(output_ptr->plane(2), win);
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta_rgb = load_rgb(in.ptr(), alpha);
- //ta_rgb.val[0] = R0 R1 R2 R3 ...
- //ta_rgb.val[1] = G0 G1 G2 G3 ...
- //ta_rgb.val[2] = B0 B1 B2 B3 ...
-
- store_rgb_to_yuv4(ta_rgb.val[0], ta_rgb.val[1], ta_rgb.val[2],
- out_y.ptr(), out_u.ptr(), out_v.ptr());
- },
- in, out_y, out_u, out_v);
+ execute_window_loop(
+ win,
+ [&](const Coordinates &)
+ {
+ const auto ta_rgb = load_rgb(in.ptr(), alpha);
+ //ta_rgb.val[0] = R0 R1 R2 R3 ...
+ //ta_rgb.val[1] = G0 G1 G2 G3 ...
+ //ta_rgb.val[2] = B0 B1 B2 B3 ...
+
+ store_rgb_to_yuv4(ta_rgb.val[0], ta_rgb.val[1], ta_rgb.val[2], out_y.ptr(), out_u.ptr(), out_v.ptr());
+ },
+ in, out_y, out_u, out_v);
}
} // namespace arm_compute
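[Editor's note] All converters in this file funnel through rgb_to_yuv_calculation / yuyv_to_rgb_calculation, which apply full-range BT.709 coefficients lane-wise. The luma line is quoted in the source comment; the chroma divisors 1.8556 and 1.5748 below are the matching BT.709 constants and are stated here as an assumption about the named coefficients. Scalar sketch:

    // Scalar BT.709 RGB -> Y'UV with chroma offset to mid-range (128).
    void rgb_to_yuv_scalar(float r, float g, float b, float &y, float &u, float &v)
    {
        y = 0.2126f * r + 0.7152f * g + 0.0722f * b; // luma, as in the source comment
        u = (b - y) / 1.8556f + 128.f;               // U = (B' - Y') / 1.8556 + 128
        v = (r - y) / 1.5748f + 128.f;               // V = (R' - Y') / 1.5748 + 128
    }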
diff --git a/src/core/NEON/kernels/detail/NEDirectConvolution3x3.h b/src/core/NEON/kernels/detail/NEDirectConvolution3x3.h
index 96defbc9c9..4b1eb079b2 100644
--- a/src/core/NEON/kernels/detail/NEDirectConvolution3x3.h
+++ b/src/core/NEON/kernels/detail/NEDirectConvolution3x3.h
@@ -33,56 +33,32 @@ namespace detail
{
inline float32x4x3_t load_matrix_row(const float *ptr)
{
- const float32x4x3_t r =
- {
- {
- vld1q_dup_f32(ptr),
- vld1q_dup_f32(1 + ptr),
- vld1q_dup_f32(2 + ptr)
- }
- };
+ const float32x4x3_t r = {{vld1q_dup_f32(ptr), vld1q_dup_f32(1 + ptr), vld1q_dup_f32(2 + ptr)}};
return r;
}
template <unsigned int stridex>
-float32x4x2_t convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2);
+float32x4x2_t convolve_3x3(const float *in_top,
+ const float *in_mid,
+ const float *in_low,
+ const float32x4x3_t &m0,
+ const float32x4x3_t &m1,
+ const float32x4x3_t &m2);
template <>
-inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)
+inline float32x4x2_t convolve_3x3<1>(const float *in_top,
+ const float *in_mid,
+ const float *in_low,
+ const float32x4x3_t &m0,
+ const float32x4x3_t &m1,
+ const float32x4x3_t &m2)
{
- const float32x4x3_t vtop =
- {
- {
- vld1q_f32(in_top),
- vld1q_f32(in_top + 4),
- vld1q_f32(in_top + 8)
- }
- };
- const float32x4x3_t vmid =
- {
- {
- vld1q_f32(in_mid),
- vld1q_f32(in_mid + 4),
- vld1q_f32(in_mid + 8)
- }
- };
- const float32x4x3_t vlow =
- {
- {
- vld1q_f32(in_low),
- vld1q_f32(in_low + 4),
- vld1q_f32(in_low + 8)
- }
- };
- float32x4x2_t out =
- {
- {
- vmulq_f32(vtop.val[0], m0.val[0]),
- vmulq_f32(vtop.val[1], m0.val[0])
- }
- };
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]);
+ const float32x4x3_t vtop = {{vld1q_f32(in_top), vld1q_f32(in_top + 4), vld1q_f32(in_top + 8)}};
+ const float32x4x3_t vmid = {{vld1q_f32(in_mid), vld1q_f32(in_mid + 4), vld1q_f32(in_mid + 8)}};
+ const float32x4x3_t vlow = {{vld1q_f32(in_low), vld1q_f32(in_low + 4), vld1q_f32(in_low + 8)}};
+ float32x4x2_t out = {{vmulq_f32(vtop.val[0], m0.val[0]), vmulq_f32(vtop.val[1], m0.val[0])}};
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]);
+ out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]);
out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]);
out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 1), m1.val[1]);
@@ -106,7 +82,12 @@ inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, c
}
template <>
-inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)
+inline float32x4x2_t convolve_3x3<2>(const float *in_top,
+ const float *in_mid,
+ const float *in_low,
+ const float32x4x3_t &m0,
+ const float32x4x3_t &m1,
+ const float32x4x3_t &m2)
{
float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2);
out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
@@ -116,7 +97,12 @@ inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, c
}
template <>
-inline float32x4x2_t convolve_3x3<3>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)
+inline float32x4x2_t convolve_3x3<3>(const float *in_top,
+ const float *in_mid,
+ const float *in_low,
+ const float32x4x3_t &m0,
+ const float32x4x3_t &m1,
+ const float32x4x3_t &m2)
{
float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2);
out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
@@ -165,6 +151,6 @@ int get_input_num_elems_processed<3>(unsigned int num_elems_written_per_iteratio
{
return num_elems_written_per_iteration * 3;
}
-}
+} // namespace detail
} // namespace arm_compute
-#endif /* ARM_COMPUTE_NECONVOLUTIONKERNEL3x3_H */
\ No newline at end of file
+#endif /* ARM_COMPUTE_NECONVOLUTIONKERNEL3x3_H */
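[Editor's note] The stride-2 and stride-3 specialisations above do not recompute the convolution; they run the stride-1 version and then compact the surviving lanes with vgetq_lane/vsetq_lane. A scalar equivalent of the stride-2 compaction, under that reading:

    // Keep every second stride-1 output (indices 0, 2, 4, 6 of the 8 results).
    void compact_stride2(const float stride1_out[8], float out[4])
    {
        for (int i = 0; i < 4; ++i)
        {
            out[i] = stride1_out[2 * i];
        }
    }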
diff --git a/src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h b/src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h
index 7ba52a16b7..fd1ee54597 100644
--- a/src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h
+++ b/src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h
@@ -45,14 +45,7 @@ namespace detail
inline float32x4x3_t load_matrix_row(const float *ptr, int weights_offset = 0)
{
ARM_COMPUTE_UNUSED(weights_offset);
- const float32x4x3_t r =
- {
- {
- vld1q_dup_f32(ptr),
- vld1q_dup_f32(1 + ptr),
- vld1q_dup_f32(2 + ptr)
- }
- };
+ const float32x4x3_t r = {{vld1q_dup_f32(ptr), vld1q_dup_f32(1 + ptr), vld1q_dup_f32(2 + ptr)}};
return r;
}
@@ -63,21 +56,16 @@ inline float32x4x3_t load_matrix_row(const float *ptr, int weights_offset = 0)
*
* @return The loaded matrix.
*/
-template < typename T, ARM_COMPUTE_REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value) >
+template <typename T, ARM_COMPUTE_REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value)>
inline int32x4x3_t load_matrix_row(const T *ptr, int weights_offset = 0)
{
const int32x4_t v_weights_offset = vdupq_n_s32(weights_offset);
/* ptr is a pointer to a row in a 3x3 matrix; the function returns 3 vectors holding exactly the same value in all lanes:
r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */
- int32x4x3_t r =
- {
- {
- vaddq_s32(v_weights_offset, vdupq_n_s32(*ptr)),
- vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 1))),
- vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 2)))
- }
- };
+ int32x4x3_t r = {{vaddq_s32(v_weights_offset, vdupq_n_s32(*ptr)),
+ vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 1))),
+ vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 2)))}};
return r;
}
@@ -245,36 +233,23 @@ inline void accumulate_results<3>(float16_t *buffer, const float16x8x2_t &values
* @param[in] input_offset (Optional) Input quantization offset.
*
*/
-inline float32x4_t single_convolve_3x3_dilation(const float *in_top, const float *in_mid, const float *in_low,
- const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2,
- const size_t dilation_x, int input_offset)
+inline float32x4_t single_convolve_3x3_dilation(const float *in_top,
+ const float *in_mid,
+ const float *in_low,
+ const float32x4x3_t &m0,
+ const float32x4x3_t &m1,
+ const float32x4x3_t &m2,
+ const size_t dilation_x,
+ int input_offset)
{
ARM_COMPUTE_UNUSED(input_offset);
- const float32x4x3_t vtop =
- {
- {
- vld1q_f32(in_top),
- vld1q_f32(in_top + dilation_x),
- vld1q_f32(in_top + 2 * dilation_x)
- }
- };
- const float32x4x3_t vmid =
- {
- {
- vld1q_f32(in_mid),
- vld1q_f32(in_mid + dilation_x),
- vld1q_f32(in_mid + 2 * dilation_x)
- }
- };
- const float32x4x3_t vlow =
- {
- {
- vld1q_f32(in_low),
- vld1q_f32(in_low + dilation_x),
- vld1q_f32(in_low + 2 * dilation_x)
- }
- };
+ const float32x4x3_t vtop = {
+ {vld1q_f32(in_top), vld1q_f32(in_top + dilation_x), vld1q_f32(in_top + 2 * dilation_x)}};
+ const float32x4x3_t vmid = {
+ {vld1q_f32(in_mid), vld1q_f32(in_mid + dilation_x), vld1q_f32(in_mid + 2 * dilation_x)}};
+ const float32x4x3_t vlow = {
+ {vld1q_f32(in_low), vld1q_f32(in_low + dilation_x), vld1q_f32(in_low + 2 * dilation_x)}};
float32x4_t out = vmulq_f32(vtop.val[0], m0.val[0]);
out = vmlaq_f32(out, vtop.val[1], m0.val[1]);
out = vmlaq_f32(out, vtop.val[2], m0.val[2]);
@@ -303,26 +278,28 @@ inline float32x4_t single_convolve_3x3_dilation(const float *in_top, const float
* @param[in] input_offset (Optional) Input quantization offset.
*
*/
-inline float32x4x2_t convolve_3x3_dilation(const float *in_top, const float *in_mid, const float *in_low,
- const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2,
- const size_t dilation_x, unsigned int stridex, int input_offset = 0)
+inline float32x4x2_t convolve_3x3_dilation(const float *in_top,
+ const float *in_mid,
+ const float *in_low,
+ const float32x4x3_t &m0,
+ const float32x4x3_t &m1,
+ const float32x4x3_t &m2,
+ const size_t dilation_x,
+ unsigned int stridex,
+ int input_offset = 0)
{
ARM_COMPUTE_ERROR_ON(stridex > 3);
- float32x4x2_t out =
- {
- {
- single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),
- single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset)
- }
- };
+ float32x4x2_t out = {
+ {single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),
+ single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset)}};
- if(stridex == 2)
+ if (stridex == 2)
{
out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2);
out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3);
}
- else if(stridex == 3)
+ else if (stridex == 3)
{
out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
}
@@ -344,26 +321,32 @@ inline float32x4x2_t convolve_3x3_dilation(const float *in_top, const float *in_
*
*/
template <bool accumulate>
-void convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, float *out_ptr,
- const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2,
- unsigned int stridex, int input_offset = 0);
+void convolve_3x3(const float *in_top,
+ const float *in_mid,
+ const float *in_low,
+ float *out_ptr,
+ const float32x4x3_t &m0,
+ const float32x4x3_t &m1,
+ const float32x4x3_t &m2,
+ unsigned int stridex,
+ int input_offset = 0);
template <bool accumulate>
-inline void convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, float *out_ptr,
- const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2,
- unsigned int stridex, int input_offset)
+inline void convolve_3x3(const float *in_top,
+ const float *in_mid,
+ const float *in_low,
+ float *out_ptr,
+ const float32x4x3_t &m0,
+ const float32x4x3_t &m1,
+ const float32x4x3_t &m2,
+ unsigned int stridex,
+ int input_offset)
{
ARM_COMPUTE_UNUSED(input_offset);
ARM_COMPUTE_ERROR_ON(stridex > 3);
- float32x4x2_t out =
- {
- {
- vdupq_n_f32(0.f),
- vdupq_n_f32(0.f)
- }
- };
- if(stridex == 2)
+ float32x4x2_t out = {{vdupq_n_f32(0.f), vdupq_n_f32(0.f)}};
+ if (stridex == 2)
{
const float32x4x2_t vtop = vld2q_f32(in_top);
const float32x4x2_t vmid = vld2q_f32(in_mid);
@@ -389,32 +372,11 @@ inline void convolve_3x3(const float *in_top, const float *in_mid, const float *
}
else
{
- const float32x4x3_t vtop =
- {
- {
- vld1q_f32(in_top),
- vld1q_f32(in_top + 4),
- vld1q_f32(in_top + 8)
- }
- };
- const float32x4x3_t vmid =
- {
- {
- vld1q_f32(in_mid),
- vld1q_f32(in_mid + 4),
- vld1q_f32(in_mid + 8)
- }
- };
- const float32x4x3_t vlow =
- {
- {
- vld1q_f32(in_low),
- vld1q_f32(in_low + 4),
- vld1q_f32(in_low + 8)
- }
- };
- out.val[0] = vmulq_f32(vtop.val[0], m0.val[0]);
- out.val[1] = vmulq_f32(vtop.val[1], m0.val[0]);
+ const float32x4x3_t vtop = {{vld1q_f32(in_top), vld1q_f32(in_top + 4), vld1q_f32(in_top + 8)}};
+ const float32x4x3_t vmid = {{vld1q_f32(in_mid), vld1q_f32(in_mid + 4), vld1q_f32(in_mid + 8)}};
+ const float32x4x3_t vlow = {{vld1q_f32(in_low), vld1q_f32(in_low + 4), vld1q_f32(in_low + 8)}};
+ out.val[0] = vmulq_f32(vtop.val[0], m0.val[0]);
+ out.val[1] = vmulq_f32(vtop.val[1], m0.val[0]);
out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]);
out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]);
@@ -438,7 +400,7 @@ inline void convolve_3x3(const float *in_top, const float *in_mid, const float *
out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 1), m2.val[1]);
out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 2), m2.val[2]);
- if(stridex == 3)
+ if (stridex == 3)
{
out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
accumulate ? accumulate_results<3>(out_ptr, out) : store_results<3>(out_ptr, out);
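
Because accumulate is a template parameter, the accumulate_results/store_results selection above costs nothing at run time; the conditional folds away when the template is instantiated. A minimal sketch of the same dispatch (function and names are illustrative):

#include <cstddef>

// Minimal sketch of the compile-time store/accumulate dispatch used above:
// write_back<false> overwrites, write_back<true> adds. The branch on the
// template parameter is resolved at instantiation time.
template <bool accumulate>
void write_back(float *out_ptr, const float *res, std::size_t n)
{
    for (std::size_t i = 0; i < n; ++i)
    {
        if (accumulate)
        {
            out_ptr[i] += res[i]; // accumulate_results<>-style path
        }
        else
        {
            out_ptr[i] = res[i]; // store_results<>-style path
        }
    }
}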
@@ -462,65 +424,43 @@ inline void convolve_3x3(const float *in_top, const float *in_mid, const float *
* @param[in] input_offset Input quantization offset.
*
*/
-template < typename T, ARM_COMPUTE_REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value) >
-inline int32x4_t single_convolve_3x3_dilation(const T *in_top, const T *in_mid, const T *in_low,
- const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2,
- size_t dilation_x, int32_t input_offset)
+template <typename T, ARM_COMPUTE_REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value)>
+inline int32x4_t single_convolve_3x3_dilation(const T *in_top,
+ const T *in_mid,
+ const T *in_low,
+ const int32x4x3_t &m0,
+ const int32x4x3_t &m1,
+ const int32x4x3_t &m2,
+ size_t dilation_x,
+ int32_t input_offset)
{
using VectorType = typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x3_t, int8x8x3_t>::type;
using OutputTagType = typename wrapper::traits::neon_bitvector_tag_t<int32_t, wrapper::traits::BitWidth::W128>;
const int32x4_t v_input_offset = wrapper::vdup_n(input_offset, OutputTagType{});
- const VectorType vtop =
- {
- {
- wrapper::vload(in_top),
- wrapper::vload(in_top + dilation_x),
- wrapper::vload(in_top + 2 * dilation_x)
- }
- };
- const VectorType vmid =
- {
- {
- wrapper::vload(in_mid),
- wrapper::vload(in_mid + dilation_x),
- wrapper::vload(in_mid + 2 * dilation_x)
- }
- };
- const VectorType vlow =
- {
- {
- wrapper::vload(in_low),
- wrapper::vload(in_low + dilation_x),
- wrapper::vload(in_low + 2 * dilation_x)
- }
- };
-
- const int32x4x3_t vtop_s32 =
- {
- {
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[2])))),
- }
- };
- const int32x4x3_t vmid_s32 =
- {
- {
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[2])))),
- }
- };
- const int32x4x3_t vlow_s32 =
- {
- {
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[2])))),
- }
- };
+ const VectorType vtop = {
+ {wrapper::vload(in_top), wrapper::vload(in_top + dilation_x), wrapper::vload(in_top + 2 * dilation_x)}};
+ const VectorType vmid = {
+ {wrapper::vload(in_mid), wrapper::vload(in_mid + dilation_x), wrapper::vload(in_mid + 2 * dilation_x)}};
+ const VectorType vlow = {
+ {wrapper::vload(in_low), wrapper::vload(in_low + dilation_x), wrapper::vload(in_low + 2 * dilation_x)}};
+
+ const int32x4x3_t vtop_s32 = {{
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))),
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))),
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[2])))),
+ }};
+ const int32x4x3_t vmid_s32 = {{
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))),
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))),
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[2])))),
+ }};
+ const int32x4x3_t vlow_s32 = {{
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))),
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))),
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[2])))),
+ }};
int32x4_t out = wrapper::vmul(vtop_s32.val[0], m0.val[0]);
out = wrapper::vmla(out, vtop_s32.val[1], m0.val[1]);
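
The vmovl/vaddw chains above widen each 8-bit sample to int32 and fold in the input quantization offset before any multiply-accumulate, so the arithmetic runs on offset-corrected values. The per-element scalar equivalent, for illustration:

#include <cstdint>

// Scalar equivalent of the vmovl/vaddw chain above: widen an 8-bit input
// sample to int32 and add the input quantization offset so the
// multiply-accumulate runs on offset-corrected values.
template <typename T> // T is uint8_t or int8_t
inline int32_t widen_with_offset(T x, int32_t input_offset)
{
    return static_cast<int32_t>(x) + input_offset;
}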
@@ -550,26 +490,29 @@ inline int32x4_t single_convolve_3x3_dilation(const T *in_top, const T *in_mid,
* @param[in] input_offset Input quantization offset.
*
*/
-template < typename T, ARM_COMPUTE_REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value) >
-inline int32x4x2_t convolve_3x3_dilation(const T *in_top, const T *in_mid, const T *in_low, const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2,
- const size_t dilation_x, unsigned int stridex, int input_offset)
+template <typename T, ARM_COMPUTE_REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value)>
+inline int32x4x2_t convolve_3x3_dilation(const T *in_top,
+ const T *in_mid,
+ const T *in_low,
+ const int32x4x3_t &m0,
+ const int32x4x3_t &m1,
+ const int32x4x3_t &m2,
+ const size_t dilation_x,
+ unsigned int stridex,
+ int input_offset)
{
ARM_COMPUTE_ERROR_ON(stridex > 3);
- int32x4x2_t out =
- {
- {
- single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),
- single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset)
- }
- };
+ int32x4x2_t out = {
+ {single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),
+ single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset)}};
- if(stridex == 2)
+ if (stridex == 2)
{
out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 2), out.val[0], 1);
out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 0), out.val[0], 2);
out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 2), out.val[0], 3);
}
- else if(stridex == 3)
+ else if (stridex == 3)
{
out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 3), out.val[0], 1);
}
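
With dilation, the three taps of each row are read dilation_x elements apart rather than adjacently, which is exactly the in_ptr, in_ptr + dilation_x, in_ptr + 2 * dilation_x addressing in single_convolve_3x3_dilation above. A scalar reference for one output of one row, assuming offset-corrected int32 samples (names illustrative):

#include <cstddef>
#include <cstdint>

// Scalar reference for one row of the dilated 3x3 convolution: the three
// taps sit dilation_x elements apart in the input row.
inline int32_t dot3_dilated(const int32_t *row, const int32_t *weights, std::size_t dilation_x)
{
    return row[0] * weights[0] + row[dilation_x] * weights[1] + row[2 * dilation_x] * weights[2];
}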
@@ -589,10 +532,19 @@ inline int32x4x2_t convolve_3x3_dilation(const T *in_top, const T *in_mid, const
* @param[in] input_offset Input quantization offset.
*
*/
-template < bool accumulate, typename T1, typename T2, ARM_COMPUTE_REQUIRES_TA(std::is_same<T1, uint8_t>::value || std::is_same<T1, int8_t>::value) >
-void convolve_3x3(const T1 *in_top, const T1 *in_mid, const T1 *in_low, T2 *out_ptr,
- const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2,
- unsigned int stridex, int32_t input_offset)
+template <bool accumulate,
+ typename T1,
+ typename T2,
+ ARM_COMPUTE_REQUIRES_TA(std::is_same<T1, uint8_t>::value || std::is_same<T1, int8_t>::value)>
+void convolve_3x3(const T1 *in_top,
+ const T1 *in_mid,
+ const T1 *in_low,
+ T2 *out_ptr,
+ const int32x4x3_t &m0,
+ const int32x4x3_t &m1,
+ const int32x4x3_t &m2,
+ unsigned int stridex,
+ int32_t input_offset)
{
ARM_COMPUTE_ERROR_ON(stridex > 3);
using VectorType = typename std::conditional<std::is_same<T1, uint8_t>::value, uint8x8x2_t, int8x8x2_t>::type;
@@ -600,60 +552,30 @@ void convolve_3x3(const T1 *in_top, const T1 *in_mid, const T1 *in_low, T2 *out_
const int32x4_t v_input_offset = wrapper::vdup_n(input_offset, OutputTagType{});
- const VectorType vtop =
- {
- {
- wrapper::vload(in_top),
- wrapper::vload(in_top + 8)
- }
- };
- const VectorType vmid =
- {
- {
- wrapper::vload(in_mid),
- wrapper::vload(in_mid + 8)
- }
- };
- const VectorType vlow =
- {
- {
- wrapper::vload(in_low),
- wrapper::vload(in_low + 8)
- }
- };
-
- const int32x4x3_t vtop_s32 =
- {
- {
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vtop.val[0])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))),
- }
- };
- const int32x4x3_t vmid_s32 =
- {
- {
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vmid.val[0])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))),
- }
- };
- const int32x4x3_t vlow_s32 =
- {
- {
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vlow.val[0])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))),
- }
- };
-
- int32x4x2_t out
- {
- {
- wrapper::vdup_n(static_cast<int32_t>(0), OutputTagType{}),
- wrapper::vdup_n(static_cast<int32_t>(0), OutputTagType{}),
- }
- };
+ const VectorType vtop = {{wrapper::vload(in_top), wrapper::vload(in_top + 8)}};
+ const VectorType vmid = {{wrapper::vload(in_mid), wrapper::vload(in_mid + 8)}};
+ const VectorType vlow = {{wrapper::vload(in_low), wrapper::vload(in_low + 8)}};
+
+ const int32x4x3_t vtop_s32 = {{
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))),
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vtop.val[0])))),
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))),
+ }};
+ const int32x4x3_t vmid_s32 = {{
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))),
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vmid.val[0])))),
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))),
+ }};
+ const int32x4x3_t vlow_s32 = {{
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))),
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vlow.val[0])))),
+ wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))),
+ }};
+
+ int32x4x2_t out{{
+ wrapper::vdup_n(static_cast<int32_t>(0), OutputTagType{}),
+ wrapper::vdup_n(static_cast<int32_t>(0), OutputTagType{}),
+ }};
// 0
out.val[0] = wrapper::vmla(out.val[0], vtop_s32.val[0], m0.val[0]);
@@ -681,11 +603,11 @@ void convolve_3x3(const T1 *in_top, const T1 *in_mid, const T1 *in_low, T2 *out_
out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_1(vlow_s32.val[1], vlow_s32.val[2]), m2.val[1]);
out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_2(vlow_s32.val[1], vlow_s32.val[2]), m2.val[2]);
- if(stridex == 1)
+ if (stridex == 1)
{
accumulate ? accumulate_results<1>(out_ptr, out) : store_results<1>(out_ptr, out);
}
- else if(stridex == 2)
+ else if (stridex == 2)
{
out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 2), out.val[0], 1);
out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 0), out.val[0], 2);
@@ -693,7 +615,7 @@ void convolve_3x3(const T1 *in_top, const T1 *in_mid, const T1 *in_low, T2 *out_
accumulate ? accumulate_results<2>(out_ptr, out) : store_results<2>(out_ptr, out);
}
- else if(stridex == 3)
+ else if (stridex == 3)
{
out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 3), out.val[0], 1);
accumulate ? accumulate_results<3>(out_ptr, out) : store_results<3>(out_ptr, out);
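
The wrapper::vext_1/vext_2 calls synthesize the x + 1 and x + 2 taps of the sliding window from two adjacent registers. For reference, a scalar version of the 3x3 accumulation these kernels vectorize, with m0..m2 passed as plain 3-element rows (illustrative, not the library signature):

#include <cstddef>
#include <cstdint>

// Scalar reference of the 3x3 accumulation vectorized above: taps x, x + 1,
// x + 2 of each of the three input rows, weighted by the matching matrix row.
inline int32_t convolve_3x3_ref(
    const int32_t *top, const int32_t *mid, const int32_t *low,
    const int32_t *m0, const int32_t *m1, const int32_t *m2)
{
    int32_t acc = 0;
    for (std::size_t k = 0; k < 3; ++k)
    {
        acc += top[k] * m0[k] + mid[k] * m1[k] + low[k] * m2[k];
    }
    return acc;
}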
@@ -712,14 +634,7 @@ inline float16x8x3_t load_matrix_row(const float16_t *ptr, int weights_offset =
ARM_COMPUTE_UNUSED(weights_offset);
    /* ptr is a pointer to a row of a 3x3 matrix; the function returns 3 vectors, each holding one element of that row in all lanes:
       r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */
- const float16x8x3_t r =
- {
- {
- vld1q_dup_f16(ptr),
- vld1q_dup_f16(1 + ptr),
- vld1q_dup_f16(2 + ptr)
- }
- };
+ const float16x8x3_t r = {{vld1q_dup_f16(ptr), vld1q_dup_f16(1 + ptr), vld1q_dup_f16(2 + ptr)}};
return r;
}
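
vld1q_dup_f16 broadcasts each of the three row weights into every lane of its own vector, so one weight can multiply a whole vector of neighbouring inputs at once. A scalar picture of that layout, with the vector modelled as an array (names and template are illustrative):

#include <array>
#include <cstddef>

// Scalar picture of load_matrix_row(): each of the three weights in the row
// is duplicated into every lane of its own vector, modelled here as an array.
template <typename T, std::size_t Lanes>
std::array<std::array<T, Lanes>, 3> broadcast_row(const T *ptr)
{
    std::array<std::array<T, Lanes>, 3> r{};
    for (std::size_t w = 0; w < 3; ++w)
    {
        r[w].fill(ptr[w]); // vld1q_dup-style broadcast of weight w
    }
    return r;
}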
@@ -735,35 +650,22 @@ inline float16x8x3_t load_matrix_row(const float16_t *ptr, int weights_offset =
 * @param[in] input_offset (Optional) Input quantization offset.
*
*/
-inline float16x8_t single_convolve_3x3_dilation(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low,
- const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
- const size_t dilation_x, int input_offset = 0)
+inline float16x8_t single_convolve_3x3_dilation(const float16_t *in_top,
+ const float16_t *in_mid,
+ const float16_t *in_low,
+ const float16x8x3_t &m0,
+ const float16x8x3_t &m1,
+ const float16x8x3_t &m2,
+ const size_t dilation_x,
+ int input_offset = 0)
{
ARM_COMPUTE_UNUSED(input_offset);
- const float16x8x3_t vtop =
- {
- {
- vld1q_f16(in_top),
- vld1q_f16(in_top + dilation_x),
- vld1q_f16(in_top + 2 * dilation_x)
- }
- };
- const float16x8x3_t vmid =
- {
- {
- vld1q_f16(in_mid),
- vld1q_f16(in_mid + dilation_x),
- vld1q_f16(in_mid + 2 * dilation_x)
- }
- };
- const float16x8x3_t vlow =
- {
- {
- vld1q_f16(in_low),
- vld1q_f16(in_low + dilation_x),
- vld1q_f16(in_low + 2 * dilation_x)
- }
- };
+ const float16x8x3_t vtop = {
+ {vld1q_f16(in_top), vld1q_f16(in_top + dilation_x), vld1q_f16(in_top + 2 * dilation_x)}};
+ const float16x8x3_t vmid = {
+ {vld1q_f16(in_mid), vld1q_f16(in_mid + dilation_x), vld1q_f16(in_mid + 2 * dilation_x)}};
+ const float16x8x3_t vlow = {
+ {vld1q_f16(in_low), vld1q_f16(in_low + dilation_x), vld1q_f16(in_low + 2 * dilation_x)}};
float16x8_t out = vmulq_f16(vtop.val[0], m0.val[0]);
out = vaddq_f16(out, vmulq_f16(vtop.val[1], m0.val[1]));
out = vaddq_f16(out, vmulq_f16(vtop.val[2], m0.val[2]));
@@ -792,19 +694,21 @@ inline float16x8_t single_convolve_3x3_dilation(const float16_t *in_top, const f
* @param[in] input_offset (Optional) Input quantization offset.
*
*/
-inline float16x8x2_t convolve_3x3_dilation(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low,
- const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
- const size_t dilation_x, unsigned int stridex, int input_offset = 0)
-{
- float16x8x2_t out =
- {
- {
- single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),
- single_convolve_3x3_dilation(in_top + 8, in_mid + 8, in_low + 8, m0, m1, m2, dilation_x, input_offset)
- }
- };
-
- if(stridex == 2)
+inline float16x8x2_t convolve_3x3_dilation(const float16_t *in_top,
+ const float16_t *in_mid,
+ const float16_t *in_low,
+ const float16x8x3_t &m0,
+ const float16x8x3_t &m1,
+ const float16x8x3_t &m2,
+ const size_t dilation_x,
+ unsigned int stridex,
+ int input_offset = 0)
+{
+ float16x8x2_t out = {
+ {single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),
+ single_convolve_3x3_dilation(in_top + 8, in_mid + 8, in_low + 8, m0, m1, m2, dilation_x, input_offset)}};
+
+ if (stridex == 2)
{
out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 2), out.val[0], 1);
out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 4), out.val[0], 2);
@@ -814,7 +718,7 @@ inline float16x8x2_t convolve_3x3_dilation(const float16_t *in_top, const float1
out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 4), out.val[0], 6);
out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 6), out.val[0], 7);
}
- else if(stridex == 3)
+ else if (stridex == 3)
{
out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 3), out.val[0], 1);
out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 6), out.val[0], 2);
@@ -838,20 +742,20 @@ inline float16x8x2_t convolve_3x3_dilation(const float16_t *in_top, const float1
*
*/
template <bool accumulate>
-inline void convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, float16_t *out_ptr,
- const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
- unsigned int stridex, int input_offset = 0)
+inline void convolve_3x3(const float16_t *in_top,
+ const float16_t *in_mid,
+ const float16_t *in_low,
+ float16_t *out_ptr,
+ const float16x8x3_t &m0,
+ const float16x8x3_t &m1,
+ const float16x8x3_t &m2,
+ unsigned int stridex,
+ int input_offset = 0)
{
ARM_COMPUTE_UNUSED(input_offset);
- float16x8x2_t out =
- {
- {
- vdupq_n_f16(0),
- vdupq_n_f16(0)
- }
- };
- if(stridex == 2)
+ float16x8x2_t out = {{vdupq_n_f16(0), vdupq_n_f16(0)}};
+ if (stridex == 2)
{
const float16x8x2_t vtop = vld2q_f16(in_top);
const float16x8x2_t vmid = vld2q_f16(in_mid);
@@ -877,32 +781,11 @@ inline void convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const
}
else
{
- const float16x8x3_t vtop =
- {
- {
- vld1q_f16(in_top),
- vld1q_f16(in_top + 8),
- vld1q_f16(in_top + 16)
- }
- };
- const float16x8x3_t vmid =
- {
- {
- vld1q_f16(in_mid),
- vld1q_f16(in_mid + 8),
- vld1q_f16(in_mid + 16)
- }
- };
- const float16x8x3_t vlow =
- {
- {
- vld1q_f16(in_low),
- vld1q_f16(in_low + 8),
- vld1q_f16(in_low + 16)
- }
- };
- out.val[0] = vmulq_f16(vtop.val[0], m0.val[0]);
- out.val[1] = vmulq_f16(vtop.val[1], m0.val[0]);
+ const float16x8x3_t vtop = {{vld1q_f16(in_top), vld1q_f16(in_top + 8), vld1q_f16(in_top + 16)}};
+ const float16x8x3_t vmid = {{vld1q_f16(in_mid), vld1q_f16(in_mid + 8), vld1q_f16(in_mid + 16)}};
+ const float16x8x3_t vlow = {{vld1q_f16(in_low), vld1q_f16(in_low + 8), vld1q_f16(in_low + 16)}};
+ out.val[0] = vmulq_f16(vtop.val[0], m0.val[0]);
+ out.val[1] = vmulq_f16(vtop.val[1], m0.val[0]);
out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vtop.val[0], vtop.val[1], 1), m0.val[1]));
out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vtop.val[0], vtop.val[1], 2), m0.val[2]));
@@ -921,7 +804,7 @@ inline void convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const
out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vlow.val[1], vlow.val[2], 1), m2.val[1]));
out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vlow.val[1], vlow.val[2], 2), m2.val[2]));
- if(stridex == 3)
+ if (stridex == 3)
{
out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 3), out.val[0], 1);
out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 6), out.val[0], 2);
@@ -946,7 +829,7 @@ inline void convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const
*/
inline int get_input_num_elems_processed(unsigned int num_elems_written_per_iteration, unsigned int stridex)
{
- switch(stridex)
+ switch (stridex)
{
case 1:
return num_elems_written_per_iteration;
@@ -959,6 +842,6 @@ inline int get_input_num_elems_processed(unsigned int num_elems_written_per_iter
return 0;
}
}
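
The switch cases elided by the hunk presumably return two and three times the written count, consistent with the stride decimation above; assuming so, the mapping reduces to a closed form:

// Assumed closed form of get_input_num_elems_processed() for the supported
// strides (1, 2, 3): each output element advances the input by stridex.
// This matches case 1 shown above; cases 2 and 3 are elided by the hunk and
// are assumed to follow the same pattern.
inline int input_elems_processed_closed_form(unsigned int num_elems_written_per_iteration, unsigned int stridex)
{
    return static_cast<int>(num_elems_written_per_iteration * stridex);
}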
-}
+} // namespace detail
} // namespace arm_compute
#endif /* ARM_COMPUTE_NEDIRECTCONVOLUTIONDETAIL_H */