From afd38f0c617d6f89b2b4532c6c44f116617e2b6f Mon Sep 17 00:00:00 2001 From: Felix Thomasmathibalan Date: Wed, 27 Sep 2023 17:46:17 +0100 Subject: Apply clang-format on repository Code is formatted as per a revised clang format configuration file(not part of this delivery). Version 14.0.6 is used. Exclusion List: - files with .cl extension - files that are not strictly C/C++ (e.g. Android.bp, Sconscript ...) And the following directories - compute_kernel_writer/validation/ - tests/ - include/ - src/core/NEON/kernels/convolution/ - src/core/NEON/kernels/arm_gemm/ - src/core/NEON/kernels/arm_conv/ - data/ There will be a follow up for formatting of .cl files and the files under tests/ and compute_kernel_writer/validation/. Signed-off-by: Felix Thomasmathibalan Change-Id: Ib7eb1fcf4e7537b9feaefcfc15098a804a3fde0a Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10391 Benchmark: Arm Jenkins Tested-by: Arm Jenkins Reviewed-by: Gunes Bayir --- src/runtime/NEON/functions/NEActivationLayer.cpp | 17 +- src/runtime/NEON/functions/NEAddMulAdd.cpp | 44 +- src/runtime/NEON/functions/NEArgMinMaxLayer.cpp | 15 +- .../NEON/functions/NEArithmeticAddition.cpp | 26 +- .../NEON/functions/NEArithmeticSubtraction.cpp | 26 +- .../NEON/functions/NEBatchNormalizationLayer.cpp | 25 +- src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp | 13 +- src/runtime/NEON/functions/NEBitwiseAnd.cpp | 3 +- src/runtime/NEON/functions/NEBitwiseNot.cpp | 3 +- src/runtime/NEON/functions/NEBitwiseOr.cpp | 3 +- src/runtime/NEON/functions/NEBitwiseXor.cpp | 3 +- .../NEON/functions/NEBoundingBoxTransform.cpp | 11 +- src/runtime/NEON/functions/NECast.cpp | 14 +- .../NEON/functions/NEChannelShuffleLayer.cpp | 1 + src/runtime/NEON/functions/NEConcatenateLayer.cpp | 30 +- src/runtime/NEON/functions/NEConv3D.cpp | 27 +- .../functions/NEConvertFullyConnectedWeights.cpp | 24 +- src/runtime/NEON/functions/NEConvolutionLayer.cpp | 84 +- src/runtime/NEON/functions/NECopy.cpp | 12 +- src/runtime/NEON/functions/NECropResize.cpp | 54 +- .../NEON/functions/NEDeconvolutionLayer.cpp | 116 ++- src/runtime/NEON/functions/NEDepthConvertLayer.cpp | 17 +- src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp | 1 + .../NEON/functions/NEDepthwiseConvolutionLayer.cpp | 193 ++-- .../NEON/functions/NEDequantizationLayer.cpp | 10 +- .../NEON/functions/NEDetectionPostProcessLayer.cpp | 60 +- .../NEON/functions/NEDirectConvolutionLayer.cpp | 27 +- .../NEON/functions/NEElementwiseOperations.cpp | 152 +-- .../NEON/functions/NEElementwiseUnaryLayer.cpp | 15 +- src/runtime/NEON/functions/NEFFT1D.cpp | 29 +- src/runtime/NEON/functions/NEFFT2D.cpp | 8 +- .../NEON/functions/NEFFTConvolutionLayer.cpp | 105 +- src/runtime/NEON/functions/NEFill.cpp | 10 +- src/runtime/NEON/functions/NEFillBorder.cpp | 9 +- src/runtime/NEON/functions/NEFlattenLayer.cpp | 22 +- src/runtime/NEON/functions/NEFloor.cpp | 12 +- .../NEON/functions/NEFullyConnectedLayer.cpp | 77 +- .../NEON/functions/NEFuseBatchNormalization.cpp | 42 +- src/runtime/NEON/functions/NEGEMM.cpp | 62 +- src/runtime/NEON/functions/NEGEMMConv2d.cpp | 39 +- .../NEON/functions/NEGEMMConvolutionLayer.cpp | 76 +- .../functions/NEGEMMLowpMatrixMultiplyCore.cpp | 61 +- .../NEON/functions/NEGEMMLowpOutputStage.cpp | 33 +- src/runtime/NEON/functions/NEGather.cpp | 3 +- .../NEON/functions/NEGenerateProposalsLayer.cpp | 187 ++-- .../functions/NEInstanceNormalizationLayer.cpp | 26 +- src/runtime/NEON/functions/NEL2NormalizeLayer.cpp | 4 +- src/runtime/NEON/functions/NELSTMLayer.cpp | 510 +++++---- .../NEON/functions/NELSTMLayerQuantized.cpp | 383 +++++-- src/runtime/NEON/functions/NELogical.cpp | 12 +- src/runtime/NEON/functions/NEMatMul.cpp | 28 +- src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp | 24 +- .../functions/NEMeanStdDevNormalizationLayer.cpp | 3 +- .../NEON/functions/NENormalizationLayer.cpp | 10 +- src/runtime/NEON/functions/NEPReluLayer.cpp | 14 +- src/runtime/NEON/functions/NEPadLayer.cpp | 90 +- src/runtime/NEON/functions/NEPermute.cpp | 10 +- .../NEON/functions/NEPixelWiseMultiplication.cpp | 50 +- src/runtime/NEON/functions/NEPooling3dLayer.cpp | 17 +- src/runtime/NEON/functions/NEPoolingLayer.cpp | 21 +- src/runtime/NEON/functions/NEPriorBoxLayer.cpp | 13 +- src/runtime/NEON/functions/NEQLSTMLayer.cpp | 1082 ++++++++++++-------- src/runtime/NEON/functions/NEQuantizationLayer.cpp | 10 +- src/runtime/NEON/functions/NERNNLayer.cpp | 44 +- src/runtime/NEON/functions/NEROIAlignLayer.cpp | 10 +- src/runtime/NEON/functions/NEROIPoolingLayer.cpp | 17 +- src/runtime/NEON/functions/NERange.cpp | 6 +- src/runtime/NEON/functions/NEReduceMean.cpp | 55 +- .../NEON/functions/NEReductionOperation.cpp | 75 +- src/runtime/NEON/functions/NEReorderLayer.cpp | 19 +- src/runtime/NEON/functions/NEReorgLayer.cpp | 3 +- src/runtime/NEON/functions/NEReshapeLayer.cpp | 12 +- src/runtime/NEON/functions/NEReverse.cpp | 8 +- src/runtime/NEON/functions/NEScale.cpp | 48 +- src/runtime/NEON/functions/NESelect.cpp | 1 + src/runtime/NEON/functions/NESlice.cpp | 35 +- src/runtime/NEON/functions/NESoftmaxLayer.cpp | 21 +- src/runtime/NEON/functions/NESpaceToBatchLayer.cpp | 38 +- src/runtime/NEON/functions/NESpaceToDepthLayer.cpp | 4 +- src/runtime/NEON/functions/NESplit.cpp | 2 +- src/runtime/NEON/functions/NEStackLayer.cpp | 11 +- src/runtime/NEON/functions/NEStridedSlice.cpp | 59 +- src/runtime/NEON/functions/NETile.cpp | 3 +- src/runtime/NEON/functions/NETranspose.cpp | 10 +- src/runtime/NEON/functions/NEUnstack.cpp | 34 +- .../NEON/functions/NEWinogradConvolutionLayer.cpp | 44 +- 86 files changed, 2933 insertions(+), 1734 deletions(-) (limited to 'src/runtime/NEON/functions') diff --git a/src/runtime/NEON/functions/NEActivationLayer.cpp b/src/runtime/NEON/functions/NEActivationLayer.cpp index e48aede590..59199452ce 100644 --- a/src/runtime/NEON/functions/NEActivationLayer.cpp +++ b/src/runtime/NEON/functions/NEActivationLayer.cpp @@ -24,24 +24,24 @@ #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" #include "arm_compute/core/Validate.h" + #include "src/cpu/operators/CpuActivation.h" namespace arm_compute { struct NEActivationLayer::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - IRuntimeContext *ctx{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + IRuntimeContext *ctx{nullptr}; + std::unique_ptr op{nullptr}; }; -NEActivationLayer::NEActivationLayer(IRuntimeContext *ctx) - : _impl(std::make_unique()) +NEActivationLayer::NEActivationLayer(IRuntimeContext *ctx) : _impl(std::make_unique()) { _impl->ctx = ctx; } -NEActivationLayer::NEActivationLayer(NEActivationLayer &&) = default; +NEActivationLayer::NEActivationLayer(NEActivationLayer &&) = default; NEActivationLayer &NEActivationLayer::operator=(NEActivationLayer &&) = default; NEActivationLayer::~NEActivationLayer() = default; @@ -56,7 +56,8 @@ void NEActivationLayer::configure(ITensor *input, ITensor *output, ActivationLay _impl->op->configure(_impl->src->info(), _impl->dst->info(), activation_info); } -Status NEActivationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status +NEActivationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info) { return cpu::CpuActivation::validate(input, output, act_info); } diff --git a/src/runtime/NEON/functions/NEAddMulAdd.cpp b/src/runtime/NEON/functions/NEAddMulAdd.cpp index cfeaefc4fd..a72364791c 100644 --- a/src/runtime/NEON/functions/NEAddMulAdd.cpp +++ b/src/runtime/NEON/functions/NEAddMulAdd.cpp @@ -25,6 +25,7 @@ #include "arm_compute/runtime/NEON/functions/NEAddMulAdd.h" #include "arm_compute/runtime/Tensor.h" + #include "src/common/utils/Log.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/cpu/operators/CpuAddMulAdd.h" @@ -33,45 +34,50 @@ namespace arm_compute { struct NEAddMulAdd::Impl { - std::unique_ptr op{ nullptr }; + std::unique_ptr op{nullptr}; WorkspaceData workspace_tensors{}; ITensorPack run_pack{}; MemoryGroup memory_group{}; }; -NEAddMulAdd::NEAddMulAdd(std::shared_ptr memory_manager) - : _impl(std::make_unique()) +NEAddMulAdd::NEAddMulAdd(std::shared_ptr memory_manager) : _impl(std::make_unique()) { _impl->memory_group = MemoryGroup(std::move(memory_manager)); } NEAddMulAdd::~NEAddMulAdd() = default; -void NEAddMulAdd::configure(ITensor *input1, ITensor *input2, ITensor *bn_mul, ITensor *bn_add, ITensor *add_output, - ITensor *final_output, const ConvertPolicy policy, const ActivationLayerInfo &act_info) +void NEAddMulAdd::configure(ITensor *input1, + ITensor *input2, + ITensor *bn_mul, + ITensor *bn_add, + ITensor *add_output, + ITensor *final_output, + const ConvertPolicy policy, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_LOG_PARAMS(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info); - _impl->op = std::make_unique(); - _impl->op->configure(input1->info(), input2->info(), bn_mul->info(), - bn_add->info(), add_output != nullptr ? add_output->info() : nullptr, final_output->info(), policy, act_info); + _impl->op = std::make_unique(); + _impl->op->configure(input1->info(), input2->info(), bn_mul->info(), bn_add->info(), + add_output != nullptr ? add_output->info() : nullptr, final_output->info(), policy, act_info); - _impl->run_pack = - { - { TensorType::ACL_SRC_0, input1 }, - { TensorType::ACL_SRC_1, input2 }, - { TensorType::ACL_SRC_2, bn_mul }, - { TensorType::ACL_SRC_3, bn_add }, - { TensorType::ACL_DST_0, add_output }, - { TensorType::ACL_DST_1, final_output }, + _impl->run_pack = { + {TensorType::ACL_SRC_0, input1}, {TensorType::ACL_SRC_1, input2}, {TensorType::ACL_SRC_2, bn_mul}, + {TensorType::ACL_SRC_3, bn_add}, {TensorType::ACL_DST_0, add_output}, {TensorType::ACL_DST_1, final_output}, }; _impl->workspace_tensors = manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack); } -Status NEAddMulAdd::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *bn_mul, - const ITensorInfo *bn_add, const ITensorInfo *add_output, const ITensorInfo *final_output, - ConvertPolicy policy, const ActivationLayerInfo &act_info) +Status NEAddMulAdd::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *bn_mul, + const ITensorInfo *bn_add, + const ITensorInfo *add_output, + const ITensorInfo *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { return cpu::CpuAddMulAdd::validate(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info); } diff --git a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp index 3ac127b02e..fbaf1a96e7 100644 --- a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp +++ b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp @@ -32,6 +32,7 @@ #include "arm_compute/runtime/NEON/functions/NECast.h" #include "arm_compute/runtime/NEON/functions/NEReductionOperation.h" #include "arm_compute/runtime/Tensor.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEReductionOperationKernel.h" @@ -48,8 +49,7 @@ struct NEArgMinMaxLayer::Impl NEArgMinMaxLayer::~NEArgMinMaxLayer() = default; -NEArgMinMaxLayer::NEArgMinMaxLayer(std::shared_ptr memory_manager) - : _impl(std::make_unique()) +NEArgMinMaxLayer::NEArgMinMaxLayer(std::shared_ptr memory_manager) : _impl(std::make_unique()) { _impl->memory_manager = std::move(memory_manager); } @@ -58,7 +58,8 @@ void NEArgMinMaxLayer::configure(ITensor *input, int axis, ITensor *output, cons { ARM_COMPUTE_LOG_PARAMS(input, axis, output, op); _impl->reduction_function = std::make_unique(); - if(output->info() && (output->info()->data_type() == DataType::S64 || output->info()->data_type() == DataType::U64)) + if (output->info() && + (output->info()->data_type() == DataType::S64 || output->info()->data_type() == DataType::U64)) { _impl->memory_group = MemoryGroup(std::move(_impl->memory_manager)); _impl->cast_function = std::make_unique(); @@ -74,9 +75,11 @@ void NEArgMinMaxLayer::configure(ITensor *input, int axis, ITensor *output, cons } } -Status NEArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op) +Status +NEArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Invalid operation"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, + "Invalid operation"); return NEReductionOperation::validate(input, output, axis, op, false); } @@ -84,7 +87,7 @@ void NEArgMinMaxLayer::run() { MemoryGroupResourceScope scope_mg(_impl->memory_group); _impl->reduction_function->run(); - if(_impl->tmp_reduction_result != nullptr) + if (_impl->tmp_reduction_result != nullptr) { _impl->cast_function->run(); } diff --git a/src/runtime/NEON/functions/NEArithmeticAddition.cpp b/src/runtime/NEON/functions/NEArithmeticAddition.cpp index a7581ca9f4..aff16ae9d1 100644 --- a/src/runtime/NEON/functions/NEArithmeticAddition.cpp +++ b/src/runtime/NEON/functions/NEArithmeticAddition.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h" #include "arm_compute/core/Validate.h" + #include "src/cpu/operators/CpuAdd.h" #include @@ -32,26 +33,33 @@ namespace arm_compute { struct NEArithmeticAddition::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEArithmeticAddition::NEArithmeticAddition() - : _impl(std::make_unique()) +NEArithmeticAddition::NEArithmeticAddition() : _impl(std::make_unique()) { } -NEArithmeticAddition::NEArithmeticAddition(NEArithmeticAddition &&) = default; +NEArithmeticAddition::NEArithmeticAddition(NEArithmeticAddition &&) = default; NEArithmeticAddition &NEArithmeticAddition::operator=(NEArithmeticAddition &&) = default; NEArithmeticAddition::~NEArithmeticAddition() = default; -Status NEArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +Status NEArithmeticAddition::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { return cpu::CpuAdd::validate(input1, input2, output, policy, act_info); } -void NEArithmeticAddition::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +void NEArithmeticAddition::configure(const ITensor *input1, + const ITensor *input2, + ITensor *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { _impl->src_0 = input1; _impl->src_1 = input2; diff --git a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp index 6fdd4267bf..097525c1a8 100644 --- a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp +++ b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h" #include "arm_compute/core/ITensor.h" + #include "src/cpu/operators/CpuSub.h" #include @@ -32,26 +33,33 @@ namespace arm_compute { struct NEArithmeticSubtraction::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEArithmeticSubtraction::NEArithmeticSubtraction() - : _impl(std::make_unique()) +NEArithmeticSubtraction::NEArithmeticSubtraction() : _impl(std::make_unique()) { } -NEArithmeticSubtraction::NEArithmeticSubtraction(NEArithmeticSubtraction &&) = default; +NEArithmeticSubtraction::NEArithmeticSubtraction(NEArithmeticSubtraction &&) = default; NEArithmeticSubtraction &NEArithmeticSubtraction::operator=(NEArithmeticSubtraction &&) = default; NEArithmeticSubtraction::~NEArithmeticSubtraction() = default; -Status NEArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +Status NEArithmeticSubtraction::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { return cpu::CpuSub::validate(input1, input2, output, policy, act_info); } -void NEArithmeticSubtraction::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info) +void NEArithmeticSubtraction::configure(const ITensor *input1, + const ITensor *input2, + ITensor *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { _impl->src_0 = input1; _impl->src_1 = input2; diff --git a/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp index db49f4c1a0..d491f0aafc 100644 --- a/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp +++ b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp @@ -29,6 +29,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h" @@ -36,12 +37,17 @@ namespace arm_compute { NEBatchNormalizationLayer::~NEBatchNormalizationLayer() = default; -NEBatchNormalizationLayer::NEBatchNormalizationLayer() - : _norm_kernel() +NEBatchNormalizationLayer::NEBatchNormalizationLayer() : _norm_kernel() { } -void NEBatchNormalizationLayer::configure(ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, +void NEBatchNormalizationLayer::configure(ITensor *input, + ITensor *output, + const ITensor *mean, + const ITensor *var, + const ITensor *beta, + const ITensor *gamma, + float epsilon, ActivationLayerInfo act_info) { ARM_COMPUTE_LOG_PARAMS(input, output, mean, var, beta, gamma, epsilon, act_info); @@ -50,10 +56,17 @@ void NEBatchNormalizationLayer::configure(ITensor *input, ITensor *output, const _norm_kernel->configure(input, output, mean, var, beta, gamma, epsilon, act_info); } -Status NEBatchNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *var, const ITensorInfo *beta, const ITensorInfo *gamma, - float epsilon, ActivationLayerInfo act_info) +Status NEBatchNormalizationLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *mean, + const ITensorInfo *var, + const ITensorInfo *beta, + const ITensorInfo *gamma, + float epsilon, + ActivationLayerInfo act_info) { - ARM_COMPUTE_RETURN_ON_ERROR(NEBatchNormalizationLayerKernel::validate(input, output, mean, var, beta, gamma, epsilon, act_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEBatchNormalizationLayerKernel::validate(input, output, mean, var, beta, gamma, epsilon, act_info)); return Status{}; } diff --git a/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp b/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp index e258028d05..5d711c5ddf 100644 --- a/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp +++ b/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h" @@ -41,19 +42,25 @@ void NEBatchToSpaceLayer::configure(const ITensor *input, const ITensor *block_s _kernel = std::move(k); } -void NEBatchToSpaceLayer::configure(const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output, const CropInfo &crop_info) +void NEBatchToSpaceLayer::configure( + const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output, const CropInfo &crop_info) { auto k = std::make_unique(); k->configure(input, block_shape_x, block_shape_y, output, crop_info); _kernel = std::move(k); } -Status NEBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output) +Status +NEBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output) { return NEBatchToSpaceLayerKernel::validate(input, block_shape, output); } -Status NEBatchToSpaceLayer::validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output, const CropInfo &crop_info) +Status NEBatchToSpaceLayer::validate(const ITensorInfo *input, + int32_t block_shape_x, + int32_t block_shape_y, + const ITensorInfo *output, + const CropInfo &crop_info) { return NEBatchToSpaceLayerKernel::validate(input, block_shape_x, block_shape_y, output, crop_info); } diff --git a/src/runtime/NEON/functions/NEBitwiseAnd.cpp b/src/runtime/NEON/functions/NEBitwiseAnd.cpp index 90eb72706e..89ce2087be 100644 --- a/src/runtime/NEON/functions/NEBitwiseAnd.cpp +++ b/src/runtime/NEON/functions/NEBitwiseAnd.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/NEON/functions/NEBitwiseAnd.h" -#include "src/core/NEON/kernels/NEBitwiseAndKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEBitwiseAndKernel.h" #include diff --git a/src/runtime/NEON/functions/NEBitwiseNot.cpp b/src/runtime/NEON/functions/NEBitwiseNot.cpp index 69e5288b88..eda59cd3e9 100644 --- a/src/runtime/NEON/functions/NEBitwiseNot.cpp +++ b/src/runtime/NEON/functions/NEBitwiseNot.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/NEON/functions/NEBitwiseNot.h" -#include "src/core/NEON/kernels/NEBitwiseNotKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEBitwiseNotKernel.h" #include diff --git a/src/runtime/NEON/functions/NEBitwiseOr.cpp b/src/runtime/NEON/functions/NEBitwiseOr.cpp index 0b19e919ee..3d6f30b0fe 100644 --- a/src/runtime/NEON/functions/NEBitwiseOr.cpp +++ b/src/runtime/NEON/functions/NEBitwiseOr.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/NEON/functions/NEBitwiseOr.h" -#include "src/core/NEON/kernels/NEBitwiseOrKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEBitwiseOrKernel.h" #include diff --git a/src/runtime/NEON/functions/NEBitwiseXor.cpp b/src/runtime/NEON/functions/NEBitwiseXor.cpp index cc9df9f1c4..f0cf3d3e5c 100644 --- a/src/runtime/NEON/functions/NEBitwiseXor.cpp +++ b/src/runtime/NEON/functions/NEBitwiseXor.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/NEON/functions/NEBitwiseXor.h" -#include "src/core/NEON/kernels/NEBitwiseXorKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEBitwiseXorKernel.h" #include diff --git a/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp b/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp index af00171be6..adf891e417 100644 --- a/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp +++ b/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp @@ -22,12 +22,16 @@ * SOFTWARE. */ #include "arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEBoundingBoxTransformKernel.h" namespace arm_compute { -void NEBoundingBoxTransform::configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info) +void NEBoundingBoxTransform::configure(const ITensor *boxes, + ITensor *pred_boxes, + const ITensor *deltas, + const BoundingBoxTransformInfo &info) { ARM_COMPUTE_LOG_PARAMS(boxes, pred_boxes, deltas, info); // Configure Bounding Box kernel @@ -36,7 +40,10 @@ void NEBoundingBoxTransform::configure(const ITensor *boxes, ITensor *pred_boxes _kernel = std::move(k); } -Status NEBoundingBoxTransform::validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info) +Status NEBoundingBoxTransform::validate(const ITensorInfo *boxes, + const ITensorInfo *pred_boxes, + const ITensorInfo *deltas, + const BoundingBoxTransformInfo &info) { return NEBoundingBoxTransformKernel::validate(boxes, pred_boxes, deltas, info); } diff --git a/src/runtime/NEON/functions/NECast.cpp b/src/runtime/NEON/functions/NECast.cpp index f93a6ea745..1fd172a730 100644 --- a/src/runtime/NEON/functions/NECast.cpp +++ b/src/runtime/NEON/functions/NECast.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NECast.h" #include "arm_compute/core/Validate.h" + #include "src/common/utils/Log.h" #include "src/cpu/operators/CpuCast.h" @@ -31,16 +32,15 @@ namespace arm_compute { struct NECast::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NECast::NECast() - : _impl(std::make_unique()) +NECast::NECast() : _impl(std::make_unique()) { } -NECast::NECast(NECast &&) = default; +NECast::NECast(NECast &&) = default; NECast &NECast::operator=(NECast &&) = default; NECast::~NECast() = default; @@ -62,7 +62,7 @@ Status NECast::validate(const ITensorInfo *input, const ITensorInfo *output, Con void NECast::run() { - ITensorPack pack = { { ACL_SRC, _impl->src }, { ACL_DST, _impl->dst } }; + ITensorPack pack = {{ACL_SRC, _impl->src}, {ACL_DST, _impl->dst}}; _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp b/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp index 8b96fadb74..86bee4dd43 100644 --- a/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp +++ b/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h" #include "arm_compute/core/Types.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEChannelShuffleLayerKernel.h" diff --git a/src/runtime/NEON/functions/NEConcatenateLayer.cpp b/src/runtime/NEON/functions/NEConcatenateLayer.cpp index ceb697aad6..59a0892f1f 100644 --- a/src/runtime/NEON/functions/NEConcatenateLayer.cpp +++ b/src/runtime/NEON/functions/NEConcatenateLayer.cpp @@ -23,33 +23,31 @@ */ #include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h" -#include "src/cpu/operators/CpuConcatenate.h" - -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" - #include "arm_compute/core/Error.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/core/helpers/AutoConfiguration.h" +#include "src/cpu/operators/CpuConcatenate.h" namespace arm_compute { struct NEConcatenateLayer::Impl { std::vector srcs{}; - ITensor *dst{ nullptr }; - unsigned int num_inputs{ 0 }; - unsigned int axis{ 0 }; - std::unique_ptr op{ nullptr }; + ITensor *dst{nullptr}; + unsigned int num_inputs{0}; + unsigned int axis{0}; + std::unique_ptr op{nullptr}; }; -NEConcatenateLayer::NEConcatenateLayer() - : _impl(std::make_unique()) +NEConcatenateLayer::NEConcatenateLayer() : _impl(std::make_unique()) { } -NEConcatenateLayer::NEConcatenateLayer(NEConcatenateLayer &&) = default; +NEConcatenateLayer::NEConcatenateLayer(NEConcatenateLayer &&) = default; NEConcatenateLayer &NEConcatenateLayer::operator=(NEConcatenateLayer &&) = default; NEConcatenateLayer::~NEConcatenateLayer() = default; @@ -64,7 +62,7 @@ void NEConcatenateLayer::configure(std::vector inputs_vector, I _impl->op = std::make_unique(); std::vector inputs_vector_info; - for(unsigned int i = 0; i < inputs_vector.size(); ++i) + for (unsigned int i = 0; i < inputs_vector.size(); ++i) { ARM_COMPUTE_ERROR_ON_NULLPTR(inputs_vector.at(i)); inputs_vector_info.emplace_back(inputs_vector.at(i)->info()); @@ -72,7 +70,9 @@ void NEConcatenateLayer::configure(std::vector inputs_vector, I _impl->op->configure(inputs_vector_info, _impl->dst->info(), axis); } -Status NEConcatenateLayer::validate(const std::vector &inputs_vector, const ITensorInfo *output, size_t axis) +Status NEConcatenateLayer::validate(const std::vector &inputs_vector, + const ITensorInfo *output, + size_t axis) { return cpu::CpuConcatenate::validate(inputs_vector, output, axis); } @@ -80,7 +80,7 @@ Status NEConcatenateLayer::validate(const std::vector &inpu void NEConcatenateLayer::run() { ITensorPack pack; - for(unsigned i = 0; i < _impl->num_inputs; ++i) + for (unsigned i = 0; i < _impl->num_inputs; ++i) { pack.add_tensor(TensorType::ACL_SRC_VEC + i, _impl->srcs.at(i)); } diff --git a/src/runtime/NEON/functions/NEConv3D.cpp b/src/runtime/NEON/functions/NEConv3D.cpp index 3bb66c44b0..8f41151d6c 100644 --- a/src/runtime/NEON/functions/NEConv3D.cpp +++ b/src/runtime/NEON/functions/NEConv3D.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" + #include "src/common/utils/Log.h" #include "src/cpu/operators/CpuDirectConv3d.h" @@ -35,35 +36,41 @@ using namespace arm_compute::experimental; struct NEConv3D::Impl { - std::unique_ptr op{ nullptr }; + std::unique_ptr op{nullptr}; ITensorPack run_pack{}; }; -NEConv3D::NEConv3D() - : _impl(std::make_unique()) +NEConv3D::NEConv3D() : _impl(std::make_unique()) { } NEConv3D::~NEConv3D() = default; -void NEConv3D::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv3dInfo &conv_info) +void NEConv3D::configure( + ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv3dInfo &conv_info) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuDirectConv3d::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info)); + ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuDirectConv3d::validate( + input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info)); ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info); auto f = std::make_unique(); - f->configure(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info); + f->configure(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), + conv_info); _impl->op = std::move(f); - if(_impl->op != nullptr) + if (_impl->op != nullptr) { - _impl->run_pack = { { ACL_SRC_0, input }, { ACL_SRC_1, weights }, { ACL_SRC_2, biases }, { ACL_DST, output } }; + _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}}; } } -Status NEConv3D::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const Conv3dInfo &conv_info) +Status NEConv3D::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const Conv3dInfo &conv_info) { ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuDirectConv3d::validate(input, weights, biases, output, conv_info)); @@ -72,7 +79,7 @@ Status NEConv3D::validate(const ITensorInfo *input, const ITensorInfo *weights, void NEConv3D::run() { - if(_impl->op != nullptr) + if (_impl->op != nullptr) { _impl->op->run(_impl->run_pack); } diff --git a/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp b/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp index 535ac99001..84e8565aaf 100644 --- a/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp +++ b/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp @@ -24,24 +24,26 @@ #include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h" #include "arm_compute/core/Validate.h" + #include "src/cpu/operators/CpuConvertFullyConnectedWeights.h" namespace arm_compute { struct NEConvertFullyConnectedWeights::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEConvertFullyConnectedWeights::NEConvertFullyConnectedWeights() - : _impl(std::make_unique()) +NEConvertFullyConnectedWeights::NEConvertFullyConnectedWeights() : _impl(std::make_unique()) { } NEConvertFullyConnectedWeights::~NEConvertFullyConnectedWeights() = default; -void NEConvertFullyConnectedWeights::configure(const ITensor *input, ITensor *output, const TensorShape &original_input_shape, - DataLayout data_layout) +void NEConvertFullyConnectedWeights::configure(const ITensor *input, + ITensor *output, + const TensorShape &original_input_shape, + DataLayout data_layout) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); @@ -51,8 +53,10 @@ void NEConvertFullyConnectedWeights::configure(const ITensor *input, ITensor *ou _impl->op->configure(_impl->src->info(), _impl->dst->info(), original_input_shape, data_layout); } -Status NEConvertFullyConnectedWeights::validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape, - DataLayout data_layout) +Status NEConvertFullyConnectedWeights::validate(const ITensorInfo *input, + const ITensorInfo *output, + const TensorShape &original_input_shape, + DataLayout data_layout) { return cpu::CpuConvertFullyConnectedWeights::validate(input, output, original_input_shape, data_layout); } @@ -64,4 +68,4 @@ void NEConvertFullyConnectedWeights::run() pack.add_tensor(TensorType::ACL_DST, _impl->dst); _impl->op->run(pack); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp index 89e0e498c9..37958fc2e9 100644 --- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h" + #include "src/common/utils/Log.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/cpu/operators/CpuConv2d.h" @@ -43,34 +44,44 @@ struct NEConvolutionLayer::Impl { MemoryGroup memory_group{}; std::shared_ptr memory_manager{}; - std::unique_ptr op{ nullptr }; + std::unique_ptr op{nullptr}; ITensorPack run_pack{}; ITensorPack prep_pack{}; WorkspaceData workspace{}; experimental::MemoryRequirements aux_mem_req{}; - std::unique_ptr func{ nullptr }; + std::unique_ptr func{nullptr}; }; -NEConvolutionLayer::NEConvolutionLayer(std::shared_ptr memory_manager) - : _impl(std::make_unique()) +NEConvolutionLayer::NEConvolutionLayer(std::shared_ptr memory_manager) : _impl(std::make_unique()) { _impl->memory_manager = std::move(memory_manager); } NEConvolutionLayer::~NEConvolutionLayer() = default; -void NEConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, - const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) +void NEConvolutionLayer::configure(ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_UNUSED(num_groups); - ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info, - enable_fast_math, num_groups)); - ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups); + ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayer::validate( + input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, + weights_info, dilation, act_info, enable_fast_math, num_groups)); + ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info, + enable_fast_math, num_groups); const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups); - switch(cpu::CpuConv2d::get_convolution_method(input->info(), weights->info(), output->info(), conv_info, weights_info, dilation, act_info, enable_fast_math)) + switch (cpu::CpuConv2d::get_convolution_method(input->info(), weights->info(), output->info(), conv_info, + weights_info, dilation, act_info, enable_fast_math)) { case ConvolutionMethod::WINOGRAD: case ConvolutionMethod::GEMM: @@ -78,7 +89,8 @@ void NEConvolutionLayer::configure(ITensor *input, const ITensor *weights, const case ConvolutionMethod::DIRECT: { auto f = std::make_unique(); - f->configure(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups); + f->configure(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), + output->info(), conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups); _impl->op = std::move(f); break; } @@ -94,33 +106,46 @@ void NEConvolutionLayer::configure(ITensor *input, const ITensor *weights, const break; } - if(_impl->op) + if (_impl->op) { _impl->memory_group = MemoryGroup(std::move(_impl->memory_manager)); _impl->aux_mem_req = _impl->op->workspace(); - _impl->run_pack = { { ACL_SRC_0, input }, { ACL_SRC_1, weights }, { ACL_SRC_2, biases }, { ACL_DST, output } }; - _impl->prep_pack = { { ACL_SRC_1, weights }, { ACL_SRC_2, biases } }; - _impl->workspace = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); + _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}}; + _impl->prep_pack = {{ACL_SRC_1, weights}, {ACL_SRC_2, biases}}; + _impl->workspace = + manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); } } -Status NEConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) +Status NEConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups); ARM_COMPUTE_RETURN_ERROR_ON_MSG(!weights->are_values_constant(), "Dynamic weights are not supported"); - switch(cpu::CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, enable_fast_math)) + switch (cpu::CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, + enable_fast_math)) { case ConvolutionMethod::WINOGRAD: case ConvolutionMethod::GEMM: case ConvolutionMethod::GEMM_CONV2D: case ConvolutionMethod::DIRECT: - ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups)); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuConv2d::validate(input, weights, biases, output, conv_info, + weights_info, dilation, act_info, enable_fast_math, + num_groups)); break; case ConvolutionMethod::FFT: - ARM_COMPUTE_RETURN_ON_ERROR(NEFFTConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEFFTConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info)); break; default: ARM_COMPUTE_ERROR("Not supported."); @@ -129,12 +154,17 @@ Status NEConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo return Status{}; } -ConvolutionMethod NEConvolutionLayer::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, - const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, - const ActivationLayerInfo &act_info, bool enable_fast_math) +ConvolutionMethod NEConvolutionLayer::get_convolution_method(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { - return cpu::CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, enable_fast_math); + return cpu::CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, + enable_fast_math); } void NEConvolutionLayer::run() @@ -143,7 +173,7 @@ void NEConvolutionLayer::run() MemoryGroupResourceScope scope_mg(_impl->memory_group); - if(_impl->func) + if (_impl->func) { _impl->func->run(); } @@ -155,7 +185,7 @@ void NEConvolutionLayer::run() void NEConvolutionLayer::prepare() { - if(_impl->func) + if (_impl->func) { _impl->func->prepare(); } diff --git a/src/runtime/NEON/functions/NECopy.cpp b/src/runtime/NEON/functions/NECopy.cpp index c2059e8e98..c975d3a5b5 100644 --- a/src/runtime/NEON/functions/NECopy.cpp +++ b/src/runtime/NEON/functions/NECopy.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NECopy.h" #include "arm_compute/core/Validate.h" + #include "src/cpu/operators/CpuCopy.h" #include @@ -32,16 +33,15 @@ namespace arm_compute { struct NECopy::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NECopy::NECopy() - : _impl(std::make_unique()) +NECopy::NECopy() : _impl(std::make_unique()) { } -NECopy::NECopy(NECopy &&) = default; +NECopy::NECopy(NECopy &&) = default; NECopy &NECopy::operator=(NECopy &&) = default; NECopy::~NECopy() = default; diff --git a/src/runtime/NEON/functions/NECropResize.cpp b/src/runtime/NEON/functions/NECropResize.cpp index cca8b400ee..a94b0882da 100644 --- a/src/runtime/NEON/functions/NECropResize.cpp +++ b/src/runtime/NEON/functions/NECropResize.cpp @@ -21,10 +21,11 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/runtime/NEON/NEScheduler.h" - #include "arm_compute/runtime/NEON/functions/NECropResize.h" + +#include "arm_compute/runtime/NEON/NEScheduler.h" #include "arm_compute/runtime/Tensor.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NECropKernel.h" @@ -35,18 +36,32 @@ namespace arm_compute NECropResize::~NECropResize() = default; NECropResize::NECropResize() - : _output(nullptr), _num_boxes(0), _method(), _extrapolation_value(0), _crop(), _scale(), _crop_results(), _scaled_results() + : _output(nullptr), + _num_boxes(0), + _method(), + _extrapolation_value(0), + _crop(), + _scale(), + _crop_results(), + _scaled_results() { } -Status NECropResize::validate(const ITensorInfo *input, const ITensorInfo *boxes, const ITensorInfo *box_ind, const ITensorInfo *output, - Coordinates2D crop_size, InterpolationPolicy method, float extrapolation_value) +Status NECropResize::validate(const ITensorInfo *input, + const ITensorInfo *boxes, + const ITensorInfo *box_ind, + const ITensorInfo *output, + Coordinates2D crop_size, + InterpolationPolicy method, + float extrapolation_value) { ARM_COMPUTE_RETURN_ERROR_ON(crop_size.x <= 0 || crop_size.y <= 0); ARM_COMPUTE_RETURN_ERROR_ON(method == InterpolationPolicy::AREA); TensorInfo temp_info; - ARM_COMPUTE_RETURN_ON_ERROR(NECropKernel::validate(input->clone().get(), boxes->clone().get(), box_ind->clone().get(), &temp_info, boxes->tensor_shape()[1] - 1, extrapolation_value)); - if(output->total_size() > 0) + ARM_COMPUTE_RETURN_ON_ERROR(NECropKernel::validate(input->clone().get(), boxes->clone().get(), + box_ind->clone().get(), &temp_info, boxes->tensor_shape()[1] - 1, + extrapolation_value)); + if (output->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); @@ -56,11 +71,17 @@ Status NECropResize::validate(const ITensorInfo *input, const ITensorInfo *boxes return Status{}; } -void NECropResize::configure(const ITensor *input, const ITensor *boxes, const ITensor *box_ind, ITensor *output, Coordinates2D crop_size, - InterpolationPolicy method, float extrapolation_value) +void NECropResize::configure(const ITensor *input, + const ITensor *boxes, + const ITensor *box_ind, + ITensor *output, + Coordinates2D crop_size, + InterpolationPolicy method, + float extrapolation_value) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(NECropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(), crop_size, method, extrapolation_value)); + ARM_COMPUTE_ERROR_THROW_ON(NECropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(), + crop_size, method, extrapolation_value)); ARM_COMPUTE_LOG_PARAMS(input, boxes, box_ind, output, crop_size, method, extrapolation_value); _num_boxes = boxes->info()->tensor_shape()[1]; @@ -81,7 +102,7 @@ void NECropResize::configure(const ITensor *input, const ITensor *boxes, const I _scaled_results.reserve(_num_boxes); _scale.reserve(_num_boxes); - for(unsigned int i = 0; i < _num_boxes; ++i) + for (unsigned int i = 0; i < _num_boxes; ++i) { auto crop_tensor = std::make_unique(); TensorInfo crop_result_info(1, DataType::F32); @@ -108,7 +129,7 @@ void NECropResize::run() { ARM_COMPUTE_ERROR_ON_MSG(_output == nullptr, "Unconfigured function"); - for(unsigned int i = 0; i < _num_boxes; ++i) + for (unsigned int i = 0; i < _num_boxes; ++i) { // Size of the crop box in _boxes and thus the shape of _crop_results[i] // may not be known until run-time and so the kernels cannot be configured until then. @@ -117,12 +138,15 @@ void NECropResize::run() NEScheduler::get().schedule(_crop[i].get(), Window::DimZ); // Scale the cropped image. - _scale[i]->configure(_crop_results[i].get(), _scaled_results[i].get(), ScaleKernelInfo{ _method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), SamplingPolicy::TOP_LEFT, false }); + _scale[i]->configure(_crop_results[i].get(), _scaled_results[i].get(), + ScaleKernelInfo{_method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), + SamplingPolicy::TOP_LEFT, false}); _scaled_results[i]->allocator()->allocate(); _scale[i]->run(); // Copy scaled image into output. - std::copy_n(_scaled_results[i]->buffer(), _scaled_results[i]->info()->total_size(), _output->ptr_to_element(Coordinates(0, 0, 0, i))); + std::copy_n(_scaled_results[i]->buffer(), _scaled_results[i]->info()->total_size(), + _output->ptr_to_element(Coordinates(0, 0, 0, i))); } } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp index 439aff0840..3987370d9e 100644 --- a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp @@ -25,9 +25,10 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" @@ -61,7 +62,8 @@ PadStrideInfo compute_upsample_info(const PadStrideInfo &info, uint32_t deconv_p deconv_pad_top += deconv_pad_y / 2; deconv_pad_bottom += deconv_pad_y / 2; - return PadStrideInfo(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top, deconv_pad_bottom, DimensionRoundingType::FLOOR); + return PadStrideInfo(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top, deconv_pad_bottom, + DimensionRoundingType::FLOOR); } } // namespace @@ -82,17 +84,24 @@ NEDeconvolutionLayer::NEDeconvolutionLayer(std::shared_ptr memor { } -Status NEDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &info, - bool enable_fast_math, const WeightsInfo &weights_info) +Status NEDeconvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *output, + const PadStrideInfo &info, + bool enable_fast_math, + const WeightsInfo &weights_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - const unsigned int width_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH); - const unsigned int height_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8, + DataType::QASYMM8_SIGNED); + const unsigned int width_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH); + const unsigned int height_idx = + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT); ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) < 1); ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(height_idx) < 1); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(weights, input); - if(is_data_type_quantized_per_channel(weights->data_type()) && is_data_type_quantized(input->data_type())) + if (is_data_type_quantized_per_channel(weights->data_type()) && is_data_type_quantized(input->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL); } @@ -101,11 +110,13 @@ Status NEDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInf ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); } - auto out_dims = deconvolution_output_dimensions(input->dimension(width_idx), input->dimension(height_idx), weights->dimension(width_idx), weights->dimension(height_idx), info); + auto out_dims = + deconvolution_output_dimensions(input->dimension(width_idx), input->dimension(height_idx), + weights->dimension(width_idx), weights->dimension(height_idx), info); - if(bias != nullptr) + if (bias != nullptr) { - if(is_data_type_quantized_asymmetric(input->data_type())) + if (is_data_type_quantized_asymmetric(input->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); } @@ -115,15 +126,18 @@ Status NEDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInf } } - if(output->tensor_shape().total_size() > 0) + if (output->tensor_shape().total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input, *weights); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) != output_shape.x(), "Output's width is invalid."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) != output_shape.y(), "Output's height is invalid."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) != output_shape.z(), "Output's depth is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) != output_shape.x(), + "Output's width is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) != output_shape.y(), + "Output's height is invalid."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) != output_shape.z(), + "Output's depth is invalid."); } uint32_t deconv_pad_x = 0; @@ -141,44 +155,61 @@ Status NEDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInf ARM_COMPUTE_RETURN_ERROR_ON((out_x - weights->dimension(idx_w) + 1) > out_dims.first); ARM_COMPUTE_RETURN_ERROR_ON((out_y - weights->dimension(idx_h) + 1) > out_dims.second); - const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y); - TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape)); + const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, + out_dims, deconv_pad_x, deconv_pad_y); + TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape)); const PadStrideInfo upsample_info = compute_upsample_info(info, deconv_pad_x, deconv_pad_y); // Do not perform upsampling when the operation uses unit stride in all dimensions const bool do_upsampling = stride_x != 1 || stride_y != 1; - const unsigned int batches_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES); - const unsigned int channel_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL); + const unsigned int batches_idx = + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES); + const unsigned int channel_idx = + get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(batches_idx) != scale_out_info.dimension(batches_idx)); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(channel_idx) != scale_out_info.dimension(channel_idx)); - if(do_upsampling) + if (do_upsampling) { const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); - ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, weights_info, Size2D(1U, 1U), ActivationLayerInfo(), enable_fast_math)); + ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, + weights_info, Size2D(1U, 1U), ActivationLayerInfo(), + enable_fast_math)); } else { - const PadStrideInfo conv_info(1, 1, upsample_info.pad_left(), upsample_info.pad_right(), upsample_info.pad_top(), upsample_info.pad_bottom(), DimensionRoundingType::CEIL); - ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(input, weights, bias, output, conv_info, weights_info, Size2D(1U, 1U), ActivationLayerInfo(), enable_fast_math)); + const PadStrideInfo conv_info(1, 1, upsample_info.pad_left(), upsample_info.pad_right(), + upsample_info.pad_top(), upsample_info.pad_bottom(), DimensionRoundingType::CEIL); + ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(input, weights, bias, output, conv_info, weights_info, + Size2D(1U, 1U), ActivationLayerInfo(), + enable_fast_math)); } return Status{}; } -void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &info, bool enable_fast_math, const WeightsInfo &weights_info) +void NEDeconvolutionLayer::configure(ITensor *input, + const ITensor *weights, + const ITensor *bias, + ITensor *output, + const PadStrideInfo &info, + bool enable_fast_math, + const WeightsInfo &weights_info) { // Perform validation step ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(NEDeconvolutionLayer::validate(input->info(), weights->info(), (bias == nullptr) ? nullptr : bias->info(), output->info(), info, enable_fast_math, weights_info)); + ARM_COMPUTE_ERROR_THROW_ON(NEDeconvolutionLayer::validate(input->info(), weights->info(), + (bias == nullptr) ? nullptr : bias->info(), + output->info(), info, enable_fast_math, weights_info)); ARM_COMPUTE_LOG_PARAMS(input, weights, bias, output, info, enable_fast_math, weights_info); const DataLayout data_layout = input->info()->data_layout(); const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - auto out_dims = deconvolution_output_dimensions(input->info()->dimension(width_idx), input->info()->dimension(height_idx), - weights->info()->dimension(width_idx), weights->info()->dimension(height_idx), info); + auto out_dims = deconvolution_output_dimensions( + input->info()->dimension(width_idx), input->info()->dimension(height_idx), + weights->info()->dimension(width_idx), weights->info()->dimension(height_idx), info); const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input->info(), *weights->info()); @@ -191,7 +222,8 @@ void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, con const unsigned int stride_y = info.stride().second; // Output auto initialization if not yet initialized - auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->quantization_info()); + auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), + input->info()->quantization_info()); _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32)); @@ -199,12 +231,11 @@ void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, con _flip_weights.configure(weights, &_weights_flipped, &_flip_axis); // setup the function to convolve the upscaled output - uint32_t deconv_pad_x = 0; - uint32_t deconv_pad_y = 0; - const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input->info(), *weights->info(), - stride_x, stride_y, - out_dims, deconv_pad_x, deconv_pad_y); - const PadStrideInfo upsample_info = compute_upsample_info(info, deconv_pad_x, deconv_pad_y); + uint32_t deconv_pad_x = 0; + uint32_t deconv_pad_y = 0; + const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape( + *input->info(), *weights->info(), stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y); + const PadStrideInfo upsample_info = compute_upsample_info(info, deconv_pad_x, deconv_pad_y); // Do not perform upsampling when the operation uses unit stride in all dimensions _do_upsampling = stride_x != 1 || stride_y != 1; @@ -216,12 +247,12 @@ void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, con axis_data[1] = static_cast(height_idx); // Setup convolution and upsampling, if needed - if(_do_upsampling) + if (_do_upsampling) { _memory_group.manage(&_scaled_output); const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL); - TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info()); + TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info()); scale_out_info.set_data_layout(data_layout); _scaled_output.allocator()->init(scale_out_info); @@ -229,14 +260,17 @@ void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, con // The padding amount can be given as input to the convolution layer. _upsample_f.configure(input, &_scaled_output, upsample_info); - _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info, weights_info, Size2D(1U, 1U), ActivationLayerInfo(), enable_fast_math); + _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info, weights_info, Size2D(1U, 1U), + ActivationLayerInfo(), enable_fast_math); _scaled_output.allocator()->allocate(); } else { - const PadStrideInfo conv_info(1, 1, upsample_info.pad_left(), upsample_info.pad_right(), upsample_info.pad_top(), upsample_info.pad_bottom(), DimensionRoundingType::CEIL); - _conv_f.configure(input, &_weights_flipped, bias, output, conv_info, weights_info, Size2D(1U, 1U), ActivationLayerInfo(), enable_fast_math); + const PadStrideInfo conv_info(1, 1, upsample_info.pad_left(), upsample_info.pad_right(), + upsample_info.pad_top(), upsample_info.pad_bottom(), DimensionRoundingType::CEIL); + _conv_f.configure(input, &_weights_flipped, bias, output, conv_info, weights_info, Size2D(1U, 1U), + ActivationLayerInfo(), enable_fast_math); } } @@ -246,7 +280,7 @@ void NEDeconvolutionLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); - if(_do_upsampling) + if (_do_upsampling) { _upsample_f.run(); } @@ -255,7 +289,7 @@ void NEDeconvolutionLayer::run() void NEDeconvolutionLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { ARM_COMPUTE_ERROR_ON(!_original_weights->is_used()); diff --git a/src/runtime/NEON/functions/NEDepthConvertLayer.cpp b/src/runtime/NEON/functions/NEDepthConvertLayer.cpp index 1ec32074a5..766635dfa1 100644 --- a/src/runtime/NEON/functions/NEDepthConvertLayer.cpp +++ b/src/runtime/NEON/functions/NEDepthConvertLayer.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h" #include "arm_compute/core/Validate.h" + #include "src/cpu/operators/CpuCast.h" #include @@ -32,16 +33,15 @@ namespace arm_compute { struct NEDepthConvertLayer::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEDepthConvertLayer::NEDepthConvertLayer() - : _impl(std::make_unique()) +NEDepthConvertLayer::NEDepthConvertLayer() : _impl(std::make_unique()) { } -NEDepthConvertLayer::NEDepthConvertLayer(NEDepthConvertLayer &&) = default; +NEDepthConvertLayer::NEDepthConvertLayer(NEDepthConvertLayer &&) = default; NEDepthConvertLayer &NEDepthConvertLayer::operator=(NEDepthConvertLayer &&) = default; NEDepthConvertLayer::~NEDepthConvertLayer() = default; @@ -59,7 +59,8 @@ void NEDepthConvertLayer::configure(const ITensor *input, ITensor *output, Conve _impl->op->configure(_impl->src->info(), _impl->dst->info(), policy); } -Status NEDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift) +Status +NEDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift) { ARM_COMPUTE_RETURN_ERROR_ON(shift != 0); return cpu::CpuCast::validate(input, output, policy); @@ -67,7 +68,7 @@ Status NEDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo void NEDepthConvertLayer::run() { - ITensorPack pack = { { ACL_SRC, _impl->src }, { ACL_DST, _impl->dst } }; + ITensorPack pack = {{ACL_SRC, _impl->src}, {ACL_DST, _impl->dst}}; _impl->op->run(pack); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp b/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp index f4a8a17e05..47564059ec 100644 --- a/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp +++ b/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h" diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp index 4dabef3bd7..6c085645db 100644 --- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/cpu/operators/CpuDepthwiseConv2d.h" @@ -39,38 +40,35 @@ NEDepthwiseConvolutionLayer::~NEDepthwiseConvolutionLayer() = default; struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::Impl { - ITensor *src{ nullptr }; // SRC_0 - ITensor *dst{ nullptr }; // DST_0 - const ITensor *weights - { - nullptr - }; // SRC_1 - const ITensor *biases - { - nullptr - }; // SRC_2 + ITensor *src{nullptr}; // SRC_0 + ITensor *dst{nullptr}; // DST_0 + const ITensor *weights{nullptr}; // SRC_1 + const ITensor *biases{nullptr}; // SRC_2 Tensor permuted_input{}; // INT_0 Tensor permuted_weights{}; // INT_1 Tensor permuted_output{}; // INT_2 Tensor workspace{}; // INT_3 Tensor packed_weights{}; // INT_4 - std::shared_ptr op{ nullptr }; - bool is_prepared{ false }; - bool permute{ false }; + std::shared_ptr op{nullptr}; + bool is_prepared{false}; + bool permute{false}; }; -NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::NEDepthwiseConvolutionLayerOptimizedInternal(std::shared_ptr memory_manager) +NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::NEDepthwiseConvolutionLayerOptimizedInternal( + std::shared_ptr memory_manager) : _memory_group(memory_manager), _impl(std::make_unique()) { } -void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::configure(ITensor *input, - const ITensor *weights, - const ITensor *biases, - ITensor *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, - const ActivationLayerInfo &act_info, - const Size2D &dilation) +void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::configure( + ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier, + const ActivationLayerInfo &act_info, + const Size2D &dilation) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); @@ -82,9 +80,9 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal:: _impl->permute = is_nhwc; _impl->op = std::make_unique(); - ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation }; - _impl->op->configure(_impl->src->info(), _impl->weights->info(), _impl->biases == nullptr ? nullptr : _impl->biases->info(), - _impl->dst->info(), info); + ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation}; + _impl->op->configure(_impl->src->info(), _impl->weights->info(), + _impl->biases == nullptr ? nullptr : _impl->biases->info(), _impl->dst->info(), info); // Configure pipeline ActivationLayerInfo act_info_to_use = ActivationLayerInfo(); @@ -92,15 +90,15 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal:: const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(act_info); bool is_activationlayer_enabled = act_info.enabled() && !(is_relu || is_relu6); - if(!is_activationlayer_enabled) + if (!is_activationlayer_enabled) { act_info_to_use = act_info; } - info = ConvolutionInfo{ conv_info, depth_multiplier, act_info_to_use, dilation }; + info = ConvolutionInfo{conv_info, depth_multiplier, act_info_to_use, dilation}; auto dwc_optimized_func = std::make_unique(); - if(is_nhwc) + if (is_nhwc) { auto permute_input = std::make_unique(); auto permute_weights = std::make_unique(); @@ -122,7 +120,9 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal:: _impl->permuted_output.info()->set_quantization_info(output->info()->quantization_info()); // Configure optimized depthwise - dwc_optimized_func->configure(_impl->permuted_input.info(), _impl->permuted_weights.info(), biases == nullptr ? nullptr : biases->info(), _impl->permuted_output.info(), info); + dwc_optimized_func->configure(_impl->permuted_input.info(), _impl->permuted_weights.info(), + biases == nullptr ? nullptr : biases->info(), _impl->permuted_output.info(), + info); // Configure the function to transform the convoluted output to ACL's native ordering format NCHW _impl->permuted_output.info()->set_data_layout(DataLayout::NHWC); @@ -133,29 +133,33 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal:: } else { - dwc_optimized_func->configure(_impl->src->info(), _impl->weights->info(), biases == nullptr ? nullptr : biases->info(), _impl->dst->info(), info); + dwc_optimized_func->configure(_impl->src->info(), _impl->weights->info(), + biases == nullptr ? nullptr : biases->info(), _impl->dst->info(), info); } // Allocate memory based on the internal memory requirements experimental::MemoryRequirements mem_req = dwc_optimized_func->workspace(); - _impl->workspace.allocator()->init(TensorInfo(TensorShape{ mem_req[0].size + mem_req[0].alignment }, 1, DataType::S8), mem_req[0].alignment); - _impl->packed_weights.allocator()->init(TensorInfo(TensorShape{ mem_req[1].size + mem_req[1].alignment }, 1, DataType::S8), mem_req[1].alignment); + _impl->workspace.allocator()->init(TensorInfo(TensorShape{mem_req[0].size + mem_req[0].alignment}, 1, DataType::S8), + mem_req[0].alignment); + _impl->packed_weights.allocator()->init( + TensorInfo(TensorShape{mem_req[1].size + mem_req[1].alignment}, 1, DataType::S8), mem_req[1].alignment); _memory_group.manage(&_impl->workspace); _memory_group.manage(&_impl->packed_weights); _impl->workspace.allocator()->allocate(); _impl->packed_weights.allocator()->allocate(); } -Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::validate(const ITensorInfo *input, - const ITensorInfo *weights, - const ITensorInfo *biases, - const ITensorInfo *output, - const PadStrideInfo &conv_info, - unsigned int depth_multiplier, - const ActivationLayerInfo &act_info, - const Size2D &dilation) +Status +NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier, + const ActivationLayerInfo &act_info, + const Size2D &dilation) { - ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation }; + ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation}; return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info); } @@ -180,15 +184,15 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal:: void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::prepare() { - if(!_impl->is_prepared) + if (!_impl->is_prepared) { // Permute weights - if(_impl->permute) + if (_impl->permute) { _impl->permuted_weights.allocator()->allocate(); } - if(!_impl->permuted_weights.is_used()) + if (!_impl->permuted_weights.is_used()) { _impl->permuted_weights.allocator()->free(); } @@ -202,14 +206,14 @@ struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::Impl Tensor permuted_input{}; Tensor permuted_weights{}; Tensor permuted_output{}; - bool is_prepared{ false }; - bool is_nchw{ false }; - bool is_activationlayer_enabled{ false }; - const ITensor *weights{ nullptr }; - const ITensor *biases{ nullptr }; - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::shared_ptr op{ nullptr }; + bool is_prepared{false}; + bool is_nchw{false}; + bool is_activationlayer_enabled{false}; + const ITensor *weights{nullptr}; + const ITensor *biases{nullptr}; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::shared_ptr op{nullptr}; }; NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::NEDepthwiseConvolutionLayerGeneric() @@ -217,14 +221,21 @@ NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::NEDepthwiseConv { } -void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation) +void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure(ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier, + const ActivationLayerInfo &act_info, + const Size2D &dilation) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - const ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation }; + const ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation}; _impl->op = std::make_unique(); - _impl->op->configure(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), info); + _impl->op->configure(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), + info); _impl->src = input; _impl->dst = output; @@ -236,7 +247,7 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure( ITensor *input_to_use = input; const ITensor *weights_to_use = weights; ITensor *output_to_use = output; - if(_impl->is_nchw) + if (_impl->is_nchw) { auto permute_input = std::make_unique(); auto permute_weights = std::make_unique(); @@ -249,14 +260,16 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure( _impl->permuted_weights.info()->set_data_layout(DataLayout::NHWC); weights_to_use = &_impl->permuted_weights; - _impl->permuted_output.allocator()->init(output->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape())); + _impl->permuted_output.allocator()->init( + output->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape())); output_to_use = &_impl->permuted_output; } auto depthwise_conv_kernel = std::make_unique(); - depthwise_conv_kernel->configure(input_to_use->info(), weights_to_use->info(), biases == nullptr ? nullptr : biases->info(), output_to_use->info(), info); + depthwise_conv_kernel->configure(input_to_use->info(), weights_to_use->info(), + biases == nullptr ? nullptr : biases->info(), output_to_use->info(), info); - if(_impl->is_nchw) + if (_impl->is_nchw) { auto permute_output = std::make_unique(); permute_output->configure(_impl->permuted_output.info(), output->info(), PermutationVector(1U, 2U, 0U)); @@ -268,11 +281,16 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure( } } -Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, +Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation) + unsigned int depth_multiplier, + const ActivationLayerInfo &act_info, + const Size2D &dilation) { - ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation }; + ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation}; return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info); } @@ -298,49 +316,64 @@ NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer(std::shared_ptr op{ nullptr }; + std::shared_ptr op{nullptr}; }; #endif // DOXYGEN_SKIP_THIS -void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, - const ActivationLayerInfo &act_info, const Size2D &dilation) +void NEDepthwiseConvolutionLayer::configure(ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier, + const ActivationLayerInfo &act_info, + const Size2D &dilation) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_LOG_PARAMS(input, weights, output, conv_info, depth_multiplier, biases, act_info, dilation); - ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionLayer::validate(input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(), - output->info(), conv_info, depth_multiplier, act_info, dilation)); + ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionLayer::validate( + input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(), output->info(), conv_info, + depth_multiplier, act_info, dilation)); - const ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation }; + const ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation}; _impl->op = std::make_shared(); - _impl->depth_conv_func = _impl->op->get_depthwiseconvolution_function(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), - info); - switch(_impl->depth_conv_func) + _impl->depth_conv_func = _impl->op->get_depthwiseconvolution_function( + input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), info); + switch (_impl->depth_conv_func) { case DepthwiseConvolutionFunction::OPTIMIZED: - _impl->func_optimized.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); + _impl->func_optimized.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, + dilation); break; case DepthwiseConvolutionFunction::GENERIC: - _impl->func_generic.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation); + _impl->func_generic.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, + dilation); break; default: ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction"); } } -Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation) +Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier, + const ActivationLayerInfo &act_info, + const Size2D &dilation) { - ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation }; + ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation}; return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info); } void NEDepthwiseConvolutionLayer::run() { - switch(_impl->depth_conv_func) + switch (_impl->depth_conv_func) { case DepthwiseConvolutionFunction::OPTIMIZED: _impl->func_optimized.run(); @@ -355,7 +388,7 @@ void NEDepthwiseConvolutionLayer::run() void NEDepthwiseConvolutionLayer::prepare() { - switch(_impl->depth_conv_func) + switch (_impl->depth_conv_func) { case DepthwiseConvolutionFunction::OPTIMIZED: _impl->func_optimized.prepare(); diff --git a/src/runtime/NEON/functions/NEDequantizationLayer.cpp b/src/runtime/NEON/functions/NEDequantizationLayer.cpp index 83e0131c83..28d19d2950 100644 --- a/src/runtime/NEON/functions/NEDequantizationLayer.cpp +++ b/src/runtime/NEON/functions/NEDequantizationLayer.cpp @@ -26,19 +26,19 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/Tensor.h" + #include "src/cpu/operators/CpuDequantize.h" namespace arm_compute { struct NEDequantizationLayer::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEDequantizationLayer::NEDequantizationLayer() - : _impl(std::make_unique()) +NEDequantizationLayer::NEDequantizationLayer() : _impl(std::make_unique()) { } NEDequantizationLayer::~NEDequantizationLayer() = default; diff --git a/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp b/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp index 1da8b012b3..b347390162 100644 --- a/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp +++ b/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Validate.h" + #include "src/common/utils/Log.h" #include @@ -35,24 +36,36 @@ namespace arm_compute { NEDetectionPostProcessLayer::NEDetectionPostProcessLayer(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _dequantize(), _detection_post_process(), _decoded_scores(), _run_dequantize(false) + : _memory_group(std::move(memory_manager)), + _dequantize(), + _detection_post_process(), + _decoded_scores(), + _run_dequantize(false) { } -void NEDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, const ITensor *input_scores, const ITensor *input_anchors, - ITensor *output_boxes, ITensor *output_classes, ITensor *output_scores, ITensor *num_detection, DetectionPostProcessLayerInfo info) +void NEDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, + const ITensor *input_scores, + const ITensor *input_anchors, + ITensor *output_boxes, + ITensor *output_classes, + ITensor *output_scores, + ITensor *num_detection, + DetectionPostProcessLayerInfo info) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores); - ARM_COMPUTE_ERROR_THROW_ON(NEDetectionPostProcessLayer::validate(input_box_encoding->info(), input_scores->info(), input_anchors->info(), output_boxes->info(), output_classes->info(), - output_scores->info(), - num_detection->info(), info)); - ARM_COMPUTE_LOG_PARAMS(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores, num_detection, info); + ARM_COMPUTE_ERROR_ON_NULLPTR(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, + output_scores); + ARM_COMPUTE_ERROR_THROW_ON(NEDetectionPostProcessLayer::validate( + input_box_encoding->info(), input_scores->info(), input_anchors->info(), output_boxes->info(), + output_classes->info(), output_scores->info(), num_detection->info(), info)); + ARM_COMPUTE_LOG_PARAMS(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores, + num_detection, info); const ITensor *input_scores_to_use = input_scores; DetectionPostProcessLayerInfo info_to_use = info; _run_dequantize = is_data_type_quantized(input_box_encoding->info()->data_type()); - if(_run_dequantize) + if (_run_dequantize) { _memory_group.manage(&_decoded_scores); @@ -61,26 +74,37 @@ void NEDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, c input_scores_to_use = &_decoded_scores; // Create a new info struct to avoid dequantizing in the CPP layer - std::array scales_values{ info.scale_value_y(), info.scale_value_x(), info.scale_value_h(), info.scale_value_w() }; - DetectionPostProcessLayerInfo info_quantized(info.max_detections(), info.max_classes_per_detection(), info.nms_score_threshold(), info.iou_threshold(), info.num_classes(), - scales_values, info.use_regular_nms(), info.detection_per_class(), false); + std::array scales_values{info.scale_value_y(), info.scale_value_x(), info.scale_value_h(), + info.scale_value_w()}; + DetectionPostProcessLayerInfo info_quantized( + info.max_detections(), info.max_classes_per_detection(), info.nms_score_threshold(), info.iou_threshold(), + info.num_classes(), scales_values, info.use_regular_nms(), info.detection_per_class(), false); info_to_use = info_quantized; } - _detection_post_process.configure(input_box_encoding, input_scores_to_use, input_anchors, output_boxes, output_classes, output_scores, num_detection, info_to_use); + _detection_post_process.configure(input_box_encoding, input_scores_to_use, input_anchors, output_boxes, + output_classes, output_scores, num_detection, info_to_use); _decoded_scores.allocator()->allocate(); } -Status NEDetectionPostProcessLayer::validate(const ITensorInfo *input_box_encoding, const ITensorInfo *input_scores, const ITensorInfo *input_anchors, - ITensorInfo *output_boxes, ITensorInfo *output_classes, ITensorInfo *output_scores, ITensorInfo *num_detection, DetectionPostProcessLayerInfo info) +Status NEDetectionPostProcessLayer::validate(const ITensorInfo *input_box_encoding, + const ITensorInfo *input_scores, + const ITensorInfo *input_anchors, + ITensorInfo *output_boxes, + ITensorInfo *output_classes, + ITensorInfo *output_scores, + ITensorInfo *num_detection, + DetectionPostProcessLayerInfo info) { bool run_dequantize = is_data_type_quantized(input_box_encoding->data_type()); - if(run_dequantize) + if (run_dequantize) { TensorInfo decoded_classes_info = input_scores->clone()->set_is_resizable(true).set_data_type(DataType::F32); ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayer::validate(input_scores, &decoded_classes_info)); } - ARM_COMPUTE_RETURN_ON_ERROR(CPPDetectionPostProcessLayer::validate(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores, num_detection, info)); + ARM_COMPUTE_RETURN_ON_ERROR(CPPDetectionPostProcessLayer::validate(input_box_encoding, input_scores, input_anchors, + output_boxes, output_classes, output_scores, + num_detection, info)); return Status{}; } @@ -90,7 +114,7 @@ void NEDetectionPostProcessLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); // Decode scores if necessary - if(_run_dequantize) + if (_run_dequantize) { _dequantize.run(); } diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp index ef3d3d6055..f1c2cf969f 100644 --- a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp @@ -27,17 +27,18 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/cpu/operators/CpuDirectConv2d.h" namespace arm_compute { struct NEDirectConvolutionLayer::Impl { - ITensor *src{ nullptr }; - const ITensor *weights{ nullptr }; - const ITensor *bias{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + ITensor *src{nullptr}; + const ITensor *weights{nullptr}; + const ITensor *bias{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; NEDirectConvolutionLayer::NEDirectConvolutionLayer(std::shared_ptr memory_manager) @@ -46,17 +47,27 @@ NEDirectConvolutionLayer::NEDirectConvolutionLayer(std::shared_ptrsrc = input; _impl->weights = weights; _impl->bias = bias; _impl->dst = output; _impl->op = std::make_unique(_memory_manager); - _impl->op->configure(input->info(), weights->info(), (bias != nullptr ? bias->info() : nullptr), output->info(), conv_info, act_info); + _impl->op->configure(input->info(), weights->info(), (bias != nullptr ? bias->info() : nullptr), output->info(), + conv_info, act_info); } -Status NEDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &conv_info, +Status NEDirectConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *output, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) { return cpu::CpuDirectConv2d::validate(input, weights, bias, output, conv_info, act_info); diff --git a/src/runtime/NEON/functions/NEElementwiseOperations.cpp b/src/runtime/NEON/functions/NEElementwiseOperations.cpp index c958adf97c..685ef2d4d7 100644 --- a/src/runtime/NEON/functions/NEElementwiseOperations.cpp +++ b/src/runtime/NEON/functions/NEElementwiseOperations.cpp @@ -22,10 +22,11 @@ * SOFTWARE. */ #include "arm_compute/runtime/NEON/functions/NEElementwiseOperations.h" -#include "arm_compute/core/Validate.h" -#include "src/cpu/operators/CpuElementwise.h" #include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Validate.h" + +#include "src/cpu/operators/CpuElementwise.h" #include @@ -33,17 +34,16 @@ namespace arm_compute { struct NEElementwiseMax::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEElementwiseMax::NEElementwiseMax() - : _impl(std::make_unique()) +NEElementwiseMax::NEElementwiseMax() : _impl(std::make_unique()) { } -NEElementwiseMax::NEElementwiseMax(NEElementwiseMax &&) = default; +NEElementwiseMax::NEElementwiseMax(NEElementwiseMax &&) = default; NEElementwiseMax &NEElementwiseMax::operator=(NEElementwiseMax &&) = default; NEElementwiseMax::~NEElementwiseMax() = default; @@ -57,7 +57,10 @@ void NEElementwiseMax::configure(ITensor *input1, ITensor *input2, ITensor *outp _impl->op->configure(input1->info(), input2->info(), output->info()); } -Status NEElementwiseMax::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status NEElementwiseMax::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); return cpu::CpuElementwiseMax::validate(input1, input2, output); @@ -74,17 +77,16 @@ void NEElementwiseMax::run() struct NEElementwiseMin::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEElementwiseMin::NEElementwiseMin() - : _impl(std::make_unique()) +NEElementwiseMin::NEElementwiseMin() : _impl(std::make_unique()) { } -NEElementwiseMin::NEElementwiseMin(NEElementwiseMin &&) = default; +NEElementwiseMin::NEElementwiseMin(NEElementwiseMin &&) = default; NEElementwiseMin &NEElementwiseMin::operator=(NEElementwiseMin &&) = default; NEElementwiseMin::~NEElementwiseMin() = default; @@ -98,7 +100,10 @@ void NEElementwiseMin::configure(ITensor *input1, ITensor *input2, ITensor *outp _impl->op->configure(input1->info(), input2->info(), output->info()); } -Status NEElementwiseMin::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status NEElementwiseMin::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); return cpu::CpuElementwiseMin::validate(input1, input2, output); @@ -115,21 +120,23 @@ void NEElementwiseMin::run() struct NEElementwiseSquaredDiff::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEElementwiseSquaredDiff::NEElementwiseSquaredDiff() - : _impl(std::make_unique()) +NEElementwiseSquaredDiff::NEElementwiseSquaredDiff() : _impl(std::make_unique()) { } -NEElementwiseSquaredDiff::NEElementwiseSquaredDiff(NEElementwiseSquaredDiff &&) = default; +NEElementwiseSquaredDiff::NEElementwiseSquaredDiff(NEElementwiseSquaredDiff &&) = default; NEElementwiseSquaredDiff &NEElementwiseSquaredDiff::operator=(NEElementwiseSquaredDiff &&) = default; NEElementwiseSquaredDiff::~NEElementwiseSquaredDiff() = default; -void NEElementwiseSquaredDiff::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info) +void NEElementwiseSquaredDiff::configure(ITensor *input1, + ITensor *input2, + ITensor *output, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(act_info); _impl->src_0 = input1; @@ -139,7 +146,10 @@ void NEElementwiseSquaredDiff::configure(ITensor *input1, ITensor *input2, ITens _impl->op->configure(input1->info(), input2->info(), output->info()); } -Status NEElementwiseSquaredDiff::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status NEElementwiseSquaredDiff::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); return cpu::CpuElementwiseSquaredDiff::validate(input1, input2, output); @@ -156,21 +166,23 @@ void NEElementwiseSquaredDiff::run() struct NEElementwiseDivision::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEElementwiseDivision::NEElementwiseDivision() - : _impl(std::make_unique()) +NEElementwiseDivision::NEElementwiseDivision() : _impl(std::make_unique()) { } -NEElementwiseDivision::NEElementwiseDivision(NEElementwiseDivision &&) = default; +NEElementwiseDivision::NEElementwiseDivision(NEElementwiseDivision &&) = default; NEElementwiseDivision &NEElementwiseDivision::operator=(NEElementwiseDivision &&) = default; NEElementwiseDivision::~NEElementwiseDivision() = default; -void NEElementwiseDivision::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info) +void NEElementwiseDivision::configure(ITensor *input1, + ITensor *input2, + ITensor *output, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(act_info); _impl->src_0 = input1; @@ -180,7 +192,10 @@ void NEElementwiseDivision::configure(ITensor *input1, ITensor *input2, ITensor _impl->op->configure(input1->info(), input2->info(), output->info()); } -Status NEElementwiseDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status NEElementwiseDivision::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); return cpu::CpuElementwiseDivision::validate(input1, input2, output); @@ -197,21 +212,23 @@ void NEElementwiseDivision::run() struct NEElementwisePower::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEElementwisePower::NEElementwisePower() - : _impl(std::make_unique()) +NEElementwisePower::NEElementwisePower() : _impl(std::make_unique()) { } -NEElementwisePower::NEElementwisePower(NEElementwisePower &&) = default; +NEElementwisePower::NEElementwisePower(NEElementwisePower &&) = default; NEElementwisePower &NEElementwisePower::operator=(NEElementwisePower &&) = default; NEElementwisePower::~NEElementwisePower() = default; -void NEElementwisePower::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info) +void NEElementwisePower::configure(ITensor *input1, + ITensor *input2, + ITensor *output, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(act_info); _impl->src_0 = input1; @@ -221,7 +238,10 @@ void NEElementwisePower::configure(ITensor *input1, ITensor *input2, ITensor *ou _impl->op->configure(input1->info(), input2->info(), output->info()); } -Status NEElementwisePower::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status NEElementwisePower::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); return cpu::CpuElementwisePower::validate(input1, input2, output); @@ -239,22 +259,22 @@ void NEElementwisePower::run() template struct NEElementwiseComparisonStatic::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr> op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr> op{nullptr}; }; template -NEElementwiseComparisonStatic::NEElementwiseComparisonStatic() - : _impl(std::make_unique()) +NEElementwiseComparisonStatic::NEElementwiseComparisonStatic() : _impl(std::make_unique()) { } template NEElementwiseComparisonStatic::NEElementwiseComparisonStatic(NEElementwiseComparisonStatic &&) = default; -template -NEElementwiseComparisonStatic &NEElementwiseComparisonStatic::operator=(NEElementwiseComparisonStatic &&) = default; -template +template +NEElementwiseComparisonStatic & +NEElementwiseComparisonStatic::operator=(NEElementwiseComparisonStatic &&) = default; +template NEElementwiseComparisonStatic::~NEElementwiseComparisonStatic() = default; template @@ -268,13 +288,15 @@ void NEElementwiseComparisonStatic::configure(ITensor *input1, ITensor *inp } template -Status NEElementwiseComparisonStatic::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) +Status NEElementwiseComparisonStatic::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output) { return cpu::CpuElementwiseComparisonStatic::validate(input1, input2, output); } template -void NEElementwiseComparisonStatic::run() +void NEElementwiseComparisonStatic::run() { ITensorPack pack; pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0); @@ -285,17 +307,16 @@ void NEElementwiseComparisonStatic::run() struct NEElementwiseComparison::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEElementwiseComparison::NEElementwiseComparison() - : _impl(std::make_unique()) +NEElementwiseComparison::NEElementwiseComparison() : _impl(std::make_unique()) { } -NEElementwiseComparison::NEElementwiseComparison(NEElementwiseComparison &&) = default; +NEElementwiseComparison::NEElementwiseComparison(NEElementwiseComparison &&) = default; NEElementwiseComparison &NEElementwiseComparison::operator=(NEElementwiseComparison &&) = default; NEElementwiseComparison::~NEElementwiseComparison() = default; @@ -308,7 +329,10 @@ void NEElementwiseComparison::configure(ITensor *input1, ITensor *input2, ITenso _impl->op->configure(input1->info(), input2->info(), output->info(), op); } -Status NEElementwiseComparison::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation op) +Status NEElementwiseComparison::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + ComparisonOperation op) { return cpu::CpuElementwiseComparison::validate(input1, input2, output, op); } diff --git a/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp b/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp index a0674ec320..23a092c407 100644 --- a/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp +++ b/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp @@ -22,7 +22,9 @@ * SOFTWARE. */ #include "arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayer.h" + #include "src/cpu/operators/CpuElementwiseUnary.h" + #include namespace arm_compute @@ -32,21 +34,20 @@ using OperatorType = cpu::CpuElementwiseUnary; template struct NEElementwiseUnaryLayer::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr cpu_op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr cpu_op{nullptr}; }; template -NEElementwiseUnaryLayer::NEElementwiseUnaryLayer() - : _impl(std::make_unique()) +NEElementwiseUnaryLayer::NEElementwiseUnaryLayer() : _impl(std::make_unique()) { } template NEElementwiseUnaryLayer::~NEElementwiseUnaryLayer() = default; template NEElementwiseUnaryLayer::NEElementwiseUnaryLayer(NEElementwiseUnaryLayer &&) = default; -template +template NEElementwiseUnaryLayer &NEElementwiseUnaryLayer::operator=(NEElementwiseUnaryLayer &&) = default; template @@ -65,7 +66,7 @@ Status NEElementwiseUnaryLayer::validate(const ITensorInfo *input, const ITe } template -void NEElementwiseUnaryLayer::run() +void NEElementwiseUnaryLayer::run() { ITensorPack pack; pack.add_tensor(TensorType::ACL_SRC, _impl->src); diff --git a/src/runtime/NEON/functions/NEFFT1D.cpp b/src/runtime/NEON/functions/NEFFT1D.cpp index 343b817eba..fb75f9da29 100644 --- a/src/runtime/NEON/functions/NEFFT1D.cpp +++ b/src/runtime/NEON/functions/NEFFT1D.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEFFTDigitReverseKernel.h" #include "src/core/NEON/kernels/NEFFTRadixStageKernel.h" @@ -37,7 +38,15 @@ namespace arm_compute NEFFT1D::~NEFFT1D() = default; NEFFT1D::NEFFT1D(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _digit_reverse_kernel(), _fft_kernels(), _scale_kernel(), _digit_reversed_input(), _digit_reverse_indices(), _num_ffts(0), _axis(0), _run_scale(false) + : _memory_group(std::move(memory_manager)), + _digit_reverse_kernel(), + _fft_kernels(), + _scale_kernel(), + _digit_reversed_input(), + _digit_reverse_indices(), + _num_ffts(0), + _axis(0), + _run_scale(false) { } @@ -74,7 +83,7 @@ void NEFFT1D::configure(const ITensor *input, ITensor *output, const FFT1DInfo & _fft_kernels.resize(_num_ffts); _axis = config.axis; - for(unsigned int i = 0; i < _num_ffts; ++i) + for (unsigned int i = 0; i < _num_ffts; ++i) { const unsigned int radix_for_stage = decomposed_vector.at(i); @@ -84,19 +93,21 @@ void NEFFT1D::configure(const ITensor *input, ITensor *output, const FFT1DInfo & fft_kernel_info.Nx = Nx; fft_kernel_info.is_first_stage = (i == 0); _fft_kernels[i] = std::make_unique(); - _fft_kernels[i]->configure(&_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info); + _fft_kernels[i]->configure(&_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, + fft_kernel_info); Nx *= radix_for_stage; } // Configure scale kernel - if(_run_scale) + if (_run_scale) { FFTScaleKernelInfo scale_config; scale_config.scale = static_cast(N); scale_config.conjugate = config.direction == FFTDirection::Inverse; _scale_kernel = std::make_unique(); - is_c2r ? _scale_kernel->configure(&_digit_reversed_input, output, scale_config) : _scale_kernel->configure(output, nullptr, scale_config); + is_c2r ? _scale_kernel->configure(&_digit_reversed_input, output, scale_config) + : _scale_kernel->configure(output, nullptr, scale_config); } // Allocate tensors @@ -113,7 +124,7 @@ Status NEFFT1D::validate(const ITensorInfo *input, const ITensorInfo *output, co ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() > 2); - ARM_COMPUTE_RETURN_ERROR_ON(std::set({ 0, 1 }).count(config.axis) == 0); + ARM_COMPUTE_RETURN_ERROR_ON(std::set({0, 1}).count(config.axis) == 0); // Check if FFT is decomposable const auto supported_radix = NEFFTRadixStageKernel::supported_radix(); @@ -122,7 +133,7 @@ Status NEFFT1D::validate(const ITensorInfo *input, const ITensorInfo *output, co ARM_COMPUTE_RETURN_ERROR_ON(decomposed_vector.empty()); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { // All combinations are supported except real input with real output (i.e., both input channels set to 1) ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() == 1 && input->num_channels() == 1); @@ -140,13 +151,13 @@ void NEFFT1D::run() NEScheduler::get().schedule(_digit_reverse_kernel.get(), (_axis == 0 ? Window::DimY : Window::DimZ)); - for(unsigned int i = 0; i < _num_ffts; ++i) + for (unsigned int i = 0; i < _num_ffts; ++i) { NEScheduler::get().schedule(_fft_kernels[i].get(), (_axis == 0 ? Window::DimY : Window::DimX)); } // Run output scaling - if(_run_scale) + if (_run_scale) { NEScheduler::get().schedule(_scale_kernel.get(), Window::DimY); } diff --git a/src/runtime/NEON/functions/NEFFT2D.cpp b/src/runtime/NEON/functions/NEFFT2D.cpp index ab422bd2ae..066909221d 100644 --- a/src/runtime/NEON/functions/NEFFT2D.cpp +++ b/src/runtime/NEON/functions/NEFFT2D.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/Scheduler.h" + #include "src/common/utils/Log.h" namespace arm_compute @@ -33,7 +34,10 @@ namespace arm_compute NEFFT2D::~NEFFT2D() = default; NEFFT2D::NEFFT2D(std::shared_ptr memory_manager) - : _memory_group(memory_manager), _first_pass_func(memory_manager), _second_pass_func(memory_manager), _first_pass_tensor() + : _memory_group(memory_manager), + _first_pass_func(memory_manager), + _second_pass_func(memory_manager), + _first_pass_tensor() { } @@ -78,7 +82,7 @@ Status NEFFT2D::validate(const ITensorInfo *input, const ITensorInfo *output, co ARM_COMPUTE_RETURN_ON_ERROR(NEFFT1D::validate(&first_pass_tensor, output, second_pass_config)); // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); diff --git a/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp b/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp index 0551d756fb..94f85e5ffa 100644 --- a/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp @@ -25,15 +25,16 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + #include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" #include "src/core/NEON/kernels/NEFFTDigitReverseKernel.h" #include "src/core/NEON/kernels/NEFFTRadixStageKernel.h" #include "src/core/NEON/kernels/NEFFTScaleKernel.h" #include "src/core/NEON/kernels/NEPadLayerKernel.h" #include "src/core/NEON/kernels/NEReductionOperationKernel.h" -#include "src/core/helpers/AutoConfiguration.h" #include "src/core/utils/helpers/fft.h" namespace arm_compute @@ -46,11 +47,11 @@ int pad_decomposable(int N) int pad = 0; bool is_decomposed = false; - while(!is_decomposed) + while (!is_decomposed) { const auto decomposed_vector = arm_compute::helpers::fft::decompose_stages(N++, supported_radix); is_decomposed = !decomposed_vector.empty(); - if(!is_decomposed) + if (!is_decomposed) { ++pad; } @@ -102,8 +103,13 @@ NEFFTConvolutionLayer::NEFFTConvolutionLayer(std::shared_ptr mem } NEFFTConvolutionLayer::~NEFFTConvolutionLayer() = default; -void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info, bool enable_fast_math) +void NEFFTConvolutionLayer::configure(ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { ARM_COMPUTE_UNUSED(enable_fast_math); ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, act_info, enable_fast_math); @@ -115,21 +121,24 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co _has_bias = biases != nullptr; // Get indices for the width and height - const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH); - const size_t idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT); + const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH); + const size_t idx_height = + get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT); // Input shape, kernel size and output tile - const Size2D input_dims = Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]); - const Size2D kernel_size = Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]); - const Size2D pad_valid = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1), - pad_decomposable(input_dims.y() + kernel_size.y() - 1)); + const Size2D input_dims = + Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]); + const Size2D kernel_size = + Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]); + const Size2D pad_valid = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1), + pad_decomposable(input_dims.y() + kernel_size.y() - 1)); // Tensors to use ITensor *input_to_use = input; const ITensor *weights_to_use = weights; ITensor *output_to_use = _has_bias ? &_bias_output : output; // Permute bias - if(biases != nullptr) + if (biases != nullptr) { _permute_bias_func.configure(biases, &_permuted_bias, PermutationVector(1U, 2U, 0U)); _permuted_bias.info()->set_data_layout(DataLayout::NCHW); @@ -137,7 +146,7 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co // Permute input if needed _needs_permute = input->info()->data_layout() == DataLayout::NHWC; - if(_needs_permute) + if (_needs_permute) { _memory_group.manage(&_permuted_input); // Configure the function to transform the input tensor from NHWC -> NCHW @@ -158,7 +167,7 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co _flip_weights_func.configure(weights_to_use, &_flipped_weights, &_flip_axis); // Pad weights - const PaddingList padding_w = { { 0, input_dims.x() + pad_valid.x() - 1 }, { 0, input_dims.y() + pad_valid.y() - 1 } }; + const PaddingList padding_w = {{0, input_dims.x() + pad_valid.x() - 1}, {0, input_dims.y() + pad_valid.y() - 1}}; _pad_weights_func.configure(&_flipped_weights, &_padded_weights, padding_w); // Transform weights @@ -166,10 +175,10 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co _transform_weights_func->configure(&_padded_weights, &_transformed_weights, FFT2DInfo()); // Pad input - const PaddingList padding_in = { { 0, kernel_size.x() + pad_valid.x() - 1 }, { 0, kernel_size.y() + pad_valid.y() - 1 } }; + const PaddingList padding_in = {{0, kernel_size.x() + pad_valid.x() - 1}, {0, kernel_size.y() + pad_valid.y() - 1}}; _memory_group.manage(&_padded_input); _pad_input_func.configure(input_to_use, &_padded_input, padding_in); - if(_needs_permute) + if (_needs_permute) { _permuted_input.allocator()->allocate(); } @@ -193,7 +202,8 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co _memory_group.manage(&_itransformed_output); FFT2DInfo itranform_info; itranform_info.direction = FFTDirection::Inverse; - _itransformed_output.allocator()->init(_output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding()); + _itransformed_output.allocator()->init( + _output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding()); _itransform_output_func.configure(&_output_reduced, &_itransformed_output, itranform_info); _output_reduced.allocator()->allocate(); @@ -205,26 +215,29 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co // Extract correct region const int start_left = kernel_size.x() - conv_info.pad_left() - 1; const int start_top = kernel_size.y() - conv_info.pad_top() - 1; - const int end_right = _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x(); - const int end_botton = _reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y(); - if(_has_bias) + const int end_right = + _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x(); + const int end_botton = + _reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y(); + if (_has_bias) { _memory_group.manage(&_bias_output); } - else if(_needs_permute) + else if (_needs_permute) { output_to_use = &_permuted_output; _memory_group.manage(&_permuted_output); } - _extract_output_func.configure(&_reshaped_output, output_to_use, Coordinates(start_left, start_top), Coordinates(end_right, end_botton)); + _extract_output_func.configure(&_reshaped_output, output_to_use, Coordinates(start_left, start_top), + Coordinates(end_right, end_botton)); _reshaped_output.allocator()->allocate(); _itransformed_output.allocator()->allocate(); // Add bias - if(biases != nullptr) + if (biases != nullptr) { output_to_use = output; - if(_needs_permute) + if (_needs_permute) { output_to_use = &_permuted_output; _memory_group.manage(&_permuted_output); @@ -235,7 +248,7 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co } // Permute output - if(_needs_permute) + if (_needs_permute) { // Configure the function to transform the convoluted output to ACL's native ordering format NCHW _permuted_output.info()->set_data_layout(DataLayout::NCHW); @@ -247,7 +260,7 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co // Configure Activation Layer _is_activationlayer_enabled = act_info.enabled(); - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { _activation_layer_func.configure(output, nullptr, act_info); } @@ -260,8 +273,13 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co axis_data[1] = 1; } -Status NEFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info, bool enable_fast_math) +Status NEFFTConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { ARM_COMPUTE_UNUSED(enable_fast_math); @@ -279,11 +297,13 @@ Status NEFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorIn const auto strides = conv_info.stride(); ARM_COMPUTE_RETURN_ERROR_ON(strides.first != strides.second && strides.first != 1); ARM_COMPUTE_RETURN_ERROR_ON(kernel_size.x() != kernel_size.y()); - ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) || conv_info.pad_right() != (kernel_size.x() / 2)); - ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) || conv_info.pad_bottom() != (kernel_size.y() / 2)); + ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) || + conv_info.pad_right() != (kernel_size.x() / 2)); + ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) || + conv_info.pad_bottom() != (kernel_size.y() / 2)); // Validate biases - if(biases != nullptr) + if (biases != nullptr) { const size_t idx_channels = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); @@ -291,13 +311,14 @@ Status NEFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorIn } // Checks performed when output is configured - if((output != nullptr) && (output->total_size() != 0)) + if ((output != nullptr) && (output->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) || (input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width])); + ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) || + (input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width])); // Validate Activation Layer - if(act_info.enabled()) + if (act_info.enabled()) { ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info)); } @@ -313,7 +334,7 @@ void NEFFTConvolutionLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); // Transform input - if(_needs_permute) + if (_needs_permute) { _permute_input_func.run(); } @@ -331,17 +352,17 @@ void NEFFTConvolutionLayer::run() _extract_output_func.run(); // Add bias - if(_has_bias) + if (_has_bias) { _bias_add_func.run(); } - if(_needs_permute) + if (_needs_permute) { _permute_output_func.run(); } // Run activation layer - if(_is_activationlayer_enabled) + if (_is_activationlayer_enabled) { _activation_layer_func.run(); } @@ -349,10 +370,10 @@ void NEFFTConvolutionLayer::run() void NEFFTConvolutionLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { // Permute bias to NCHW - if(_original_bias != nullptr) + if (_original_bias != nullptr) { _permuted_bias.allocator()->allocate(); _permute_bias_func.run(); @@ -362,7 +383,7 @@ void NEFFTConvolutionLayer::prepare() const ITensor *cur_weights = _original_weights; // Permute weights - if(_needs_permute) + if (_needs_permute) { ARM_COMPUTE_ERROR_ON(!cur_weights->is_used()); diff --git a/src/runtime/NEON/functions/NEFill.cpp b/src/runtime/NEON/functions/NEFill.cpp index 43667783bf..bc1d5b7f5c 100644 --- a/src/runtime/NEON/functions/NEFill.cpp +++ b/src/runtime/NEON/functions/NEFill.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NEFill.h" #include "arm_compute/core/Validate.h" + #include "src/cpu/operators/CpuFill.h" #include @@ -32,15 +33,14 @@ namespace arm_compute { struct NEFill::Impl { - ITensor *tensor{ nullptr }; - std::unique_ptr op{ nullptr }; + ITensor *tensor{nullptr}; + std::unique_ptr op{nullptr}; }; -NEFill::NEFill() - : _impl(std::make_unique()) +NEFill::NEFill() : _impl(std::make_unique()) { } -NEFill::NEFill(NEFill &&) = default; +NEFill::NEFill(NEFill &&) = default; NEFill &NEFill::operator=(NEFill &&) = default; NEFill::~NEFill() = default; diff --git a/src/runtime/NEON/functions/NEFillBorder.cpp b/src/runtime/NEON/functions/NEFillBorder.cpp index d633e340f8..a3ab9c3db4 100644 --- a/src/runtime/NEON/functions/NEFillBorder.cpp +++ b/src/runtime/NEON/functions/NEFillBorder.cpp @@ -25,17 +25,20 @@ #include "arm_compute/core/Window.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEFillBorderKernel.h" namespace arm_compute { -NEFillBorder::NEFillBorder() - : _border_handler(nullptr) +NEFillBorder::NEFillBorder() : _border_handler(nullptr) { } -void NEFillBorder::configure(ITensor *input, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value) +void NEFillBorder::configure(ITensor *input, + unsigned int border_width, + BorderMode border_mode, + const PixelValue &constant_border_value) { ARM_COMPUTE_LOG_PARAMS(input, border_width, border_mode, constant_border_value); _border_handler = std::make_unique(); diff --git a/src/runtime/NEON/functions/NEFlattenLayer.cpp b/src/runtime/NEON/functions/NEFlattenLayer.cpp index f435842634..56db2be3fa 100644 --- a/src/runtime/NEON/functions/NEFlattenLayer.cpp +++ b/src/runtime/NEON/functions/NEFlattenLayer.cpp @@ -24,8 +24,9 @@ #include "arm_compute/runtime/NEON/functions/NEFlattenLayer.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/cpu/operators/CpuFlatten.h" @@ -33,16 +34,15 @@ namespace arm_compute { struct NEFlattenLayer::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEFlattenLayer::NEFlattenLayer() - : _impl(std::make_unique()) +NEFlattenLayer::NEFlattenLayer() : _impl(std::make_unique()) { } -NEFlattenLayer::NEFlattenLayer(NEFlattenLayer &&) = default; +NEFlattenLayer::NEFlattenLayer(NEFlattenLayer &&) = default; NEFlattenLayer &NEFlattenLayer::operator=(NEFlattenLayer &&) = default; NEFlattenLayer::~NEFlattenLayer() = default; @@ -51,7 +51,8 @@ void NEFlattenLayer::configure(const ITensor *input, ITensor *output) ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); _impl->src = input; _impl->dst = output; - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input->info()))); + auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape( + misc::shape_calculator::compute_flatten_shape(input->info()))); _impl->op = std::make_unique(); _impl->op->configure(_impl->src->info(), _impl->dst->info()); @@ -60,9 +61,10 @@ void NEFlattenLayer::configure(const ITensor *input, ITensor *output) Status NEFlattenLayer::validate(const ITensorInfo *input, const ITensorInfo *output) { // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { - const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input)); + const TensorInfo tensor_info_output = + input->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); } return cpu::CpuFlatten::validate(input, output); diff --git a/src/runtime/NEON/functions/NEFloor.cpp b/src/runtime/NEON/functions/NEFloor.cpp index d2dc48a159..112c93c478 100644 --- a/src/runtime/NEON/functions/NEFloor.cpp +++ b/src/runtime/NEON/functions/NEFloor.cpp @@ -24,22 +24,22 @@ #include "arm_compute/runtime/NEON/functions/NEFloor.h" #include "arm_compute/core/Validate.h" + #include "src/cpu/operators/CpuFloor.h" namespace arm_compute { struct NEFloor::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEFloor::NEFloor() - : _impl(std::make_unique()) +NEFloor::NEFloor() : _impl(std::make_unique()) { } -NEFloor::NEFloor(NEFloor &&) = default; +NEFloor::NEFloor(NEFloor &&) = default; NEFloor &NEFloor::operator=(NEFloor &&) = default; NEFloor::~NEFloor() = default; diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp index 891487efd3..2656d0fa0f 100644 --- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp +++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h" + #include "src/common/utils/Log.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/cpu/operators/CpuFullyConnected.h" @@ -38,80 +39,90 @@ using namespace arm_compute::experimental; struct NEFullyConnectedLayer::Impl { MemoryGroup memory_group{}; - IWeightsManager *weights_manager{ nullptr }; + IWeightsManager *weights_manager{nullptr}; - std::unique_ptr op{ nullptr }; + std::unique_ptr op{nullptr}; - const ITensor *original_weights{ nullptr }; + const ITensor *original_weights{nullptr}; ITensorPack run_pack{}; WorkspaceData workspace{}; experimental::MemoryRequirements aux_mem_req{}; - bool is_prepared{ false }; - bool dynamic_weights{ false }; + bool is_prepared{false}; + bool dynamic_weights{false}; }; NEFullyConnectedLayer::~NEFullyConnectedLayer() = default; -NEFullyConnectedLayer::NEFullyConnectedLayer(std::shared_ptr memory_manager, IWeightsManager *weights_manager) +NEFullyConnectedLayer::NEFullyConnectedLayer(std::shared_ptr memory_manager, + IWeightsManager *weights_manager) : _impl(std::make_unique()) { _impl->memory_group = MemoryGroup(std::move(memory_manager)); _impl->weights_manager = weights_manager; } -void NEFullyConnectedLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, - FullyConnectedLayerInfo fc_info, const WeightsInfo &weights_info) +void NEFullyConnectedLayer::configure(const ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + FullyConnectedLayerInfo fc_info, + const WeightsInfo &weights_info) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedLayer::validate(input->info(), - weights->info(), + ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedLayer::validate(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, - output->info(), - fc_info, - weights_info)); + output->info(), fc_info, weights_info)); ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, fc_info); _impl->op = std::make_unique(); _impl->original_weights = weights; _impl->is_prepared = false; - _impl->op->configure(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), fc_info, weights_info); + _impl->op->configure(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), + fc_info, weights_info); - if(_impl->weights_manager != nullptr) + if (_impl->weights_manager != nullptr) { _impl->weights_manager->manage(_impl->original_weights); } _impl->aux_mem_req = _impl->op->workspace(); - _impl->run_pack = { { ACL_SRC_0, input }, { ACL_SRC_1, weights }, { ACL_SRC_2, biases }, { ACL_DST, output } }; - _impl->workspace = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack); - - _impl->dynamic_weights = - !weights->info()->are_values_constant() && - fc_info.transpose_weights && - !fc_info.are_weights_reshaped && - !fc_info.retain_internal_weights; + _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}}; + _impl->workspace = + manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack); + + _impl->dynamic_weights = !weights->info()->are_values_constant() && fc_info.transpose_weights && + !fc_info.are_weights_reshaped && !fc_info.retain_internal_weights; } -Status NEFullyConnectedLayer::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *input, const ITensorInfo *weights, - const ITensorInfo *biases, const ITensorInfo *output, const FullyConnectedLayerInfo &fc_info, - const WeightsInfo &weights_info) +Status NEFullyConnectedLayer::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const FullyConnectedLayerInfo &fc_info, + const WeightsInfo &weights_info) { - return cpu::CpuFullyConnected::has_opt_impl(expected_weight_format, input, weights, biases, output, fc_info, weights_info); + return cpu::CpuFullyConnected::has_opt_impl(expected_weight_format, input, weights, biases, output, fc_info, + weights_info); } -Status NEFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, - FullyConnectedLayerInfo fc_info, const WeightsInfo &weights_info) +Status NEFullyConnectedLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + FullyConnectedLayerInfo fc_info, + const WeightsInfo &weights_info) { return cpu::CpuFullyConnected::validate(input, weights, biases, output, fc_info, weights_info); } void NEFullyConnectedLayer::run() { - if(!_impl->dynamic_weights) + if (!_impl->dynamic_weights) { prepare(); } @@ -122,7 +133,7 @@ void NEFullyConnectedLayer::run() void NEFullyConnectedLayer::prepare() { - if(!_impl->is_prepared) + if (!_impl->is_prepared) { _impl->op->prepare(_impl->run_pack); @@ -131,13 +142,13 @@ void NEFullyConnectedLayer::prepare() _impl->is_prepared = true; // Handle weights managed infrastructure - if(_impl->weights_manager != nullptr && _impl->weights_manager->are_weights_managed(_impl->original_weights)) + if (_impl->weights_manager != nullptr && _impl->weights_manager->are_weights_managed(_impl->original_weights)) { // Ensure that b gets marked as unused (memory released) only after the last function which uses b also finishes its prepare // This is for cases where multiple functions share the same b (weights) // Therefore when a function marks original b as unused, we pre-mark it in weights manager, and mark it back to used so that it doesn't get released before its last reference const ITensor *original_b = _impl->original_weights; - if(!original_b->is_used()) + if (!original_b->is_used()) { _impl->weights_manager->pre_mark_as_unused(original_b); } diff --git a/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp b/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp index 6612845d86..f5b8b57dac 100644 --- a/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp +++ b/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h" @@ -35,29 +36,42 @@ namespace arm_compute { NEFuseBatchNormalization::~NEFuseBatchNormalization() = default; -NEFuseBatchNormalization::NEFuseBatchNormalization() - : _fuse_bn_kernel() +NEFuseBatchNormalization::NEFuseBatchNormalization() : _fuse_bn_kernel() { } -void NEFuseBatchNormalization::configure(const ITensor *input_weights, const ITensor *bn_mean, const ITensor *bn_var, - ITensor *fused_weights, ITensor *fused_bias, - const ITensor *input_bias, const ITensor *bn_beta, const ITensor *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +void NEFuseBatchNormalization::configure(const ITensor *input_weights, + const ITensor *bn_mean, + const ITensor *bn_var, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *input_bias, + const ITensor *bn_beta, + const ITensor *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { - ARM_COMPUTE_LOG_PARAMS(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, - bn_beta, bn_gamma, epsilon, fbn_type); + ARM_COMPUTE_LOG_PARAMS(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, + epsilon, fbn_type); _fuse_bn_kernel = std::make_unique(); - _fuse_bn_kernel->configure(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type); + _fuse_bn_kernel->configure(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, + epsilon, fbn_type); } -Status NEFuseBatchNormalization::validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var, - const ITensorInfo *fused_weights, const ITensorInfo *fused_bias, - const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma, - float epsilon, FuseBatchNormalizationType fbn_type) +Status NEFuseBatchNormalization::validate(const ITensorInfo *input_weights, + const ITensorInfo *bn_mean, + const ITensorInfo *bn_var, + const ITensorInfo *fused_weights, + const ITensorInfo *fused_bias, + const ITensorInfo *input_bias, + const ITensorInfo *bn_beta, + const ITensorInfo *bn_gamma, + float epsilon, + FuseBatchNormalizationType fbn_type) { - return NEFuseBatchNormalizationKernel::validate(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type); + return NEFuseBatchNormalizationKernel::validate(input_weights, bn_mean, bn_var, fused_weights, fused_bias, + input_bias, bn_beta, bn_gamma, epsilon, fbn_type); } void NEFuseBatchNormalization::run() diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp index e51f2f9eb6..934a8250cc 100644 --- a/src/runtime/NEON/functions/NEGEMM.cpp +++ b/src/runtime/NEON/functions/NEGEMM.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/Tensor.h" + #include "src/core/CPP/Validate.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/cpu/operators/CpuGemm.h" @@ -39,12 +40,12 @@ namespace arm_compute struct NEGEMM::Impl { MemoryGroup memory_group{}; - IWeightsManager *weights_manager{ nullptr }; + IWeightsManager *weights_manager{nullptr}; - std::unique_ptr op{ nullptr }; + std::unique_ptr op{nullptr}; - const ITensor *original_b{ nullptr }; - bool is_prepared{ false }; + const ITensor *original_b{nullptr}; + bool is_prepared{false}; ITensorPack run_pack{}; ITensorPack prep_pack{}; @@ -61,10 +62,17 @@ NEGEMM::NEGEMM(std::shared_ptr memory_manager, IWeightsManager * NEGEMM::~NEGEMM() = default; -void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta, const GEMMInfo &gemm_info) +void NEGEMM::configure(const ITensor *a, + const ITensor *b, + const ITensor *c, + ITensor *d, + float alpha, + float beta, + const GEMMInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d); - ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuGemm::validate(a->info(), b->info(), (c != nullptr) ? c->info() : nullptr, d->info(), alpha, beta, gemm_info)); + ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuGemm::validate(a->info(), b->info(), (c != nullptr) ? c->info() : nullptr, + d->info(), alpha, beta, gemm_info)); // Check if we need to reshape the matrix B only on the first run _impl->is_prepared = false; @@ -73,24 +81,32 @@ void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITe // Make the B matrix dynamic values. auto b_info_to_use = b->info()->clone(); - if(!gemm_info.reshape_b_only_on_first_run()) + if (!gemm_info.reshape_b_only_on_first_run()) { b_info_to_use->set_are_values_constant(false); } - _impl->op->configure(a->info(), b_info_to_use.get(), (c != nullptr) ? c->info() : nullptr, d->info(), alpha, beta, gemm_info); + _impl->op->configure(a->info(), b_info_to_use.get(), (c != nullptr) ? c->info() : nullptr, d->info(), alpha, beta, + gemm_info); _impl->aux_mem_req = _impl->op->workspace(); - _impl->run_pack = { { ACL_SRC_0, a }, { ACL_SRC_1, b }, { ACL_SRC_2, c }, { ACL_DST, d } }; - _impl->prep_pack = { { ACL_SRC_1, b }, { ACL_SRC_2, c } }; - _impl->workspace = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); + _impl->run_pack = {{ACL_SRC_0, a}, {ACL_SRC_1, b}, {ACL_SRC_2, c}, {ACL_DST, d}}; + _impl->prep_pack = {{ACL_SRC_1, b}, {ACL_SRC_2, c}}; + _impl->workspace = + manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); } -Status NEGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +Status NEGEMM::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { // Make the B matrix dynamic values. auto b_to_use = b->clone(); - if(!gemm_info.reshape_b_only_on_first_run()) + if (!gemm_info.reshape_b_only_on_first_run()) { b_to_use->set_are_values_constant(false); } @@ -98,8 +114,14 @@ Status NEGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITenso return cpu::CpuGemm::validate(a, b_to_use.get(), c, output, alpha, beta, gemm_info); } -Status NEGEMM::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, - float alpha, float beta, const GEMMInfo &gemm_info) +Status NEGEMM::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { ARM_COMPUTE_UNUSED(alpha, beta); return cpu::CpuGemm::has_opt_impl(expected_weight_format, a, b, c, output, gemm_info); @@ -115,15 +137,15 @@ void NEGEMM::run() void NEGEMM::prepare() { - if(!_impl->is_prepared) + if (!_impl->is_prepared) { _impl->op->prepare(_impl->prep_pack); - auto has_reshape = std::find_if(_impl->aux_mem_req.begin(), - _impl->aux_mem_req.end(), - [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); + auto has_reshape = + std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(), + [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); - if(has_reshape != std::end(_impl->aux_mem_req)) + if (has_reshape != std::end(_impl->aux_mem_req)) { _impl->original_b->mark_as_unused(); } diff --git a/src/runtime/NEON/functions/NEGEMMConv2d.cpp b/src/runtime/NEON/functions/NEGEMMConv2d.cpp index 42b8b70405..6cca02eea9 100644 --- a/src/runtime/NEON/functions/NEGEMMConv2d.cpp +++ b/src/runtime/NEON/functions/NEGEMMConv2d.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/Tensor.h" + #include "src/core/helpers/MemoryHelpers.h" #include "src/cpu/operators/CpuGemmDirectConv2d.h" @@ -35,25 +36,25 @@ using namespace arm_compute::experimental; struct NEGEMMConv2d::Impl { - const ITensor *weights{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *weights{nullptr}; + std::unique_ptr op{nullptr}; ITensorPack run_pack{}; ITensorPack prep_pack{}; WorkspaceData workspace{}; MemoryGroup memory_group{}; - bool is_prepared{ false }; + bool is_prepared{false}; experimental::MemoryRequirements aux_mem_req{}; }; -NEGEMMConv2d::NEGEMMConv2d(const std::shared_ptr &memory_manager) - : _impl(std::make_unique()) +NEGEMMConv2d::NEGEMMConv2d(const std::shared_ptr &memory_manager) : _impl(std::make_unique()) { _impl->memory_group = MemoryGroup(memory_manager); } NEGEMMConv2d::~NEGEMMConv2d() = default; -void NEGEMMConv2d::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv2dInfo &info) +void NEGEMMConv2d::configure( + ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv2dInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); @@ -61,15 +62,21 @@ void NEGEMMConv2d::configure(ITensor *input, const ITensor *weights, const ITens _impl->is_prepared = false; _impl->op = std::make_unique(); - _impl->op->configure(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), info); + _impl->op->configure(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), + info); _impl->aux_mem_req = _impl->op->workspace(); - _impl->run_pack = { { TensorType::ACL_SRC_0, input }, { TensorType::ACL_SRC_2, biases }, { TensorType::ACL_DST, output } }; - _impl->prep_pack = { { TensorType::ACL_SRC_1, weights }, { TensorType::ACL_SRC_2, biases } }; - _impl->workspace = manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack); + _impl->run_pack = {{TensorType::ACL_SRC_0, input}, {TensorType::ACL_SRC_2, biases}, {TensorType::ACL_DST, output}}; + _impl->prep_pack = {{TensorType::ACL_SRC_1, weights}, {TensorType::ACL_SRC_2, biases}}; + _impl->workspace = + manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack); } -Status NEGEMMConv2d::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const Conv2dInfo &info) +Status NEGEMMConv2d::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const Conv2dInfo &info) { return OperatorType::validate(input, weights, biases, output, info); } @@ -84,15 +91,15 @@ void NEGEMMConv2d::run() void NEGEMMConv2d::prepare() { - if(!_impl->is_prepared) + if (!_impl->is_prepared) { _impl->op->prepare(_impl->prep_pack); - auto has_reshape = std::find_if(_impl->aux_mem_req.begin(), - _impl->aux_mem_req.end(), - [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); + auto has_reshape = + std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(), + [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); - if(has_reshape != std::end(_impl->aux_mem_req)) + if (has_reshape != std::end(_impl->aux_mem_req)) { _impl->weights->mark_as_unused(); } diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp index fe3ea6a767..c8f65d2fd9 100644 --- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp @@ -27,6 +27,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/Tensor.h" + #include "src/core/helpers/MemoryHelpers.h" #include "src/cpu/operators/CpuGemmConv2d.h" @@ -36,17 +37,18 @@ namespace arm_compute { struct NEGEMMConvolutionLayer::Impl { - const ITensor *weights{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *weights{nullptr}; + std::unique_ptr op{nullptr}; ITensorPack run_pack{}; MemoryGroup memory_group{}; - IWeightsManager *weights_manager{ nullptr }; + IWeightsManager *weights_manager{nullptr}; MemoryRequirements aux_mem_req{}; WorkspaceData workspace_tensors{}; - bool is_prepared{ false }; + bool is_prepared{false}; }; -NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr &memory_manager, IWeightsManager *weights_manager) +NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr &memory_manager, + IWeightsManager *weights_manager) : _impl(std::make_unique()) { _impl->weights_manager = weights_manager; @@ -54,37 +56,61 @@ NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptrweights = weights; _impl->op = std::make_unique(); - _impl->op->configure(input->info(), weights->info(), (biases != nullptr ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups); + _impl->op->configure(input->info(), weights->info(), (biases != nullptr ? biases->info() : nullptr), output->info(), + conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups); - _impl->run_pack = - { - { TensorType::ACL_SRC_0, input }, - { TensorType::ACL_SRC_1, weights }, - { TensorType::ACL_SRC_2, biases }, - { TensorType::ACL_DST, output } - }; - _impl->aux_mem_req = _impl->op->workspace(); - _impl->workspace_tensors = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack); + _impl->run_pack = {{TensorType::ACL_SRC_0, input}, + {TensorType::ACL_SRC_1, weights}, + {TensorType::ACL_SRC_2, biases}, + {TensorType::ACL_DST, output}}; + _impl->aux_mem_req = _impl->op->workspace(); + _impl->workspace_tensors = + manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack); } -Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) +Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) { - return cpu::CpuGemmConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups); + return cpu::CpuGemmConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info, + enable_fast_math, num_groups); } -Status NEGEMMConvolutionLayer::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, const bool enable_fast_math) +Status NEGEMMConvolutionLayer::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + const bool enable_fast_math) { - return cpu::CpuGemmConv2d::has_opt_impl(expected_weight_format, src, weights, biases, dst, conv_info, weights_info, dilation, act_info, enable_fast_math); + return cpu::CpuGemmConv2d::has_opt_impl(expected_weight_format, src, weights, biases, dst, conv_info, weights_info, + dilation, act_info, enable_fast_math); } void NEGEMMConvolutionLayer::run() @@ -96,7 +122,7 @@ void NEGEMMConvolutionLayer::run() void NEGEMMConvolutionLayer::prepare() { - if(!_impl->is_prepared) + if (!_impl->is_prepared) { _impl->op->prepare(_impl->run_pack); diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp index 453d3cedef..44bfc6a51e 100644 --- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp +++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp @@ -29,8 +29,8 @@ #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/NEON/NEScheduler.h" #include "arm_compute/runtime/Tensor.h" -#include "src/core/helpers/MemoryHelpers.h" +#include "src/core/helpers/MemoryHelpers.h" #include "src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h" using namespace arm_compute::experimental; @@ -39,18 +39,19 @@ namespace arm_compute { struct NEGEMMLowpMatrixMultiplyCore::Impl { - const ITensor *b{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *b{nullptr}; + std::unique_ptr op{nullptr}; ITensorPack run_pack{}; ITensorPack prep_pack{}; MemoryGroup memory_group{}; - IWeightsManager *weights_manager{ nullptr }; + IWeightsManager *weights_manager{nullptr}; MemoryRequirements aux_mem_req{}; WorkspaceData workspace_tensors{}; - bool is_prepared{ false }; + bool is_prepared{false}; }; -NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr memory_manager, IWeightsManager *weights_manager) +NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr memory_manager, + IWeightsManager *weights_manager) : _impl(std::make_unique()) { _impl->weights_manager = weights_manager; @@ -58,41 +59,41 @@ NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptrinfo()->clone(); - if(!gemm_info.reshape_b_only_on_first_run()) + if (!gemm_info.reshape_b_only_on_first_run()) { b_info_to_use->set_are_values_constant(false); } _impl->b = b; _impl->op = std::make_unique(); - _impl->op->configure(a->info(), b_info_to_use.get(), (c != nullptr ? c->info() : nullptr), output->info(), gemm_info); - _impl->run_pack = - { - { TensorType::ACL_SRC_0, a }, - { TensorType::ACL_SRC_1, b }, - { TensorType::ACL_SRC_2, c }, - { TensorType::ACL_DST, output } - }; - _impl->prep_pack = - { - { TensorType::ACL_SRC_1, b }, - { TensorType::ACL_SRC_2, c } - }; - _impl->aux_mem_req = _impl->op->workspace(); - _impl->workspace_tensors = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); + _impl->op->configure(a->info(), b_info_to_use.get(), (c != nullptr ? c->info() : nullptr), output->info(), + gemm_info); + _impl->run_pack = {{TensorType::ACL_SRC_0, a}, + {TensorType::ACL_SRC_1, b}, + {TensorType::ACL_SRC_2, c}, + {TensorType::ACL_DST, output}}; + _impl->prep_pack = {{TensorType::ACL_SRC_1, b}, {TensorType::ACL_SRC_2, c}}; + _impl->aux_mem_req = _impl->op->workspace(); + _impl->workspace_tensors = + manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); } -Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info) +Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + const GEMMInfo &gemm_info) { // Make the B matrix dynamic values. auto b_info_to_use = b->clone(); - if(!gemm_info.reshape_b_only_on_first_run()) + if (!gemm_info.reshape_b_only_on_first_run()) { b_info_to_use->set_are_values_constant(false); } @@ -109,15 +110,15 @@ void NEGEMMLowpMatrixMultiplyCore::run() void NEGEMMLowpMatrixMultiplyCore::prepare() { - if(!_impl->is_prepared) + if (!_impl->is_prepared) { _impl->op->prepare(_impl->prep_pack); - auto has_reshape = std::find_if(_impl->aux_mem_req.begin(), - _impl->aux_mem_req.end(), - [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); + auto has_reshape = + std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(), + [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; }); - if(has_reshape != std::end(_impl->aux_mem_req)) + if (has_reshape != std::end(_impl->aux_mem_req)) { _impl->b->mark_as_unused(); } diff --git a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp index 7e1de3c257..8178003b5e 100644 --- a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp +++ b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp @@ -25,45 +25,48 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Validate.h" + #include "src/cpu/operators/CpuGemmLowpOutputStage.h" namespace arm_compute { struct NEGEMMLowpOutputStage::Impl { - const ITensor *src{ nullptr }; - const ITensor *bias{ nullptr }; - ITensor *dst{ nullptr }; + const ITensor *src{nullptr}; + const ITensor *bias{nullptr}; + ITensor *dst{nullptr}; ITensorPack run_pack{}; - std::unique_ptr op{ nullptr }; + std::unique_ptr op{nullptr}; }; -NEGEMMLowpOutputStage::NEGEMMLowpOutputStage() - : _impl(std::make_unique()) +NEGEMMLowpOutputStage::NEGEMMLowpOutputStage() : _impl(std::make_unique()) { } NEGEMMLowpOutputStage::~NEGEMMLowpOutputStage() = default; -void NEGEMMLowpOutputStage::configure(const ITensor *input, const ITensor *bias, ITensor *output, const GEMMLowpOutputStageInfo &info) +void NEGEMMLowpOutputStage::configure(const ITensor *input, + const ITensor *bias, + ITensor *output, + const GEMMLowpOutputStageInfo &info) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpOutputStage::validate(input->info(), bias != nullptr ? bias->info() : nullptr, output->info(), info)); + ARM_COMPUTE_ERROR_THROW_ON( + NEGEMMLowpOutputStage::validate(input->info(), bias != nullptr ? bias->info() : nullptr, output->info(), info)); _impl->src = input; _impl->bias = bias; _impl->dst = output; _impl->op = std::make_unique(); _impl->op->configure(input->info(), (bias == nullptr) ? nullptr : bias->info(), output->info(), info); - _impl->run_pack = - { - { TensorType::ACL_SRC, _impl->src }, - { TensorType::ACL_BIAS, _impl->bias }, - { TensorType::ACL_DST, _impl->dst } - }; + _impl->run_pack = { + {TensorType::ACL_SRC, _impl->src}, {TensorType::ACL_BIAS, _impl->bias}, {TensorType::ACL_DST, _impl->dst}}; } -Status NEGEMMLowpOutputStage::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo &info) +Status NEGEMMLowpOutputStage::validate(const ITensorInfo *input, + const ITensorInfo *bias, + const ITensorInfo *output, + const GEMMLowpOutputStageInfo &info) { return cpu::CpuGemmLowpOutputStage::validate(input, bias, output, info); } diff --git a/src/runtime/NEON/functions/NEGather.cpp b/src/runtime/NEON/functions/NEGather.cpp index f5d19c769e..62b8cfa48b 100644 --- a/src/runtime/NEON/functions/NEGather.cpp +++ b/src/runtime/NEON/functions/NEGather.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/NEON/functions/NEGather.h" -#include "src/core/NEON/kernels/NEGatherKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEGatherKernel.h" #include diff --git a/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp b/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp index 1c0e736766..1022b4153e 100644 --- a/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp +++ b/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp @@ -25,11 +25,12 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" #include "src/core/NEON/kernels/NEFillBorderKernel.h" #include "src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h" #include "src/core/NEON/kernels/NEPadLayerKernel.h" -#include "src/core/helpers/AutoConfiguration.h" namespace arm_compute { @@ -68,42 +69,55 @@ NEGenerateProposalsLayer::NEGenerateProposalsLayer(std::shared_ptrinfo(), deltas->info(), anchors->info(), proposals->info(), scores_out->info(), num_valid_proposals->info(), info)); + ARM_COMPUTE_ERROR_THROW_ON(NEGenerateProposalsLayer::validate(scores->info(), deltas->info(), anchors->info(), + proposals->info(), scores_out->info(), + num_valid_proposals->info(), info)); ARM_COMPUTE_LOG_PARAMS(scores, deltas, anchors, proposals, scores_out, num_valid_proposals, info); _is_nhwc = scores->info()->data_layout() == DataLayout::NHWC; const DataType scores_data_type = scores->info()->data_type(); _is_qasymm8 = scores_data_type == DataType::QASYMM8; - const int num_anchors = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL)); - const int feat_width = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH)); - const int feat_height = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT)); - const int total_num_anchors = num_anchors * feat_width * feat_height; - const int pre_nms_topN = info.pre_nms_topN(); - const int post_nms_topN = info.post_nms_topN(); - const size_t values_per_roi = info.values_per_roi(); + const int num_anchors = scores->info()->dimension( + get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL)); + const int feat_width = scores->info()->dimension( + get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH)); + const int feat_height = scores->info()->dimension( + get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT)); + const int total_num_anchors = num_anchors * feat_width * feat_height; + const int pre_nms_topN = info.pre_nms_topN(); + const int post_nms_topN = info.post_nms_topN(); + const size_t values_per_roi = info.values_per_roi(); const QuantizationInfo scores_qinfo = scores->info()->quantization_info(); const DataType rois_data_type = (_is_qasymm8) ? DataType::QASYMM16 : scores_data_type; - const QuantizationInfo rois_qinfo = (_is_qasymm8) ? QuantizationInfo(0.125f, 0) : scores->info()->quantization_info(); + const QuantizationInfo rois_qinfo = + (_is_qasymm8) ? QuantizationInfo(0.125f, 0) : scores->info()->quantization_info(); // Compute all the anchors _memory_group.manage(&_all_anchors); _compute_anchors = std::make_unique(); - _compute_anchors->configure(anchors, &_all_anchors, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())); + _compute_anchors->configure(anchors, &_all_anchors, + ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())); const TensorShape flatten_shape_deltas(values_per_roi, total_num_anchors); - _deltas_flattened.allocator()->init(TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info())); + _deltas_flattened.allocator()->init( + TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info())); // Permute and reshape deltas _memory_group.manage(&_deltas_flattened); - if(!_is_nhwc) + if (!_is_nhwc) { _memory_group.manage(&_deltas_permuted); - _permute_deltas.configure(deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 }); + _permute_deltas.configure(deltas, &_deltas_permuted, PermutationVector{2, 0, 1}); _flatten_deltas.configure(&_deltas_permuted, &_deltas_flattened); _deltas_permuted.allocator()->allocate(); } @@ -117,10 +131,10 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d // Permute and reshape scores _memory_group.manage(&_scores_flattened); - if(!_is_nhwc) + if (!_is_nhwc) { _memory_group.manage(&_scores_permuted); - _permute_scores.configure(scores, &_scores_permuted, PermutationVector{ 2, 0, 1 }); + _permute_scores.configure(scores, &_scores_permuted, PermutationVector{2, 0, 1}); _flatten_scores.configure(&_scores_permuted, &_scores_flattened); _scores_permuted.allocator()->allocate(); } @@ -131,7 +145,7 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d Tensor *anchors_to_use = &_all_anchors; Tensor *deltas_to_use = &_deltas_flattened; - if(_is_qasymm8) + if (_is_qasymm8) { _all_anchors_f32.allocator()->init(TensorInfo(_all_anchors.info()->tensor_shape(), 1, DataType::F32)); _deltas_flattened_f32.allocator()->init(TensorInfo(_deltas_flattened.info()->tensor_shape(), 1, DataType::F32)); @@ -154,11 +168,12 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d anchors_to_use->allocator()->allocate(); _all_proposals_to_use = &_all_proposals; - if(_is_qasymm8) + if (_is_qasymm8) { _memory_group.manage(&_all_proposals_quantized); // Requantize all_proposals to QASYMM16 with 0.125 scale and 0 offset - _all_proposals_quantized.allocator()->init(TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0))); + _all_proposals_quantized.allocator()->init( + TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0))); _quantize_all_proposals.configure(&_all_proposals, &_all_proposals_quantized); _all_proposals.allocator()->allocate(); _all_proposals_to_use = &_all_proposals_quantized; @@ -174,7 +189,8 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d // Note that NMS needs outputs preinitialized. auto_init_if_empty(*scores_out->info(), TensorShape(scores_nms_size), 1, scores_data_type, scores_qinfo); - auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, rois_data_type, rois_qinfo); + auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, rois_data_type, + rois_qinfo); auto_init_if_empty(*num_valid_proposals->info(), TensorShape(1), 1, DataType::U32); // Initialize temporaries (unused) outputs @@ -187,17 +203,12 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d _memory_group.manage(&_proposals_4_roi_values); - const BoxNMSLimitInfo box_nms_info(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f, true, min_size_scaled, info.im_width(), info.im_height()); - _cpp_nms.configure(&_scores_flattened /*scores_in*/, - _all_proposals_to_use /*boxes_in,*/, - nullptr /* batch_splits_in*/, - scores_out /* scores_out*/, - &_proposals_4_roi_values /*boxes_out*/, - &_classes_nms_unused /*classes*/, - nullptr /*batch_splits_out*/, - &_keeps_nms_unused /*keeps*/, - num_valid_proposals /* keeps_size*/, - box_nms_info); + const BoxNMSLimitInfo box_nms_info(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f, + true, min_size_scaled, info.im_width(), info.im_height()); + _cpp_nms.configure(&_scores_flattened /*scores_in*/, _all_proposals_to_use /*boxes_in,*/, + nullptr /* batch_splits_in*/, scores_out /* scores_out*/, &_proposals_4_roi_values /*boxes_out*/, + &_classes_nms_unused /*classes*/, nullptr /*batch_splits_out*/, &_keeps_nms_unused /*keeps*/, + num_valid_proposals /* keeps_size*/, box_nms_info); _keeps_nms_unused.allocator()->allocate(); _classes_nms_unused.allocator()->allocate(); @@ -205,12 +216,17 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d _scores_flattened.allocator()->allocate(); // Add the first column that represents the batch id. This will be all zeros, as we don't support multiple images - _pad.configure(&_proposals_4_roi_values, proposals, PaddingList{ { 1, 0 } }); + _pad.configure(&_proposals_4_roi_values, proposals, PaddingList{{1, 0}}); _proposals_4_roi_values.allocator()->allocate(); } -Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITensorInfo *deltas, const ITensorInfo *anchors, const ITensorInfo *proposals, const ITensorInfo *scores_out, - const ITensorInfo *num_valid_proposals, const GenerateProposalsInfo &info) +Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, + const ITensorInfo *deltas, + const ITensorInfo *anchors, + const ITensorInfo *proposals, + const ITensorInfo *scores_out, + const ITensorInfo *num_valid_proposals, + const GenerateProposalsInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores, 1, DataType::QASYMM8, DataType::F16, DataType::F32); @@ -218,9 +234,12 @@ Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(scores, deltas); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(scores, deltas); - const int num_anchors = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL)); - const int feat_width = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH)); - const int feat_height = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT)); + const int num_anchors = + scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL)); + const int feat_width = + scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH)); + const int feat_height = + scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT)); const int num_images = scores->dimension(3); const int total_num_anchors = num_anchors * feat_width * feat_height; const int values_per_roi = info.values_per_roi(); @@ -229,76 +248,100 @@ Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens ARM_COMPUTE_RETURN_ERROR_ON(num_images > 1); - if(is_qasymm8) + if (is_qasymm8) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(anchors, 1, DataType::QSYMM16); const UniformQuantizationInfo anchors_qinfo = anchors->quantization_info().uniform(); ARM_COMPUTE_RETURN_ERROR_ON(anchors_qinfo.scale != 0.125f); } - TensorInfo all_anchors_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); - ARM_COMPUTE_RETURN_ON_ERROR(NEComputeAllAnchorsKernel::validate(anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()))); - - TensorInfo deltas_permuted_info = deltas->clone()->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height)).set_is_resizable(true); - TensorInfo scores_permuted_info = scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true); - if(scores->data_layout() == DataLayout::NHWC) + TensorInfo all_anchors_info( + anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + ARM_COMPUTE_RETURN_ON_ERROR(NEComputeAllAnchorsKernel::validate( + anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()))); + + TensorInfo deltas_permuted_info = + deltas->clone() + ->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height)) + .set_is_resizable(true); + TensorInfo scores_permuted_info = + scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true); + if (scores->data_layout() == DataLayout::NHWC) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(deltas, &deltas_permuted_info); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(scores, &scores_permuted_info); } else { - ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(deltas, &deltas_permuted_info, PermutationVector{ 2, 0, 1 })); - ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(scores, &scores_permuted_info, PermutationVector{ 2, 0, 1 })); + ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(deltas, &deltas_permuted_info, PermutationVector{2, 0, 1})); + ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(scores, &scores_permuted_info, PermutationVector{2, 0, 1})); } - TensorInfo deltas_flattened_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + TensorInfo deltas_flattened_info( + deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayer::validate(&deltas_permuted_info, &deltas_flattened_info)); - TensorInfo scores_flattened_info(scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true)); - TensorInfo proposals_4_roi_values(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + TensorInfo scores_flattened_info( + scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true)); + TensorInfo proposals_4_roi_values( + deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayer::validate(&scores_permuted_info, &scores_flattened_info)); TensorInfo *proposals_4_roi_values_to_use = &proposals_4_roi_values; - TensorInfo proposals_4_roi_values_quantized(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); - proposals_4_roi_values_quantized.set_data_type(DataType::QASYMM16).set_quantization_info(QuantizationInfo(0.125f, 0)); - if(is_qasymm8) + TensorInfo proposals_4_roi_values_quantized( + deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true)); + proposals_4_roi_values_quantized.set_data_type(DataType::QASYMM16) + .set_quantization_info(QuantizationInfo(0.125f, 0)); + if (is_qasymm8) { - TensorInfo all_anchors_f32_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32)); + TensorInfo all_anchors_f32_info(anchors->clone() + ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)) + .set_is_resizable(true) + .set_data_type(DataType::F32)); ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayer::validate(&all_anchors_info, &all_anchors_f32_info)); - TensorInfo deltas_flattened_f32_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32)); - ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayer::validate(&deltas_flattened_info, &deltas_flattened_f32_info)); - - TensorInfo proposals_4_roi_values_f32(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32)); - ARM_COMPUTE_RETURN_ON_ERROR(NEBoundingBoxTransform::validate(&all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info, - BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); - - ARM_COMPUTE_RETURN_ON_ERROR(NEQuantizationLayer::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized)); + TensorInfo deltas_flattened_f32_info(deltas->clone() + ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)) + .set_is_resizable(true) + .set_data_type(DataType::F32)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEDequantizationLayer::validate(&deltas_flattened_info, &deltas_flattened_f32_info)); + + TensorInfo proposals_4_roi_values_f32(deltas->clone() + ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)) + .set_is_resizable(true) + .set_data_type(DataType::F32)); + ARM_COMPUTE_RETURN_ON_ERROR(NEBoundingBoxTransform::validate( + &all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info, + BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); + + ARM_COMPUTE_RETURN_ON_ERROR( + NEQuantizationLayer::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized)); proposals_4_roi_values_to_use = &proposals_4_roi_values_quantized; } else { - ARM_COMPUTE_RETURN_ON_ERROR(NEBoundingBoxTransform::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info, - BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEBoundingBoxTransform::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info, + BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f))); } - ARM_COMPUTE_RETURN_ON_ERROR(NEPadLayer::validate(proposals_4_roi_values_to_use, proposals, PaddingList{ { 1, 0 } })); + ARM_COMPUTE_RETURN_ON_ERROR(NEPadLayer::validate(proposals_4_roi_values_to_use, proposals, PaddingList{{1, 0}})); - if(num_valid_proposals->total_size() > 0) + if (num_valid_proposals->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->dimension(0) > 1); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(num_valid_proposals, 1, DataType::U32); } - if(proposals->total_size() > 0) + if (proposals->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON(proposals->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(0) != size_t(values_per_roi) + 1); ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(1) != size_t(total_num_anchors)); - if(is_qasymm8) + if (is_qasymm8) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(proposals, 1, DataType::QASYMM16); const UniformQuantizationInfo proposals_qinfo = proposals->quantization_info().uniform(); @@ -311,7 +354,7 @@ Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens } } - if(scores_out->total_size() > 0) + if (scores_out->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON(scores_out->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(scores_out->dimension(0) != size_t(total_num_anchors)); @@ -330,7 +373,7 @@ void NEGenerateProposalsLayer::run() NEScheduler::get().schedule(_compute_anchors.get(), Window::DimY); // Transpose and reshape the inputs - if(!_is_nhwc) + if (!_is_nhwc) { _permute_deltas.run(); _permute_scores.run(); @@ -339,7 +382,7 @@ void NEGenerateProposalsLayer::run() _flatten_deltas.run(); _flatten_scores.run(); - if(_is_qasymm8) + if (_is_qasymm8) { _dequantize_anchors.run(); _dequantize_deltas.run(); @@ -348,7 +391,7 @@ void NEGenerateProposalsLayer::run() // Build the boxes _bounding_box.run(); - if(_is_qasymm8) + if (_is_qasymm8) { _quantize_all_proposals.run(); } diff --git a/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp b/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp index 822dcf491c..78218cbdee 100644 --- a/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp +++ b/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h" @@ -34,7 +35,13 @@ namespace arm_compute NEInstanceNormalizationLayer::~NEInstanceNormalizationLayer() = default; NEInstanceNormalizationLayer::NEInstanceNormalizationLayer(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _normalization_kernel(), _is_nchw(false), _permute_input(), _permute_output(), _permuted_input(), _permuted_output() + : _memory_group(std::move(memory_manager)), + _normalization_kernel(), + _is_nchw(false), + _permute_input(), + _permute_output(), + _permuted_input(), + _permuted_output() { } @@ -43,14 +50,14 @@ void NEInstanceNormalizationLayer::configure(ITensor *input, ITensor *output, fl ARM_COMPUTE_LOG_PARAMS(input, output, gamma, beta, epsilon); const DataLayout data_layout = input->info()->data_layout(); - const auto kernel_descriptor = InstanceNormalizationLayerKernelInfo{ gamma, beta, epsilon, true }; + const auto kernel_descriptor = InstanceNormalizationLayerKernelInfo{gamma, beta, epsilon, true}; // Configure Kernels _is_nchw = data_layout == DataLayout::NCHW; _normalization_kernel = std::make_unique(); - if(!_is_nchw) + if (!_is_nchw) { _memory_group.manage(&_permuted_input); _memory_group.manage(&_permuted_output); @@ -72,11 +79,12 @@ void NEInstanceNormalizationLayer::configure(ITensor *input, ITensor *output, fl } } -Status NEInstanceNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float gamma, float beta, float epsilon) +Status NEInstanceNormalizationLayer::validate( + const ITensorInfo *input, const ITensorInfo *output, float gamma, float beta, float epsilon) { - return NEInstanceNormalizationLayerKernel::validate(&input->clone()->set_data_layout(DataLayout::NCHW), - &output->clone()->set_data_layout(DataLayout::NCHW), - InstanceNormalizationLayerKernelInfo{ gamma, beta, epsilon, true }); + return NEInstanceNormalizationLayerKernel::validate( + &input->clone()->set_data_layout(DataLayout::NCHW), &output->clone()->set_data_layout(DataLayout::NCHW), + InstanceNormalizationLayerKernelInfo{gamma, beta, epsilon, true}); } void NEInstanceNormalizationLayer::run() @@ -84,7 +92,7 @@ void NEInstanceNormalizationLayer::run() MemoryGroupResourceScope scope_mg(_memory_group); // Permute input - if(!_is_nchw) + if (!_is_nchw) { _permute_input.run(); } @@ -92,7 +100,7 @@ void NEInstanceNormalizationLayer::run() NEScheduler::get().schedule(_normalization_kernel.get(), Window::DimZ); // Permute output - if(!_is_nchw) + if (!_is_nchw) { _permute_output.run(); } diff --git a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp index c3ecfb430f..b7f6203efd 100644 --- a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp +++ b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEL2NormalizeLayerKernel.h" #include "src/core/NEON/kernels/NEReductionOperationKernel.h" @@ -69,7 +70,8 @@ Status NEL2NormalizeLayer::validate(const ITensorInfo *input, const ITensorInfo sum_sq.set_tensor_shape(shape); const uint32_t actual_axis = wrap_around(axis, max_input_tensor_dim); - ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperation::validate(input, &sum_sq, actual_axis, ReductionOperation::SUM_SQUARE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEReductionOperation::validate(input, &sum_sq, actual_axis, ReductionOperation::SUM_SQUARE)); // Reduce shape on axis shape.set(actual_axis, 1); diff --git a/src/runtime/NEON/functions/NELSTMLayer.cpp b/src/runtime/NEON/functions/NELSTMLayer.cpp index 428cdf8c04..1a08cdeb06 100644 --- a/src/runtime/NEON/functions/NELSTMLayer.cpp +++ b/src/runtime/NEON/functions/NELSTMLayer.cpp @@ -24,11 +24,12 @@ #include "arm_compute/runtime/NEON/functions/NELSTMLayer.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/InfoHelpers.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/common/LSTMParams.h" + #include "src/common/utils/Log.h" namespace arm_compute @@ -39,42 +40,122 @@ using namespace arm_compute::utils::info_helpers; NELSTMLayer::~NELSTMLayer() = default; NELSTMLayer::NELSTMLayer(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _fully_connected_input_gate(), _accum_input_gate1(), _subtract_input_gate(), _pixelwise_mul_input_gate(), _activation_input_gate(), - _fully_connected_forget_gate(), _accum_forget_gate1(), _pixelwise_mul_forget_gate(), _activation_forget_gate(), _fully_connected_cell_state(), _gemm_cell_state1(), _transpose_cell_state(), - _accum_cell_state1(), _accum_cell_state2(), _pixelwise_mul_cell_state1(), _activation_cell_state(), _cell_clip(), _pixelwise_mul_cell_state2(), _fully_connected_output(), - _pixelwise_mul_output_state1(), _accum_output1(), _activation_output(), _activation_output_state(), _pixelwise_mul_output_state2(), _fully_connected_output_state(), _projection_clip(), - _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _concat_inputs_forget_gate(), _concat_weights_forget_gate(), _concat_weights_input_gate(), _concat_weights_output(), - _mean_std_norm_input_gate(), _pixelwise_mul_input_gate_coeff(), _accum_input_gate_bias(), _mean_std_norm_forget_gate(), _pixelwise_mul_forget_gate_coeff(), _accum_forget_gate_bias(), - _mean_std_norm_cell_gate(), _pixelwise_mul_cell_gate_coeff(), _accum_cell_gate_bias(), _mean_std_norm_output_gate(), _pixelwise_mul_output_gate_coeff(), _accum_output_gate_bias(), _input_gate_out1(), - _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(), _forget_gate_out5(), _forget_gate_out6(), - _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(), _cell_state_out5(), _output1(), _output2(), _output3(), _output4(), _cell_state_activation(), _output_state1(), _ones(), - _input_layer_norm_out1(), _input_layer_norm_out2(), _forget_layer_norm_out1(), _forget_layer_norm_out2(), _cell_layer_norm_out1(), _cell_layer_norm_out2(), _output_layer_norm_out1(), - _output_layer_norm_out2(), _run_peephole_opt(false), _run_cifg_opt(false), _perform_cell_clipping(false), _has_projection_weights(false), _perform_projection_clipping(false), _is_prepared(false), + : _memory_group(std::move(memory_manager)), + _fully_connected_input_gate(), + _accum_input_gate1(), + _subtract_input_gate(), + _pixelwise_mul_input_gate(), + _activation_input_gate(), + _fully_connected_forget_gate(), + _accum_forget_gate1(), + _pixelwise_mul_forget_gate(), + _activation_forget_gate(), + _fully_connected_cell_state(), + _gemm_cell_state1(), + _transpose_cell_state(), + _accum_cell_state1(), + _accum_cell_state2(), + _pixelwise_mul_cell_state1(), + _activation_cell_state(), + _cell_clip(), + _pixelwise_mul_cell_state2(), + _fully_connected_output(), + _pixelwise_mul_output_state1(), + _accum_output1(), + _activation_output(), + _activation_output_state(), + _pixelwise_mul_output_state2(), + _fully_connected_output_state(), + _projection_clip(), + _copy_cell_state(), + _copy_output(), + _concat_scratch_buffer(), + _concat_inputs_forget_gate(), + _concat_weights_forget_gate(), + _concat_weights_input_gate(), + _concat_weights_output(), + _mean_std_norm_input_gate(), + _pixelwise_mul_input_gate_coeff(), + _accum_input_gate_bias(), + _mean_std_norm_forget_gate(), + _pixelwise_mul_forget_gate_coeff(), + _accum_forget_gate_bias(), + _mean_std_norm_cell_gate(), + _pixelwise_mul_cell_gate_coeff(), + _accum_cell_gate_bias(), + _mean_std_norm_output_gate(), + _pixelwise_mul_output_gate_coeff(), + _accum_output_gate_bias(), + _input_gate_out1(), + _input_gate_out2(), + _input_gate_out3(), + _input_gate_out4(), + _forget_gate_out1(), + _forget_gate_out2(), + _forget_gate_out3(), + _forget_gate_out4(), + _forget_gate_out5(), + _forget_gate_out6(), + _cell_state_out1(), + _cell_state_out2(), + _cell_state_out3(), + _cell_state_out4(), + _cell_state_out5(), + _output1(), + _output2(), + _output3(), + _output4(), + _cell_state_activation(), + _output_state1(), + _ones(), + _input_layer_norm_out1(), + _input_layer_norm_out2(), + _forget_layer_norm_out1(), + _forget_layer_norm_out2(), + _cell_layer_norm_out1(), + _cell_layer_norm_out2(), + _output_layer_norm_out1(), + _output_layer_norm_out2(), + _run_peephole_opt(false), + _run_cifg_opt(false), + _perform_cell_clipping(false), + _has_projection_weights(false), + _perform_projection_clipping(false), + _is_prepared(false), _is_layer_norm_lstm(false) { } -void NELSTMLayer::configure(const ITensor *input, - const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights, - const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights, - const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias, - const ITensor *output_state_in, const ITensor *cell_state_in, - ITensor *scratch_buffer, ITensor *output_state_out, ITensor *cell_state_out, ITensor *output, - const LSTMParams &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold) +void NELSTMLayer::configure(const ITensor *input, + const ITensor *input_to_forget_weights, + const ITensor *input_to_cell_weights, + const ITensor *input_to_output_weights, + const ITensor *recurrent_to_forget_weights, + const ITensor *recurrent_to_cell_weights, + const ITensor *recurrent_to_output_weights, + const ITensor *forget_gate_bias, + const ITensor *cell_bias, + const ITensor *output_gate_bias, + const ITensor *output_state_in, + const ITensor *cell_state_in, + ITensor *scratch_buffer, + ITensor *output_state_out, + ITensor *cell_state_out, + ITensor *output, + const LSTMParams &lstm_params, + const ActivationLayerInfo &activation_info, + float cell_threshold, + float projection_threshold) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, - input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - forget_gate_bias, cell_bias, output_gate_bias, - output_state_in, cell_state_in, + forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output); - ARM_COMPUTE_LOG_PARAMS(input, - input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + ARM_COMPUTE_LOG_PARAMS(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - forget_gate_bias, cell_bias, output_gate_bias, - output_state_in, cell_state_in, - scratch_buffer, output_state_out, cell_state_out, output, - lstm_params, activation_info, cell_threshold, projection_threshold); + forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in, + scratch_buffer, output_state_out, cell_state_out, output, lstm_params, activation_info, + cell_threshold, projection_threshold); _is_layer_norm_lstm = lstm_params.use_layer_norm(); @@ -83,13 +164,12 @@ void NELSTMLayer::configure(const ITensor *input, build_lstm_params_tensor_info(lstm_params, &lstm_params_info); // Validate - ARM_COMPUTE_ERROR_THROW_ON(NELSTMLayer::validate(input->info(), input_to_forget_weights->info(), - input_to_cell_weights->info(), input_to_output_weights->info(), - recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), - forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), - output_state_in->info(), cell_state_in->info(), - scratch_buffer->info(), output_state_out->info(), cell_state_out->info(), output->info(), - lstm_params_info, activation_info, cell_threshold, projection_threshold)); + ARM_COMPUTE_ERROR_THROW_ON(NELSTMLayer::validate( + input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(), + recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), + forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), output_state_in->info(), + cell_state_in->info(), scratch_buffer->info(), output_state_out->info(), cell_state_out->info(), output->info(), + lstm_params_info, activation_info, cell_threshold, projection_threshold)); const TensorShape cell_state_shape = cell_state_in->info()->tensor_shape(); @@ -116,20 +196,23 @@ void NELSTMLayer::configure(const ITensor *input, _concat_weights_forget_gate.configure(weights_vector, &_forget_gate_out6, Window::DimX); _memory_group.manage(&_forget_gate_out5); - _fully_connected_forget_gate.configure(&_forget_gate_out2, &_forget_gate_out6, (_is_layer_norm_lstm) ? nullptr : forget_gate_bias, &_forget_gate_out5); + _fully_connected_forget_gate.configure(&_forget_gate_out2, &_forget_gate_out6, + (_is_layer_norm_lstm) ? nullptr : forget_gate_bias, &_forget_gate_out5); _memory_group.manage(&_forget_gate_out1); _memory_group.manage(&_forget_gate_out3); _forget_gate_out6.allocator()->allocate(); Tensor *forget_gate_out = &_forget_gate_out5; - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { _forget_gate_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _run_peephole_opt = true; _memory_group.manage(&_forget_gate_out4); - _pixelwise_mul_forget_gate.configure(cell_state_in, lstm_params.cell_to_forget_weights(), &_forget_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - _accum_forget_gate1.configure(&_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3, ConvertPolicy::SATURATE); + _pixelwise_mul_forget_gate.configure(cell_state_in, lstm_params.cell_to_forget_weights(), &_forget_gate_out4, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _accum_forget_gate1.configure(&_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3, + ConvertPolicy::SATURATE); _forget_gate_out4.allocator()->allocate(); _forget_gate_out5.allocator()->allocate(); forget_gate_out = &_forget_gate_out3; @@ -138,21 +221,25 @@ void NELSTMLayer::configure(const ITensor *input, { _forget_gate_out3.allocator()->allocate(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _forget_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _forget_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_forget_layer_norm_out1); _memory_group.manage(&_forget_layer_norm_out2); _mean_std_norm_forget_gate.configure(forget_gate_out); - _pixelwise_mul_forget_gate_coeff.configure(forget_gate_out, lstm_params.forget_layer_norm_weights(), &_forget_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_forget_gate_coeff.configure(forget_gate_out, lstm_params.forget_layer_norm_weights(), + &_forget_layer_norm_out1, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); // forget_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before forget_gate_out->allocator()->allocate(); - _accum_forget_gate_bias.configure(&_forget_layer_norm_out1, forget_gate_bias, &_forget_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_forget_gate_bias.configure(&_forget_layer_norm_out1, forget_gate_bias, &_forget_layer_norm_out2, + ConvertPolicy::SATURATE); _forget_layer_norm_out1.allocator()->allocate(); forget_gate_out = &_forget_layer_norm_out2; } - _activation_forget_gate.configure(forget_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _activation_forget_gate.configure(forget_gate_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); // Configure block that calculates the input gate // input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG @@ -161,7 +248,7 @@ void NELSTMLayer::configure(const ITensor *input, // input_gate = Activation((input,output_state) * (input_to_input_weights,recurrent_to_input_weights) + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG _input_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); Tensor *input_gate_out = &_input_gate_out1; - if(lstm_params.has_cifg_opt()) + if (lstm_params.has_cifg_opt()) { _memory_group.manage(&_input_gate_out1); _ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); @@ -183,15 +270,19 @@ void NELSTMLayer::configure(const ITensor *input, _memory_group.manage(&_input_gate_out1); _memory_group.manage(&_input_gate_out4); - _fully_connected_input_gate.configure(&_forget_gate_out2, &_input_gate_out2, (_is_layer_norm_lstm) ? nullptr : lstm_params.input_gate_bias(), &_input_gate_out3); + _fully_connected_input_gate.configure(&_forget_gate_out2, &_input_gate_out2, + (_is_layer_norm_lstm) ? nullptr : lstm_params.input_gate_bias(), + &_input_gate_out3); _input_gate_out2.allocator()->allocate(); input_gate_out = &_input_gate_out3; - if(_run_peephole_opt) + if (_run_peephole_opt) { _memory_group.manage(&_input_gate_out4); - _pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - _accum_input_gate1.configure(&_input_gate_out3, &_input_gate_out4, &_input_gate_out1, ConvertPolicy::SATURATE); + _pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out4, + 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _accum_input_gate1.configure(&_input_gate_out3, &_input_gate_out4, &_input_gate_out1, + ConvertPolicy::SATURATE); _input_gate_out3.allocator()->allocate(); _input_gate_out4.allocator()->allocate(); input_gate_out = &_input_gate_out1; @@ -201,21 +292,25 @@ void NELSTMLayer::configure(const ITensor *input, _input_gate_out1.allocator()->allocate(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _input_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _input_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_input_layer_norm_out1); _memory_group.manage(&_input_layer_norm_out2); _mean_std_norm_input_gate.configure(input_gate_out); - _pixelwise_mul_input_gate_coeff.configure(input_gate_out, lstm_params.input_layer_norm_weights(), &_input_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_input_gate_coeff.configure(input_gate_out, lstm_params.input_layer_norm_weights(), + &_input_layer_norm_out1, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); // input_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before input_gate_out->allocator()->allocate(); - _accum_input_gate_bias.configure(&_input_layer_norm_out1, lstm_params.input_gate_bias(), &_input_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_input_gate_bias.configure(&_input_layer_norm_out1, lstm_params.input_gate_bias(), + &_input_layer_norm_out2, ConvertPolicy::SATURATE); _input_layer_norm_out1.allocator()->allocate(); input_gate_out = &_input_layer_norm_out2; } - _activation_input_gate.configure(input_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _activation_input_gate.configure(input_gate_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); } // Configure block that calculates the cell state @@ -228,7 +323,8 @@ void NELSTMLayer::configure(const ITensor *input, _cell_state_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_cell_state_out1); - _fully_connected_cell_state.configure(input, input_to_cell_weights, (_is_layer_norm_lstm) ? nullptr : cell_bias, &_cell_state_out1); + _fully_connected_cell_state.configure(input, input_to_cell_weights, (_is_layer_norm_lstm) ? nullptr : cell_bias, + &_cell_state_out1); _memory_group.manage(&_cell_state_out2); _transpose_cell_state.configure(recurrent_to_cell_weights, &_cell_state_out2); _memory_group.manage(&_cell_state_out3); @@ -237,33 +333,40 @@ void NELSTMLayer::configure(const ITensor *input, _memory_group.manage(&_cell_state_out4); _accum_cell_state1.configure(&_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE); Tensor *cell_state_out_ptr = &_cell_state_out4; - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _cell_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _cell_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_cell_layer_norm_out1); _memory_group.manage(&_cell_layer_norm_out2); _mean_std_norm_cell_gate.configure(cell_state_out_ptr); - _pixelwise_mul_cell_gate_coeff.configure(cell_state_out_ptr, lstm_params.cell_layer_norm_weights(), &_cell_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_cell_gate_coeff.configure(cell_state_out_ptr, lstm_params.cell_layer_norm_weights(), + &_cell_layer_norm_out1, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); // cell_state_out_ptr is going to be reassigned, so allocate the tensor that it was assigned to before cell_state_out_ptr->allocator()->allocate(); - _accum_cell_gate_bias.configure(&_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_cell_gate_bias.configure(&_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2, + ConvertPolicy::SATURATE); _cell_layer_norm_out1.allocator()->allocate(); cell_state_out_ptr = &_cell_layer_norm_out2; } _activation_cell_state.configure(cell_state_out_ptr, nullptr, activation_info); _memory_group.manage(&_cell_state_out5); - _pixelwise_mul_cell_state1.configure(cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_cell_state1.configure(cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); cell_state_out_ptr->allocator()->allocate(); - _pixelwise_mul_cell_state2.configure(forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_cell_state2.configure(forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); _accum_cell_state2.configure(&_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE); _cell_state_out3.allocator()->allocate(); _cell_state_out5.allocator()->allocate(); // Perform clipping - if(cell_threshold != 0.f) + if (cell_threshold != 0.f) { _perform_cell_clipping = true; - _cell_clip.configure(&_cell_state_out1, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, cell_threshold, -cell_threshold)); + _cell_clip.configure(&_cell_state_out1, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + cell_threshold, -cell_threshold)); } // Configure block that calculates the output @@ -281,18 +384,20 @@ void NELSTMLayer::configure(const ITensor *input, _memory_group.manage(&_output1); _memory_group.manage(&_output4); - _fully_connected_output.configure(&_forget_gate_out2, &_output2, (_is_layer_norm_lstm) ? nullptr : output_gate_bias, &_output4); + _fully_connected_output.configure(&_forget_gate_out2, &_output2, (_is_layer_norm_lstm) ? nullptr : output_gate_bias, + &_output4); _output2.allocator()->allocate(); _forget_gate_out2.allocator()->allocate(); Tensor *output_gate_out = &_output4; - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { _output3.allocator()->init(TensorInfo(_cell_state_out1.info()->tensor_shape(), 1, input->info()->data_type())); _memory_group.manage(&_output3); - _pixelwise_mul_output_state1.configure(&_cell_state_out1, lstm_params.cell_to_output_weights(), &_output3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_output_state1.configure(&_cell_state_out1, lstm_params.cell_to_output_weights(), &_output3, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); _accum_output1.configure(&_output4, &_output3, &_output1, ConvertPolicy::SATURATE); _output4.allocator()->allocate(); output_gate_out = &_output1; @@ -304,21 +409,25 @@ void NELSTMLayer::configure(const ITensor *input, { _output1.allocator()->allocate(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _output_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _output_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _memory_group.manage(&_output_layer_norm_out1); _memory_group.manage(&_output_layer_norm_out2); _mean_std_norm_output_gate.configure(output_gate_out); - _pixelwise_mul_output_gate_coeff.configure(output_gate_out, lstm_params.output_layer_norm_weights(), &_output_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_output_gate_coeff.configure(output_gate_out, lstm_params.output_layer_norm_weights(), + &_output_layer_norm_out1, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); // output_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before output_gate_out->allocator()->allocate(); - _accum_output_gate_bias.configure(&_output_layer_norm_out1, output_gate_bias, &_output_layer_norm_out2, ConvertPolicy::SATURATE); + _accum_output_gate_bias.configure(&_output_layer_norm_out1, output_gate_bias, &_output_layer_norm_out2, + ConvertPolicy::SATURATE); _output_layer_norm_out1.allocator()->allocate(); output_gate_out = &_output_layer_norm_out2; } - _activation_output.configure(output_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _activation_output.configure(output_gate_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); // Configure block that calculates the output state /** lstm_res = PixelwiseMul(output, Activation(cell_state)) @@ -335,20 +444,24 @@ void NELSTMLayer::configure(const ITensor *input, _memory_group.manage(&_cell_state_activation); _activation_output_state.configure(&_cell_state_out1, &_cell_state_activation, activation_info); - _pixelwise_mul_output_state2.configure(&_cell_state_activation, output_gate_out, output_state_out_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_output_state2.configure(&_cell_state_activation, output_gate_out, output_state_out_tmp, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); _cell_state_activation.allocator()->allocate(); output_gate_out->allocator()->allocate(); - if(lstm_params.has_projection()) + if (lstm_params.has_projection()) { _has_projection_weights = true; - _fully_connected_output_state.configure(output_state_out_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out); + _fully_connected_output_state.configure(output_state_out_tmp, lstm_params.projection_weights(), + lstm_params.projection_bias(), output_state_out); _output_state1.allocator()->allocate(); // Perform clipping - if(projection_threshold != 0.f) + if (projection_threshold != 0.f) { _perform_projection_clipping = true; - _projection_clip.configure(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold)); + _projection_clip.configure(output_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -projection_threshold, projection_threshold)); } } @@ -358,7 +471,7 @@ void NELSTMLayer::configure(const ITensor *input, // Vector for holding the tensors to store in scratch buffer std::vector scratch_inputs; - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { scratch_inputs.emplace_back(input_gate_out); } @@ -372,29 +485,38 @@ void NELSTMLayer::configure(const ITensor *input, output_gate_out->allocator()->allocate(); } -Status NELSTMLayer::validate(const ITensorInfo *input, - const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights, - const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights, - const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias, - const ITensorInfo *output_state_in, const ITensorInfo *cell_state_in, - const ITensorInfo *scratch_buffer, const ITensorInfo *output_state_out, const ITensorInfo *cell_state_out, const ITensorInfo *output, - const LSTMParams &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold) +Status NELSTMLayer::validate(const ITensorInfo *input, + const ITensorInfo *input_to_forget_weights, + const ITensorInfo *input_to_cell_weights, + const ITensorInfo *input_to_output_weights, + const ITensorInfo *recurrent_to_forget_weights, + const ITensorInfo *recurrent_to_cell_weights, + const ITensorInfo *recurrent_to_output_weights, + const ITensorInfo *forget_gate_bias, + const ITensorInfo *cell_bias, + const ITensorInfo *output_gate_bias, + const ITensorInfo *output_state_in, + const ITensorInfo *cell_state_in, + const ITensorInfo *scratch_buffer, + const ITensorInfo *output_state_out, + const ITensorInfo *cell_state_out, + const ITensorInfo *output, + const LSTMParams &lstm_params, + const ActivationLayerInfo &activation_info, + float cell_threshold, + float projection_threshold) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, - input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - forget_gate_bias, cell_bias, output_gate_bias, - output_state_in, cell_state_in, - scratch_buffer, output_state_out, cell_state_out, output); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR( + input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, + output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output); // Check data types ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, - input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - forget_gate_bias, cell_bias, output_gate_bias, - output_state_in, cell_state_in, - scratch_buffer, output_state_out, cell_state_out, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES( + input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, + output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output); // Check dimensions ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2); @@ -413,16 +535,16 @@ Status NELSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON(output_state_out->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(cell_state_out->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 2); - ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0) - && cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0) && + cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0)); const unsigned int num_batches = input->dimension(1); const unsigned int num_cells = input_to_output_weights->dimension(1); - if(lstm_params.use_layer_norm()) + if (lstm_params.use_layer_norm()) { // If CIFG is used, input layer normalization weights tensor is omitted - if(lstm_params.has_cifg_opt()) + if (lstm_params.has_cifg_opt()) { ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_layer_norm_weights() != nullptr); } @@ -434,8 +556,12 @@ Status NELSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.input_layer_norm_weights()); } - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(), lstm_params.cell_layer_norm_weights(), lstm_params.output_layer_norm_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.forget_layer_norm_weights(), lstm_params.cell_layer_norm_weights(), lstm_params.output_layer_norm_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(), + lstm_params.cell_layer_norm_weights(), + lstm_params.output_layer_norm_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.forget_layer_norm_weights(), + lstm_params.cell_layer_norm_weights(), + lstm_params.output_layer_norm_weights()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_layer_norm_weights()->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_layer_norm_weights()->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_layer_norm_weights()->num_dimensions() > 1); @@ -445,7 +571,7 @@ Status NELSTMLayer::validate(const ITensorInfo *input, } // Check peephole optimization - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_output_weights(), lstm_params.cell_to_forget_weights()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() > 1); @@ -465,33 +591,39 @@ Status NELSTMLayer::validate(const ITensorInfo *input, std::vector inputs_vector; inputs_vector.emplace_back(input); inputs_vector.emplace_back(output_state_in); - const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0); + const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0); TensorInfo forget_gate_concat = TensorInfo(concat_shape, 1, input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(inputs_vector, &forget_gate_concat, Window::DimX)); // Validate forget gate - ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_forget_weights, (lstm_params.use_layer_norm()) ? nullptr : forget_gate_bias, &forget_gate)); + ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate( + input, input_to_forget_weights, (lstm_params.use_layer_norm()) ? nullptr : forget_gate_bias, &forget_gate)); - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); } - if(lstm_params.use_layer_norm()) + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&forget_gate)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE)); } - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate( + &forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // Validate input gate - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), - lstm_params.recurrent_to_input_weights(), - lstm_params.input_gate_bias()); + lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_to_input_weights()->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.recurrent_to_input_weights()->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_gate_bias()->num_dimensions() > 1); @@ -499,88 +631,120 @@ Status NELSTMLayer::validate(const ITensorInfo *input, std::vector lstm_weights; lstm_weights.emplace_back(lstm_params.input_to_input_weights()); lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights()); - TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0); - TensorInfo lstm_gate_concat = TensorInfo(lstm_weights_concat_shape, 1, input->data_type()); + TensorShape lstm_weights_concat_shape = + arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0); + TensorInfo lstm_gate_concat = TensorInfo(lstm_weights_concat_shape, 1, input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(lstm_weights, &lstm_gate_concat, Window::DimX)); - ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, lstm_params.input_to_input_weights(), (lstm_params.use_layer_norm()) ? nullptr : lstm_params.input_gate_bias(), &input_gate)); + ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate( + input, lstm_params.input_to_input_weights(), + (lstm_params.use_layer_norm()) ? nullptr : lstm_params.input_gate_bias(), &input_gate)); - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_input_weights()->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE)); } - if(lstm_params.use_layer_norm()) + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&input_gate)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(), &input_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(), + &input_gate, ConvertPolicy::SATURATE)); } - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate( + &input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); } else { - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticSubtraction::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticSubtraction::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); } // Validate cell state - ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_cell_weights, (lstm_params.use_layer_norm()) ? nullptr : cell_bias, &cell_state_tmp)); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &cell_state_tmp, 1.f, 0.f, GEMMInfo())); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); - if(lstm_params.use_layer_norm()) + ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate( + input, input_to_cell_weights, (lstm_params.use_layer_norm()) ? nullptr : cell_bias, &cell_state_tmp)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &cell_state_tmp, 1.f, 0.f, GEMMInfo())); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&cell_state_tmp)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp, + 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE)); } ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_state_tmp, nullptr, activation_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &input_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &forget_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); - if(cell_threshold != 0.f) + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &input_gate, &cell_state_tmp, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &forget_gate, &cell_state_tmp, 1, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE)); + if (cell_threshold != 0.f) { - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_state_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, cell_threshold, - -cell_threshold))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&cell_state_tmp, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + cell_threshold, -cell_threshold))); } // Validate output gate tmp std::vector in_out_weights; in_out_weights.emplace_back(input_to_output_weights); in_out_weights.emplace_back(recurrent_to_output_weights); - TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0); - TensorInfo in_out_gate_concat = TensorInfo(in_out_weights_concat_shape, 1, input->data_type()); + TensorShape in_out_weights_concat_shape = + arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0); + TensorInfo in_out_gate_concat = TensorInfo(in_out_weights_concat_shape, 1, input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(in_out_weights, &in_out_gate_concat, Window::DimX)); - ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_output_weights, (lstm_params.use_layer_norm()) ? nullptr : output_gate_bias, &output_gate_tmp)); + ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate( + input, input_to_output_weights, (lstm_params.use_layer_norm()) ? nullptr : output_gate_bias, &output_gate_tmp)); - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, + 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, + ConvertPolicy::SATURATE)); } - if(lstm_params.use_layer_norm()) + if (lstm_params.use_layer_norm()) { ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&output_gate_tmp)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&output_gate_tmp, lstm_params.output_layer_norm_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, output_gate_bias, &output_gate_tmp, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(&output_gate_tmp, lstm_params.output_layer_norm_weights(), + &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, output_gate_bias, &output_gate_tmp, + ConvertPolicy::SATURATE)); } - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate( + &output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // Validate output state ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_state_tmp, &cell_state_tmp, activation_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &output_gate_tmp, &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - if(lstm_params.has_projection()) + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate( + &cell_state_tmp, &output_gate_tmp, &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + if (lstm_params.has_projection()) { - ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(&output_gate_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out)); - if(projection_threshold != 0.f) + ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(&output_gate_tmp, lstm_params.projection_weights(), + lstm_params.projection_bias(), output_state_out)); + if (projection_threshold != 0.f) { - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output_state_out, output_state_out, - ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold))); + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate( + output_state_out, output_state_out, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, + projection_threshold))); } } @@ -590,7 +754,7 @@ Status NELSTMLayer::validate(const ITensorInfo *input, // Validate scratch concatenation std::vector inputs_vector_info_raw; - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { inputs_vector_info_raw.push_back(&input_gate); } @@ -611,12 +775,12 @@ void NELSTMLayer::run() _concat_inputs_forget_gate.run(); _fully_connected_forget_gate.run(); - if(_run_peephole_opt) + if (_run_peephole_opt) { _pixelwise_mul_forget_gate.run(); _accum_forget_gate1.run(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _mean_std_norm_forget_gate.run(); _pixelwise_mul_forget_gate_coeff.run(); @@ -624,15 +788,17 @@ void NELSTMLayer::run() } _activation_forget_gate.run(); - if(_run_cifg_opt) + if (_run_cifg_opt) { - if(_ones.info()->data_type() == DataType::F16) + if (_ones.info()->data_type() == DataType::F16) { - std::fill_n(reinterpret_cast(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 1); + std::fill_n(reinterpret_cast(_ones.buffer()), + _ones.info()->total_size() / _ones.info()->element_size(), 1); } else { - std::fill_n(reinterpret_cast(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 1); + std::fill_n(reinterpret_cast(_ones.buffer()), + _ones.info()->total_size() / _ones.info()->element_size(), 1); } _subtract_input_gate.run(); } @@ -640,13 +806,13 @@ void NELSTMLayer::run() { _fully_connected_input_gate.run(); - if(_run_peephole_opt) + if (_run_peephole_opt) { _pixelwise_mul_input_gate.run(); _accum_input_gate1.run(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _mean_std_norm_input_gate.run(); _pixelwise_mul_input_gate_coeff.run(); @@ -659,7 +825,7 @@ void NELSTMLayer::run() _transpose_cell_state.run(); _gemm_cell_state1.run(); _accum_cell_state1.run(); - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _mean_std_norm_cell_gate.run(); _pixelwise_mul_cell_gate_coeff.run(); @@ -671,18 +837,18 @@ void NELSTMLayer::run() _pixelwise_mul_cell_state2.run(); _accum_cell_state2.run(); - if(_perform_cell_clipping) + if (_perform_cell_clipping) { _cell_clip.run(); } _fully_connected_output.run(); - if(_run_peephole_opt) + if (_run_peephole_opt) { _pixelwise_mul_output_state1.run(); _accum_output1.run(); } - if(_is_layer_norm_lstm) + if (_is_layer_norm_lstm) { _mean_std_norm_output_gate.run(); _pixelwise_mul_output_gate_coeff.run(); @@ -693,10 +859,10 @@ void NELSTMLayer::run() _activation_output_state.run(); _pixelwise_mul_output_state2.run(); - if(_has_projection_weights) + if (_has_projection_weights) { _fully_connected_output_state.run(); - if(_perform_projection_clipping) + if (_perform_projection_clipping) { _projection_clip.run(); } @@ -710,10 +876,10 @@ void NELSTMLayer::run() void NELSTMLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { _concat_weights_forget_gate.run(); - if(!_run_cifg_opt) + if (!_run_cifg_opt) { _concat_weights_input_gate.run(); } diff --git a/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp b/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp index cfdeb000e0..41f9c3d700 100644 --- a/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp +++ b/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp @@ -24,8 +24,9 @@ #include "arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" + #include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" @@ -46,36 +47,104 @@ const QuantizationInfo qsymm_0(1.f / 32768.f, 0); // qsymm16 with 0 integer bit NELSTMLayerQuantized::~NELSTMLayerQuantized() = default; NELSTMLayerQuantized::NELSTMLayerQuantized(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _gemmlowp(), _output_stage(), _transpose_weights(), _concat_input_weights(), _concat_recurrent_weights(), _concat_weights(), _concat_inputs(), - _concat_bias(), _sigmoid_forget_gate(), _sigmoid_input_gate(), _sigmoid_output_gate(), _tanh_modulation_gate(), _tanh_output_state(), _add1(), _add2(), _mul1(), _mul2(), _mul3(), - _slice_input_tensor(), _slice_forget_tensor(), _slice_cell_tensor(), _slice_output_tensor(), _dequantize(), _quantize(), _input_to_input_weights(nullptr), _input_to_forget_weights(nullptr), - _input_to_cell_weights(nullptr), _input_to_output_weights(nullptr), _recurrent_to_input_weights(nullptr), _recurrent_to_forget_weights(nullptr), _recurrent_to_cell_weights(nullptr), - _recurrent_to_output_weights(nullptr), _input_gate_bias(nullptr), _forget_gate_bias(nullptr), _cell_bias(nullptr), _output_gate_bias(nullptr), _recurrent_weights(), _input_weights(), _weights(), - _input(), _weights_transposed(), _output_highp(), _output_lowp(), _bias(), _forget_gate_input(), _input_gate_input(), _output_gate_input(), _input_modulation_gate_input(), _forget_gate_output(), - _input_gate_output(), _output_gate_output(), _input_modulation_gate_output(), _cell_state1(), _cell_state2(), _output_state_tmp(), _output_state_out_symm(), _output_state_out_f32(), + : _memory_group(std::move(memory_manager)), + _gemmlowp(), + _output_stage(), + _transpose_weights(), + _concat_input_weights(), + _concat_recurrent_weights(), + _concat_weights(), + _concat_inputs(), + _concat_bias(), + _sigmoid_forget_gate(), + _sigmoid_input_gate(), + _sigmoid_output_gate(), + _tanh_modulation_gate(), + _tanh_output_state(), + _add1(), + _add2(), + _mul1(), + _mul2(), + _mul3(), + _slice_input_tensor(), + _slice_forget_tensor(), + _slice_cell_tensor(), + _slice_output_tensor(), + _dequantize(), + _quantize(), + _input_to_input_weights(nullptr), + _input_to_forget_weights(nullptr), + _input_to_cell_weights(nullptr), + _input_to_output_weights(nullptr), + _recurrent_to_input_weights(nullptr), + _recurrent_to_forget_weights(nullptr), + _recurrent_to_cell_weights(nullptr), + _recurrent_to_output_weights(nullptr), + _input_gate_bias(nullptr), + _forget_gate_bias(nullptr), + _cell_bias(nullptr), + _output_gate_bias(nullptr), + _recurrent_weights(), + _input_weights(), + _weights(), + _input(), + _weights_transposed(), + _output_highp(), + _output_lowp(), + _bias(), + _forget_gate_input(), + _input_gate_input(), + _output_gate_input(), + _input_modulation_gate_input(), + _forget_gate_output(), + _input_gate_output(), + _output_gate_output(), + _input_modulation_gate_output(), + _cell_state1(), + _cell_state2(), + _output_state_tmp(), + _output_state_out_symm(), + _output_state_out_f32(), _is_prepared(false) { } void NELSTMLayerQuantized::configure(const ITensor *input, - const ITensor *input_to_input_weights, const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights, - const ITensor *recurrent_to_input_weights, const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights, - const ITensor *input_gate_bias, const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias, - ITensor *cell_state_in, const ITensor *output_state_in, - ITensor *cell_state_out, ITensor *output_state_out) + const ITensor *input_to_input_weights, + const ITensor *input_to_forget_weights, + const ITensor *input_to_cell_weights, + const ITensor *input_to_output_weights, + const ITensor *recurrent_to_input_weights, + const ITensor *recurrent_to_forget_weights, + const ITensor *recurrent_to_cell_weights, + const ITensor *recurrent_to_output_weights, + const ITensor *input_gate_bias, + const ITensor *forget_gate_bias, + const ITensor *cell_bias, + const ITensor *output_gate_bias, + ITensor *cell_state_in, + const ITensor *output_state_in, + ITensor *cell_state_out, + ITensor *output_state_out) { - ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out); - - ARM_COMPUTE_ERROR_THROW_ON(NELSTMLayerQuantized::validate(input->info(), input_to_input_weights->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), - input_to_output_weights->info(), - recurrent_to_input_weights->info(), recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), - input_gate_bias->info(), forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info())); - - ARM_COMPUTE_LOG_PARAMS(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, + forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, + cell_state_out, output_state_out); + + ARM_COMPUTE_ERROR_THROW_ON(NELSTMLayerQuantized::validate( + input->info(), input_to_input_weights->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), + input_to_output_weights->info(), recurrent_to_input_weights->info(), recurrent_to_forget_weights->info(), + recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), input_gate_bias->info(), + forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(), + output_state_in->info(), cell_state_out->info(), output_state_out->info())); + + ARM_COMPUTE_LOG_PARAMS(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, + cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, + output_state_out); const int input_size = input->info()->dimension(0); const int batch_size = input->info()->dimension(1); @@ -83,8 +152,10 @@ void NELSTMLayerQuantized::configure(const ITensor *input, const QuantizationInfo qweights = input_to_input_weights->info()->quantization_info(); // Weights quantization - auto_init_if_empty(*cell_state_out->info(), TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QSYMM16, qsymm_4)); - auto_init_if_empty(*output_state_out->info(), TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QASYMM8, qasymm)); + auto_init_if_empty(*cell_state_out->info(), + TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QSYMM16, qsymm_4)); + auto_init_if_empty(*output_state_out->info(), + TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QASYMM8, qasymm)); _input_to_input_weights = input_to_input_weights; _input_to_forget_weights = input_to_forget_weights; @@ -100,34 +171,41 @@ void NELSTMLayerQuantized::configure(const ITensor *input, _output_gate_bias = output_gate_bias; // Weights concatenation - std::vector inputs_weights_vector{ input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights }; - std::vector recurrent_weights_vector{ recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights }; + std::vector inputs_weights_vector{input_to_input_weights, input_to_forget_weights, + input_to_cell_weights, input_to_output_weights}; + std::vector recurrent_weights_vector{recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights}; - _input_weights.allocator()->init(TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); + _input_weights.allocator()->init( + TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); _concat_input_weights.configure(inputs_weights_vector, &_input_weights, Window::DimY); - _recurrent_weights.allocator()->init(TensorInfo(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); + _recurrent_weights.allocator()->init( + TensorInfo(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); _concat_recurrent_weights.configure(recurrent_weights_vector, &_recurrent_weights, Window::DimY); - std::vector weights_vector{ &_recurrent_weights, &_input_weights }; - _weights.allocator()->init(TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); + std::vector weights_vector{&_recurrent_weights, &_input_weights}; + _weights.allocator()->init( + TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights)); _concat_weights.configure(weights_vector, &_weights, Window::DimX); _transpose_weights.configure(&_weights, &_weights_transposed); // Input concatenation - std::vector input_vector{ input, output_state_in }; + std::vector input_vector{input, output_state_in}; _memory_group.manage(&_input); - _input.allocator()->init(TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm)); + _input.allocator()->init( + TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm)); _concat_inputs.configure(input_vector, &_input, Window::DimX); // Bias concatenation - std::vector bias_vector{ input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias }; + std::vector bias_vector{input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias}; _bias.allocator()->init(TensorInfo(TensorShape(4 * output_size), 1, DataType::S32)); _concat_bias.configure(bias_vector, &_bias, Window::DimX); // Invert the offset for gemmlowp _input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, -qasymm.uniform().offset)); - _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset)); + _weights_transposed.info()->set_quantization_info( + QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset)); // Run gemmlowp _memory_group.manage(&_output_highp); @@ -137,7 +215,8 @@ void NELSTMLayerQuantized::configure(const ITensor *input, // Set the offset back _input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset)); - _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset)); + _weights_transposed.info()->set_quantization_info( + QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset)); // multiplier = (input_scale * weights_scale) / output_scale (2 ^ (-12)) _output_lowp.allocator()->init(TensorInfo(_output_highp.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_3)); @@ -159,64 +238,80 @@ void NELSTMLayerQuantized::configure(const ITensor *input, _bias.allocator()->allocate(); // Get the gate tensors - if(batch_size > 1) + if (batch_size > 1) { _memory_group.manage(&_input_gate_input); - _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, { 0, 0 }, { output_size, batch_size }); + _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, {0, 0}, {output_size, batch_size}); _memory_group.manage(&_forget_gate_input); - _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size }); + _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, {output_size, 0}, + {2 * output_size, batch_size}); _memory_group.manage(&_input_modulation_gate_input); - _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size }); + _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, {2 * output_size, 0}, + {3 * output_size, batch_size}); _memory_group.manage(&_output_gate_input); - _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size }); + _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, {3 * output_size, 0}, + {4 * output_size, batch_size}); _output_lowp.allocator()->allocate(); } else { _memory_group.manage(&_input_gate_input); - _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, { 0 }, { output_size }); + _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, {0}, {output_size}); _memory_group.manage(&_forget_gate_input); - _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, { output_size }, { 2 * output_size }); + _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, {output_size}, {2 * output_size}); _memory_group.manage(&_input_modulation_gate_input); - _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, { 2 * output_size }, { 3 * output_size }); + _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, {2 * output_size}, + {3 * output_size}); _memory_group.manage(&_output_gate_input); - _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, { 3 * output_size }, { 4 * output_size }); + _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, {3 * output_size}, {4 * output_size}); _output_lowp.allocator()->allocate(); } // Forget gate _memory_group.manage(&_forget_gate_output); - _forget_gate_output.allocator()->init(TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _sigmoid_forget_gate.configure(&_forget_gate_input, &_forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _forget_gate_output.allocator()->init( + TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _sigmoid_forget_gate.configure(&_forget_gate_input, &_forget_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); _forget_gate_input.allocator()->allocate(); // Input gate _memory_group.manage(&_input_gate_output); - _input_gate_output.allocator()->init(TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _sigmoid_input_gate.configure(&_input_gate_input, &_input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _input_gate_output.allocator()->init( + TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _sigmoid_input_gate.configure(&_input_gate_input, &_input_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); _input_gate_input.allocator()->allocate(); // Input modulation gate equation _memory_group.manage(&_input_modulation_gate_output); - _input_modulation_gate_output.allocator()->init(TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _tanh_modulation_gate.configure(&_input_modulation_gate_input, &_input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); + _input_modulation_gate_output.allocator()->init( + TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _tanh_modulation_gate.configure(&_input_modulation_gate_input, &_input_modulation_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); _input_modulation_gate_input.allocator()->allocate(); // Output gate _memory_group.manage(&_output_gate_output); - _output_gate_output.allocator()->init(TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _sigmoid_output_gate.configure(&_output_gate_input, &_output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _output_gate_output.allocator()->init( + TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _sigmoid_output_gate.configure(&_output_gate_input, &_output_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); _output_gate_input.allocator()->allocate(); // Long term memory _memory_group.manage(&_cell_state1); - _cell_state1.allocator()->init(TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); - _mul1.configure(&_forget_gate_output, cell_state_in, &_cell_state1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _cell_state1.allocator()->init( + TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); + _mul1.configure(&_forget_gate_output, cell_state_in, &_cell_state1, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); _forget_gate_output.allocator()->allocate(); _memory_group.manage(&_cell_state2); - _cell_state2.allocator()->init(TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); - _mul2.configure(&_input_gate_output, &_input_modulation_gate_output, &_cell_state2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _cell_state2.allocator()->init( + TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4)); + _mul2.configure(&_input_gate_output, &_input_modulation_gate_output, &_cell_state2, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); _input_modulation_gate_output.allocator()->allocate(); _input_gate_output.allocator()->allocate(); @@ -226,18 +321,23 @@ void NELSTMLayerQuantized::configure(const ITensor *input, // Short term memory _memory_group.manage(&_output_state_tmp); - _output_state_tmp.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _tanh_output_state.configure(cell_state_out, &_output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); + _output_state_tmp.allocator()->init( + TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _tanh_output_state.configure(cell_state_out, &_output_state_tmp, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)); _memory_group.manage(&_output_state_out_symm); - _output_state_out_symm.allocator()->init(TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); - _mul3.configure(&_output_state_tmp, &_output_gate_output, &_output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _output_state_out_symm.allocator()->init( + TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0)); + _mul3.configure(&_output_state_tmp, &_output_gate_output, &_output_state_out_symm, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); _output_gate_output.allocator()->allocate(); _output_state_tmp.allocator()->allocate(); // Requantize the output state from QSYMM16 to QASYMM8 _memory_group.manage(&_output_state_out_f32); - _output_state_out_f32.allocator()->init(TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32)); + _output_state_out_f32.allocator()->init( + TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32)); _dequantize.configure(&_output_state_out_symm, &_output_state_out_f32); _output_state_out_symm.allocator()->allocate(); @@ -246,15 +346,28 @@ void NELSTMLayerQuantized::configure(const ITensor *input, } Status NELSTMLayerQuantized::validate(const ITensorInfo *input, - const ITensorInfo *input_to_input_weights, const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights, - const ITensorInfo *recurrent_to_input_weights, const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights, - const ITensorInfo *input_gate_bias, const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias, - const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in, - const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out) + const ITensorInfo *input_to_input_weights, + const ITensorInfo *input_to_forget_weights, + const ITensorInfo *input_to_cell_weights, + const ITensorInfo *input_to_output_weights, + const ITensorInfo *recurrent_to_input_weights, + const ITensorInfo *recurrent_to_forget_weights, + const ITensorInfo *recurrent_to_cell_weights, + const ITensorInfo *recurrent_to_output_weights, + const ITensorInfo *input_gate_bias, + const ITensorInfo *forget_gate_bias, + const ITensorInfo *cell_bias, + const ITensorInfo *output_gate_bias, + const ITensorInfo *cell_state_in, + const ITensorInfo *output_state_in, + const ITensorInfo *cell_state_out, + const ITensorInfo *output_state_out) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, - output_state_in, cell_state_out, output_state_out); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR( + input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, + input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, + output_state_out); const int input_size = input->dimension(0); const int batch_size = input->dimension(1); @@ -266,29 +379,51 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON(input_gate_bias->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(output_state_in->num_dimensions() > 2); - TensorInfo input_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(input_size, output_size)).set_data_type(DataType::QASYMM8)); - TensorInfo recurrent_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(output_size, output_size)).set_data_type(DataType::QASYMM8)); - TensorInfo bias_info(input_gate_bias->clone()->set_tensor_shape(TensorShape(output_size)).set_data_type(DataType::S32)); - TensorInfo output_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QASYMM8).set_quantization_info(qasymm)); - TensorInfo cell_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QSYMM16).set_quantization_info(qsymm_4)); + TensorInfo input_weights_info(input_to_input_weights->clone() + ->set_tensor_shape(TensorShape(input_size, output_size)) + .set_data_type(DataType::QASYMM8)); + TensorInfo recurrent_weights_info(input_to_input_weights->clone() + ->set_tensor_shape(TensorShape(output_size, output_size)) + .set_data_type(DataType::QASYMM8)); + TensorInfo bias_info( + input_gate_bias->clone()->set_tensor_shape(TensorShape(output_size)).set_data_type(DataType::S32)); + TensorInfo output_state_info(cell_state_in->clone() + ->set_tensor_shape(TensorShape(output_size, batch_size)) + .set_data_type(DataType::QASYMM8) + .set_quantization_info(qasymm)); + TensorInfo cell_state_info(cell_state_in->clone() + ->set_tensor_shape(TensorShape(output_size, batch_size)) + .set_data_type(DataType::QSYMM16) + .set_quantization_info(qsymm_4)); // Shape checks - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input_weights_info, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&recurrent_weights_info, recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input_weights_info, input_to_input_weights, input_to_forget_weights, + input_to_cell_weights, input_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&recurrent_weights_info, recurrent_to_input_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, + output_gate_bias); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_in); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_in); // Data type checks - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input_weights_info, input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input_weights_info, input, input_to_input_weights, + input_to_forget_weights, input_to_cell_weights, + input_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, + output_gate_bias); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_in); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_in); // Quantization checks - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input_weights_info, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input_weights_info, input_to_input_weights, + input_to_forget_weights, input_to_cell_weights, + input_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(recurrent_to_input_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_in); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&output_state_info, output_state_in); @@ -310,7 +445,8 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input, recurrent_weights_vector.emplace_back(recurrent_to_cell_weights); recurrent_weights_vector.emplace_back(recurrent_to_output_weights); const TensorInfo recurrent_weights(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights); - ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(recurrent_weights_vector, &recurrent_weights, Window::DimY)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEConcatenateLayer::validate(recurrent_weights_vector, &recurrent_weights, Window::DimY)); // _concat_weights std::vector weights_vector; @@ -320,7 +456,7 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(weights_vector, &weights, Window::DimX)); // _transpose_weights const TensorShape weights_transposed_shape(weights.tensor_shape()[1], weights.tensor_shape()[0]); - TensorInfo weights_transposed = weights.clone()->set_is_resizable(true).set_tensor_shape(weights_transposed_shape); + TensorInfo weights_transposed = weights.clone()->set_is_resizable(true).set_tensor_shape(weights_transposed_shape); ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(&weights, &weights_transposed)); // _concat_inputs @@ -346,7 +482,8 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input, // _gemmlowp const TensorInfo output_highp(TensorShape(4 * output_size, batch_size), 1, DataType::S32); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate(&input_concatenated, &weights_transposed, nullptr, &output_highp)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEGEMMLowpMatrixMultiplyCore::validate(&input_concatenated, &weights_transposed, nullptr, &output_highp)); // Set the offset back input_concatenated.set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset)); @@ -357,7 +494,8 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input, const float multiplier = 4096.f * qasymm.uniform().scale * qweights.uniform().scale; int32_t output_multiplier = 0; int32_t output_shift = 0; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); // _output_stage GEMMLowpOutputStageInfo info; @@ -372,68 +510,91 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input, TensorInfo input_modulation_gate_input; TensorInfo output_gate_input; - if(batch_size > 1) + if (batch_size > 1) { // _slice_input_tensor input_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_gate_input, { 0, 0 }, { output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + NESlice::validate(&output_lowp, &input_gate_input, {0, 0}, {output_size, batch_size})); // _slice_forget_tensor forget_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + NESlice::validate(&output_lowp, &forget_gate_input, {output_size, 0}, {2 * output_size, batch_size})); // _slice_cell_tensor input_modulation_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_modulation_gate_input, {2 * output_size, 0}, + {3 * output_size, batch_size})); // _slice_output_tensor output_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + NESlice::validate(&output_lowp, &output_gate_input, {3 * output_size, 0}, {4 * output_size, batch_size})); } else { // _slice_input_tensor input_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_gate_input, { 0 }, { output_size })); + ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_gate_input, {0}, {output_size})); // _slice_forget_tensor forget_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &forget_gate_input, { output_size }, { 2 * output_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + NESlice::validate(&output_lowp, &forget_gate_input, {output_size}, {2 * output_size})); // _slice_cell_tensor input_modulation_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_modulation_gate_input, { 2 * output_size }, { 3 * output_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + NESlice::validate(&output_lowp, &input_modulation_gate_input, {2 * output_size}, {3 * output_size})); // _slice_output_tensor output_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3); - ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &output_gate_input, { 3 * output_size }, { 4 * output_size })); + ARM_COMPUTE_RETURN_ON_ERROR( + NESlice::validate(&output_lowp, &output_gate_input, {3 * output_size}, {4 * output_size})); } // _sigmoid_forget_gate const TensorInfo forget_gate_output(forget_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&forget_gate_input, &forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&forget_gate_input, &forget_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // _sigmoid_input_gate const TensorInfo input_gate_output(input_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&input_gate_input, &input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate( + &input_gate_input, &input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // _tanh_modulation_gate - const TensorInfo input_modulation_gate_output(input_modulation_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&input_modulation_gate_input, &input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); + const TensorInfo input_modulation_gate_output(input_modulation_gate_input.tensor_shape(), 1, DataType::QSYMM16, + qsymm_0); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&input_modulation_gate_input, &input_modulation_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); // _sigmoid_output_gate const TensorInfo output_gate_output(output_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&output_gate_input, &output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&output_gate_input, &output_gate_output, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // _mul_forget_gate_cell_state const TensorInfo cell_state_tmp1(forget_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&forget_gate_output, cell_state_in, &cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate( + &forget_gate_output, cell_state_in, &cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); // _mul_input_gate_input_mod_gate const TensorInfo cell_state_tmp2(input_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&input_gate_output, &input_modulation_gate_output, &cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&input_gate_output, &input_modulation_gate_output, + &cell_state_tmp2, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); // _add_cell_state_tmps - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp1, &cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&cell_state_tmp1, &cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE)); // _tanh_modulation_gate const TensorInfo output_state_tmp(cell_state_out->tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(cell_state_out, &output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(cell_state_out, &output_state_tmp, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f))); // _mul_output_state_tmp_output_gate const TensorInfo output_state_out_symm(output_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_0); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&output_state_tmp, &output_gate_output, &output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&output_state_tmp, &output_gate_output, + &output_state_out_symm, 1, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); // _dequantize const TensorInfo output_state_out_f32(output_state_out_symm.tensor_shape(), 1, DataType::F32); @@ -442,14 +603,14 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input, // _quantize ARM_COMPUTE_RETURN_ON_ERROR(NEQuantizationLayer::validate(&output_state_out_f32, output_state_out)); - if(cell_state_out->total_size() != 0) + if (cell_state_out->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_out); } - if(output_state_out->total_size() != 0) + if (output_state_out->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_out); @@ -508,7 +669,7 @@ void NELSTMLayerQuantized::run() void NELSTMLayerQuantized::prepare() { - if(!_is_prepared) + if (!_is_prepared) { _input_weights.allocator()->allocate(); _concat_input_weights.run(); diff --git a/src/runtime/NEON/functions/NELogical.cpp b/src/runtime/NEON/functions/NELogical.cpp index 92dcf15791..0013a521d1 100644 --- a/src/runtime/NEON/functions/NELogical.cpp +++ b/src/runtime/NEON/functions/NELogical.cpp @@ -25,6 +25,7 @@ #include "arm_compute/runtime/NEON/NEScheduler.h" #include "arm_compute/runtime/Tensor.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NELogicalKernel.h" @@ -32,15 +33,14 @@ namespace arm_compute { struct LogicalArgs { - std::unique_ptr kernel{ nullptr }; + std::unique_ptr kernel{nullptr}; ITensorPack pack{}; }; struct NELogicalAnd::Impl : public LogicalArgs { }; -NELogicalAnd::NELogicalAnd() - : _impl(std::make_unique()) +NELogicalAnd::NELogicalAnd() : _impl(std::make_unique()) { } NELogicalAnd::~NELogicalAnd() = default; @@ -72,8 +72,7 @@ void NELogicalAnd::run() struct NELogicalOr::Impl : public LogicalArgs { }; -NELogicalOr::NELogicalOr() - : _impl(std::make_unique()) +NELogicalOr::NELogicalOr() : _impl(std::make_unique()) { } NELogicalOr::~NELogicalOr() = default; @@ -105,8 +104,7 @@ void NELogicalOr::run() struct NELogicalNot::Impl : public LogicalArgs { }; -NELogicalNot::NELogicalNot() - : _impl(std::make_unique()) +NELogicalNot::NELogicalNot() : _impl(std::make_unique()) { } NELogicalNot::~NELogicalNot() = default; diff --git a/src/runtime/NEON/functions/NEMatMul.cpp b/src/runtime/NEON/functions/NEMatMul.cpp index 58640f40ea..31898bafc4 100644 --- a/src/runtime/NEON/functions/NEMatMul.cpp +++ b/src/runtime/NEON/functions/NEMatMul.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/Tensor.h" + #include "src/core/helpers/MemoryHelpers.h" #include "src/cpu/operators/CpuMatMul.h" @@ -33,23 +34,27 @@ namespace arm_compute { struct NEMatMul::Impl { - const ITensor *lhs{ nullptr }; - const ITensor *rhs{ nullptr }; - ITensor *output{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *lhs{nullptr}; + const ITensor *rhs{nullptr}; + ITensor *output{nullptr}; + std::unique_ptr op{nullptr}; MemoryGroup memory_group{}; WorkspaceData workspace_tensors{}; ITensorPack run_pack{}; }; -NEMatMul::NEMatMul() - : _impl(std::make_unique()) +NEMatMul::NEMatMul() : _impl(std::make_unique()) { } NEMatMul::~NEMatMul() = default; -void NEMatMul::configure(ITensor *lhs, ITensor *rhs, ITensor *output, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info) +void NEMatMul::configure(ITensor *lhs, + ITensor *rhs, + ITensor *output, + const MatMulInfo &info, + const CpuMatMulSettings &settings, + const ActivationLayerInfo &act_info) { _impl->lhs = lhs; _impl->rhs = rhs; @@ -58,11 +63,16 @@ void NEMatMul::configure(ITensor *lhs, ITensor *rhs, ITensor *output, const MatM ARM_COMPUTE_ERROR_ON_NULLPTR(_impl->lhs, _impl->rhs, _impl->output); _impl->op = std::make_unique(); _impl->op->configure(lhs->info(), rhs->info(), output->info(), info, settings, act_info); - _impl->run_pack = { { ACL_SRC_0, lhs }, { ACL_SRC_1, rhs }, { ACL_DST, output } }; + _impl->run_pack = {{ACL_SRC_0, lhs}, {ACL_SRC_1, rhs}, {ACL_DST, output}}; _impl->workspace_tensors = manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack); } -Status NEMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info) +Status NEMatMul::validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *output, + const MatMulInfo &info, + const CpuMatMulSettings &settings, + const ActivationLayerInfo &act_info) { return cpu::CpuMatMul::validate(lhs, rhs, output, info, settings, act_info); } diff --git a/src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp b/src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp index 97ddaea41d..c3861afd2c 100644 --- a/src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp +++ b/src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp @@ -25,8 +25,9 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" #include "arm_compute/runtime/NEON/functions/NEFill.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h" #include "src/cpu/operators/CpuMaxUnpooling.h" @@ -35,20 +36,22 @@ namespace arm_compute { struct NEMaxUnpoolingLayer::Impl { - const ITensor *src{ nullptr }; - const ITensor *indices{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + const ITensor *indices{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; NEMaxUnpoolingLayer::~NEMaxUnpoolingLayer() = default; -NEMaxUnpoolingLayer::NEMaxUnpoolingLayer() - : _fill_func(), _impl() +NEMaxUnpoolingLayer::NEMaxUnpoolingLayer() : _fill_func(), _impl() { } -void NEMaxUnpoolingLayer::configure(ITensor *input, ITensor *indices, ITensor *output, const PoolingLayerInfo &pool_info) +void NEMaxUnpoolingLayer::configure(ITensor *input, + ITensor *indices, + ITensor *output, + const PoolingLayerInfo &pool_info) { ARM_COMPUTE_LOG_PARAMS(input, indices, output, pool_info); @@ -64,7 +67,10 @@ void NEMaxUnpoolingLayer::configure(ITensor *input, ITensor *indices, ITensor *o _impl->op->configure(input->info(), indices->info(), output->info(), pool_info); } -Status NEMaxUnpoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info) +Status NEMaxUnpoolingLayer::validate(const ITensorInfo *input, + const ITensorInfo *indices, + const ITensorInfo *output, + const PoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, indices); ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuMaxUnpooling::validate(input, indices, output, pool_info)); diff --git a/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp b/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp index 7626aa0db2..dec0dde56d 100644 --- a/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp +++ b/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.h" -#include "src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h" namespace arm_compute { diff --git a/src/runtime/NEON/functions/NENormalizationLayer.cpp b/src/runtime/NEON/functions/NENormalizationLayer.cpp index d3b1696335..d6d2e9dc46 100644 --- a/src/runtime/NEON/functions/NENormalizationLayer.cpp +++ b/src/runtime/NEON/functions/NENormalizationLayer.cpp @@ -29,6 +29,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NENormalizationLayerKernel.h" @@ -61,13 +62,16 @@ void NENormalizationLayer::configure(const ITensor *input, ITensor *output, cons _input_squared.allocator()->allocate(); } -Status NENormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info) +Status NENormalizationLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const NormalizationLayerInfo &norm_info) { // Perform validation step ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ON_ERROR(NENormalizationLayerKernel::validate(input, input, output, norm_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(input, input, output, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(input, input, output, 1.0f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); return Status{}; } @@ -78,4 +82,4 @@ void NENormalizationLayer::run() _multiply_f.run(); NEScheduler::get().schedule(_norm_kernel.get(), Window::DimY); } -} \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEPReluLayer.cpp b/src/runtime/NEON/functions/NEPReluLayer.cpp index 80c5690a4e..963e68bac7 100644 --- a/src/runtime/NEON/functions/NEPReluLayer.cpp +++ b/src/runtime/NEON/functions/NEPReluLayer.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NEPReluLayer.h" #include "arm_compute/core/ITensor.h" + #include "src/cpu/operators/CpuPRelu.h" namespace arm_compute @@ -32,17 +33,16 @@ using OperatorType = cpu::CpuPRelu; struct NEPReluLayer::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEPReluLayer::NEPReluLayer() - : _impl(std::make_unique()) +NEPReluLayer::NEPReluLayer() : _impl(std::make_unique()) { } -NEPReluLayer::NEPReluLayer(NEPReluLayer &&) = default; +NEPReluLayer::NEPReluLayer(NEPReluLayer &&) = default; NEPReluLayer &NEPReluLayer::operator=(NEPReluLayer &&) = default; NEPReluLayer::~NEPReluLayer() = default; diff --git a/src/runtime/NEON/functions/NEPadLayer.cpp b/src/runtime/NEON/functions/NEPadLayer.cpp index 8bacdd3002..253566df0f 100644 --- a/src/runtime/NEON/functions/NEPadLayer.cpp +++ b/src/runtime/NEON/functions/NEPadLayer.cpp @@ -23,13 +23,13 @@ */ #include "arm_compute/runtime/NEON/functions/NEPadLayer.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" - #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" -#include "src/core/NEON/kernels/NEPadLayerKernel.h" #include "src/core/helpers/AutoConfiguration.h" +#include "src/core/NEON/kernels/NEPadLayerKernel.h" namespace arm_compute { @@ -38,9 +38,9 @@ namespace uint32_t last_padding_dimension(const PaddingList &padding) { int last_padding_dim = padding.size() - 1; - for(; last_padding_dim >= 0; --last_padding_dim) + for (; last_padding_dim >= 0; --last_padding_dim) { - if(padding[last_padding_dim].first > 0 || padding[last_padding_dim].second > 0) + if (padding[last_padding_dim].first > 0 || padding[last_padding_dim].second > 0) { break; } @@ -52,11 +52,22 @@ uint32_t last_padding_dimension(const PaddingList &padding) NEPadLayer::~NEPadLayer() = default; NEPadLayer::NEPadLayer() - : _copy_function(), _pad_kernel(), _mode(), _padding(), _num_dimensions(0), _slice_functions(), _concat_functions(), _slice_results(), _concat_results() + : _copy_function(), + _pad_kernel(), + _mode(), + _padding(), + _num_dimensions(0), + _slice_functions(), + _concat_functions(), + _slice_results(), + _concat_results() { } -void NEPadLayer::configure_constant_mode(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value) +void NEPadLayer::configure_constant_mode(ITensor *input, + ITensor *output, + const PaddingList &padding, + const PixelValue constant_value) { _pad_kernel = std::make_unique(); _pad_kernel->configure(input, output, padding, constant_value, PaddingMode::CONSTANT); @@ -85,20 +96,20 @@ void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *outpu Coordinates ends_after{}; Coordinates strides{}; ITensor *prev = input; - for(uint32_t i = 0; i < _num_dimensions; ++i) + for (uint32_t i = 0; i < _num_dimensions; ++i) { // Values in strides from the previous dimensions need to be set to 1 to avoid reversing again. - if(i > 0) + if (i > 0) { strides.set(i - 1, 1); } - if(_padding[i].first > 0 || _padding[i].second > 0) + if (_padding[i].first > 0 || _padding[i].second > 0) { // Set the starts, ends, and strides values for the current dimension. // Due to the bit masks passed to strided slice, the values below the current dimension in // starts and ends will be ignored so do not need to be modified. - if(_mode == PaddingMode::REFLECT) + if (_mode == PaddingMode::REFLECT) { starts_before.set(i, _padding[i].first); ends_before.set(i, 0); @@ -124,11 +135,12 @@ void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *outpu // Reflect the input values for the padding before and after the input. std::vector concat_vector; - if(_padding[i].first > 0) + if (_padding[i].first > 0) { - if(i < prev->info()->num_dimensions()) + if (i < prev->info()->num_dimensions()) { - _slice_functions[2 * i].configure(prev, &_slice_results[2 * i], starts_before, ends_before, strides, begin_mask_before, end_mask_before); + _slice_functions[2 * i].configure(prev, &_slice_results[2 * i], starts_before, ends_before, strides, + begin_mask_before, end_mask_before); concat_vector.emplace_back(&_slice_results[2 * i]); } else @@ -138,11 +150,12 @@ void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *outpu } } concat_vector.push_back(prev); - if(_padding[i].second > 0) + if (_padding[i].second > 0) { - if(i < prev->info()->num_dimensions()) + if (i < prev->info()->num_dimensions()) { - _slice_functions[2 * i + 1].configure(prev, &_slice_results[2 * i + 1], starts_after, ends_after, strides, begin_mask_after, end_mask_after); + _slice_functions[2 * i + 1].configure(prev, &_slice_results[2 * i + 1], starts_after, ends_after, + strides, begin_mask_after, end_mask_after); concat_vector.emplace_back(&_slice_results[2 * i + 1]); } else @@ -154,12 +167,12 @@ void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *outpu // Concatenate the padding before and after with the input. ITensor *out = (i == _num_dimensions - 1) ? output : &_concat_results[i]; out->info()->set_quantization_info(output->info()->quantization_info()); - for(auto &v : concat_vector) + for (auto &v : concat_vector) { v->info()->set_quantization_info(input->info()->quantization_info()); } _concat_functions[i].configure(concat_vector, out, i); - if(i != _num_dimensions - 1) + if (i != _num_dimensions - 1) { _concat_results[i].allocator()->allocate(); } @@ -170,7 +183,11 @@ void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *outpu } } -void NEPadLayer::configure(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value, const PaddingMode mode) +void NEPadLayer::configure(ITensor *input, + ITensor *output, + const PaddingList &padding, + const PixelValue constant_value, + const PaddingMode mode) { ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), padding, constant_value, mode)); ARM_COMPUTE_LOG_PARAMS(input, output, padding, constant_value, mode); @@ -178,15 +195,16 @@ void NEPadLayer::configure(ITensor *input, ITensor *output, const PaddingList &p _padding = padding; _mode = mode; - const TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), _padding); + const TensorShape padded_shape = + misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), _padding); auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(padded_shape)); // Find the last dimension requiring padding so that it is known when to write to output and whether any padding is applied. _num_dimensions = last_padding_dimension(padding) + 1; - if(_num_dimensions > 0) + if (_num_dimensions > 0) { - switch(_mode) + switch (_mode) { case PaddingMode::CONSTANT: { @@ -210,19 +228,23 @@ void NEPadLayer::configure(ITensor *input, ITensor *output, const PaddingList &p } } -Status NEPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, const PixelValue constant_value, const PaddingMode mode) +Status NEPadLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const PaddingList &padding, + const PixelValue constant_value, + const PaddingMode mode) { ARM_COMPUTE_UNUSED(constant_value); const TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding); - if(output->total_size() > 0) + if (output->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), padded_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); } - switch(mode) + switch (mode) { case PaddingMode::CONSTANT: { @@ -231,9 +253,9 @@ Status NEPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, case PaddingMode::REFLECT: case PaddingMode::SYMMETRIC: { - for(uint32_t i = 0; i < padding.size(); ++i) + for (uint32_t i = 0; i < padding.size(); ++i) { - if(mode == PaddingMode::REFLECT) + if (mode == PaddingMode::REFLECT) { ARM_COMPUTE_RETURN_ERROR_ON(padding[i].first >= input->dimension(i)); ARM_COMPUTE_RETURN_ERROR_ON(padding[i].second >= input->dimension(i)); @@ -256,9 +278,9 @@ Status NEPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, void NEPadLayer::run() { - if(_num_dimensions > 0) + if (_num_dimensions > 0) { - switch(_mode) + switch (_mode) { case PaddingMode::CONSTANT: { @@ -268,15 +290,15 @@ void NEPadLayer::run() case PaddingMode::REFLECT: case PaddingMode::SYMMETRIC: { - for(uint32_t i = 0; i < _num_dimensions; ++i) + for (uint32_t i = 0; i < _num_dimensions; ++i) { - if(_padding[i].first > 0 || _padding[i].second > 0) + if (_padding[i].first > 0 || _padding[i].second > 0) { - if(_padding[i].first > 0 && _slice_results[2 * i].info()->total_size() > 0) + if (_padding[i].first > 0 && _slice_results[2 * i].info()->total_size() > 0) { _slice_functions[2 * i].run(); } - if(_padding[i].second > 0 && _slice_results[2 * i + 1].info()->total_size() > 0) + if (_padding[i].second > 0 && _slice_results[2 * i + 1].info()->total_size() > 0) { _slice_functions[2 * i + 1].run(); } diff --git a/src/runtime/NEON/functions/NEPermute.cpp b/src/runtime/NEON/functions/NEPermute.cpp index 517b86a1cb..80cd04ce6c 100644 --- a/src/runtime/NEON/functions/NEPermute.cpp +++ b/src/runtime/NEON/functions/NEPermute.cpp @@ -24,19 +24,19 @@ #include "arm_compute/runtime/NEON/functions/NEPermute.h" #include "arm_compute/core/Validate.h" + #include "src/cpu/operators/CpuPermute.h" namespace arm_compute { struct NEPermute::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEPermute::NEPermute() - : _impl(std::make_unique()) +NEPermute::NEPermute() : _impl(std::make_unique()) { } diff --git a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp index ad83a26beb..97155a9e74 100644 --- a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp +++ b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h" #include "arm_compute/core/ITensor.h" + #include "src/cpu/operators/CpuMul.h" #include @@ -32,32 +33,42 @@ namespace arm_compute { struct NEPixelWiseMultiplication::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src_0{nullptr}; + const ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEPixelWiseMultiplication::NEPixelWiseMultiplication() - : _impl(std::make_unique()) +NEPixelWiseMultiplication::NEPixelWiseMultiplication() : _impl(std::make_unique()) { } NEPixelWiseMultiplication::~NEPixelWiseMultiplication() = default; -Status NEPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, +Status NEPixelWiseMultiplication::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) { return cpu::CpuMul::validate(input1, input2, output, scale, overflow_policy, rounding_policy, act_info); } -void NEPixelWiseMultiplication::configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, +void NEPixelWiseMultiplication::configure(const ITensor *input1, + const ITensor *input2, + ITensor *output, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) { _impl->src_0 = input1; _impl->src_1 = input2; _impl->dst = output; _impl->op = std::make_unique(); - _impl->op->configure(input1->info(), input2->info(), output->info(), scale, overflow_policy, rounding_policy, act_info); + _impl->op->configure(input1->info(), input2->info(), output->info(), scale, overflow_policy, rounding_policy, + act_info); } void NEPixelWiseMultiplication::run() @@ -71,24 +82,29 @@ void NEPixelWiseMultiplication::run() struct NEComplexPixelWiseMultiplication::Impl { - ITensor *src_0{ nullptr }; - ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + ITensor *src_0{nullptr}; + ITensor *src_1{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEComplexPixelWiseMultiplication::NEComplexPixelWiseMultiplication() - : _impl(std::make_unique()) +NEComplexPixelWiseMultiplication::NEComplexPixelWiseMultiplication() : _impl(std::make_unique()) { } NEComplexPixelWiseMultiplication::~NEComplexPixelWiseMultiplication() = default; -Status NEComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) +Status NEComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info) { return cpu::CpuComplexMul::validate(input1, input2, output, act_info); } -void NEComplexPixelWiseMultiplication::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info) +void NEComplexPixelWiseMultiplication::configure(ITensor *input1, + ITensor *input2, + ITensor *output, + const ActivationLayerInfo &act_info) { _impl->src_0 = input1; _impl->src_1 = input2; diff --git a/src/runtime/NEON/functions/NEPooling3dLayer.cpp b/src/runtime/NEON/functions/NEPooling3dLayer.cpp index 53f9dbf0a2..e017e8c21d 100644 --- a/src/runtime/NEON/functions/NEPooling3dLayer.cpp +++ b/src/runtime/NEON/functions/NEPooling3dLayer.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/Tensor.h" + #include "src/core/helpers/MemoryHelpers.h" #include "src/cpu/operators/CpuPool3d.h" @@ -33,9 +34,9 @@ namespace arm_compute { struct NEPooling3dLayer::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; MemoryGroup memory_group{}; ITensorPack run_pack{}; WorkspaceData workspace_tensors{}; @@ -43,8 +44,7 @@ struct NEPooling3dLayer::Impl NEPooling3dLayer::~NEPooling3dLayer() = default; -NEPooling3dLayer::NEPooling3dLayer(std::shared_ptr memory_manager) - : _impl(std::make_unique()) +NEPooling3dLayer::NEPooling3dLayer(std::shared_ptr memory_manager) : _impl(std::make_unique()) { _impl->memory_group = MemoryGroup(std::move(memory_manager)); } @@ -56,11 +56,12 @@ void NEPooling3dLayer::configure(const ITensor *input, ITensor *output, const Po _impl->op = std::make_unique(); _impl->op->configure(input->info(), output->info(), pool_info); - _impl->run_pack = { { TensorType::ACL_SRC, _impl->src }, { TensorType::ACL_DST_0, _impl->dst } }; + _impl->run_pack = {{TensorType::ACL_SRC, _impl->src}, {TensorType::ACL_DST_0, _impl->dst}}; _impl->workspace_tensors = manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack); } -Status NEPooling3dLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const Pooling3dLayerInfo &pool_info) +Status +NEPooling3dLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const Pooling3dLayerInfo &pool_info) { return cpu::CpuPool3d::validate(input, output, pool_info); } @@ -72,4 +73,4 @@ void NEPooling3dLayer::run() _impl->op->run(_impl->run_pack); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEPoolingLayer.cpp b/src/runtime/NEON/functions/NEPoolingLayer.cpp index 5a3b9c5e7e..eb9125be3c 100644 --- a/src/runtime/NEON/functions/NEPoolingLayer.cpp +++ b/src/runtime/NEON/functions/NEPoolingLayer.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/Tensor.h" + #include "src/core/helpers/MemoryHelpers.h" #include "src/cpu/operators/CpuPool2d.h" @@ -33,10 +34,10 @@ namespace arm_compute { struct NEPoolingLayer::Impl { - ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - ITensor *indices{ nullptr }; - std::unique_ptr op{ nullptr }; + ITensor *src{nullptr}; + ITensor *dst{nullptr}; + ITensor *indices{nullptr}; + std::unique_ptr op{nullptr}; MemoryGroup memory_group{}; ITensorPack run_pack{}; WorkspaceData workspace_tensors{}; @@ -44,8 +45,7 @@ struct NEPoolingLayer::Impl NEPoolingLayer::~NEPoolingLayer() = default; -NEPoolingLayer::NEPoolingLayer(std::shared_ptr memory_manager) - : _impl(std::make_unique()) +NEPoolingLayer::NEPoolingLayer(std::shared_ptr memory_manager) : _impl(std::make_unique()) { _impl->memory_group = MemoryGroup(std::move(memory_manager)); } @@ -58,11 +58,16 @@ void NEPoolingLayer::configure(ITensor *input, ITensor *output, const PoolingLay _impl->op = std::make_unique(); _impl->op->configure(input->info(), output->info(), pool_info, (indices) ? indices->info() : nullptr); - _impl->run_pack = { { TensorType::ACL_SRC, _impl->src }, { TensorType::ACL_DST_0, _impl->dst }, { TensorType::ACL_DST_1, _impl->indices } }; + _impl->run_pack = {{TensorType::ACL_SRC, _impl->src}, + {TensorType::ACL_DST_0, _impl->dst}, + {TensorType::ACL_DST_1, _impl->indices}}; _impl->workspace_tensors = manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack); } -Status NEPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) +Status NEPoolingLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices) { return cpu::CpuPool2d::validate(input, output, pool_info, indices); } diff --git a/src/runtime/NEON/functions/NEPriorBoxLayer.cpp b/src/runtime/NEON/functions/NEPriorBoxLayer.cpp index aba09239cf..dbb6bf9df1 100644 --- a/src/runtime/NEON/functions/NEPriorBoxLayer.cpp +++ b/src/runtime/NEON/functions/NEPriorBoxLayer.cpp @@ -27,15 +27,19 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEPriorBoxLayerKernel.h" namespace arm_compute { -void NEPriorBoxLayer::configure(const ITensor *input1, const ITensor *input2, ITensor *output, const PriorBoxLayerInfo &info) +void NEPriorBoxLayer::configure(const ITensor *input1, + const ITensor *input2, + ITensor *output, + const PriorBoxLayerInfo &info) { ARM_COMPUTE_LOG_PARAMS(input1, input2, output, info); @@ -44,7 +48,10 @@ void NEPriorBoxLayer::configure(const ITensor *input1, const ITensor *input2, IT _kernel = std::move(k); } -Status NEPriorBoxLayer::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info) +Status NEPriorBoxLayer::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const PriorBoxLayerInfo &info) { return NEPriorBoxLayerKernel::validate(input1, input2, output, info); } diff --git a/src/runtime/NEON/functions/NEQLSTMLayer.cpp b/src/runtime/NEON/functions/NEQLSTMLayer.cpp index 2caaea02d8..dd78d10d16 100644 --- a/src/runtime/NEON/functions/NEQLSTMLayer.cpp +++ b/src/runtime/NEON/functions/NEQLSTMLayer.cpp @@ -27,13 +27,14 @@ #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/QuantizationInfo.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/InfoHelpers.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" -#include "src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h" #include "src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h" namespace arm_compute @@ -41,12 +42,19 @@ namespace arm_compute using namespace arm_compute::utils::info_helpers; namespace { -Status validate_mm(GEMMLowpOutputStageInfo &gemmlowp_info, const ITensorInfo *mm_input, const ITensorInfo *mm_weights, const ITensorInfo *bias, - float gemmlowp_scale, const TensorInfo *mm_res_info, const TensorInfo *outstage_tensor_info) +Status validate_mm(GEMMLowpOutputStageInfo &gemmlowp_info, + const ITensorInfo *mm_input, + const ITensorInfo *mm_weights, + const ITensorInfo *bias, + float gemmlowp_scale, + const TensorInfo *mm_res_info, + const TensorInfo *outstage_tensor_info) { ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate(mm_input, mm_weights, nullptr, mm_res_info)); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(mm_res_info, bias, outstage_tensor_info, gemmlowp_info)); + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier( + gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEGEMMLowpOutputStage::validate(mm_res_info, bias, outstage_tensor_info, gemmlowp_info)); return Status{}; } } // namespace @@ -55,10 +63,7 @@ Status NEQLSTMLayer::validate_layer_norm(const ITensorInfo &in, const ITensorInf { // Output quantization scale will be different, but ignored here // since it will be configured at configure() stage. - const TensorInfo out - { - in - }; + const TensorInfo out{in}; return NEQLSTMLayerNormalizationKernel::validate(&in, &out, &weight, &bias); } @@ -98,14 +103,12 @@ void NEQLSTMLayer::TensorCopyKernel::configure(ITensor &src, ITensor &dst) void NEQLSTMLayer::TensorCopyKernel::run() { - Iterator input_iter{ _src, _window }; - Iterator output_iter{ _dst, _window }; + Iterator input_iter{_src, _window}; + Iterator output_iter{_dst, _window}; - execute_window_loop(_window, [&](const Coordinates &) - { - memcpy(output_iter.ptr(), input_iter.ptr(), _row_size); - }, - input_iter, output_iter); + execute_window_loop( + _window, [&](const Coordinates &) { memcpy(output_iter.ptr(), input_iter.ptr(), _row_size); }, input_iter, + output_iter); } NEQLSTMLayer::~NEQLSTMLayer() = default; @@ -191,10 +194,17 @@ NEQLSTMLayer::NEQLSTMLayer(std::shared_ptr memory_manager) _memory_group = MemoryGroup(std::move(memory_manager)); } -void NEQLSTMLayer::configure_mm(NEGEMMLowpMatrixMultiplyCore &mm, NEGEMMLowpOutputStage &outstage, GEMMLowpOutputStageInfo &gemmlowp_info, - const ITensor *mm_input, const ITensor *mm_weights, const ITensor *bias, - Tensor *mm_res, Tensor *outstage_res, float gemmlowp_scale, - const TensorInfo &mm_res_info, const TensorInfo &outstage_tensor_info) +void NEQLSTMLayer::configure_mm(NEGEMMLowpMatrixMultiplyCore &mm, + NEGEMMLowpOutputStage &outstage, + GEMMLowpOutputStageInfo &gemmlowp_info, + const ITensor *mm_input, + const ITensor *mm_weights, + const ITensor *bias, + Tensor *mm_res, + Tensor *outstage_res, + float gemmlowp_scale, + const TensorInfo &mm_res_info, + const TensorInfo &outstage_tensor_info) { _memory_group.manage(mm_res); _memory_group.manage(outstage_res); @@ -206,66 +216,87 @@ void NEQLSTMLayer::configure_mm(NEGEMMLowpMatrixMultiplyCore &mm, NEGEMMLowpOutp mm.configure(mm_input, mm_weights, nullptr, mm_res); // Configure output stage - quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); + quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); outstage.configure(mm_res, bias, outstage_res, gemmlowp_info); mm_res->allocator()->allocate(); } -void NEQLSTMLayer::configure(const ITensor *input, - const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights, - const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights, - const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias, - const ITensor *cell_state_in, ITensor *output_state_in, - ITensor *cell_state_out, ITensor *output_state_out, ITensor *output, +void NEQLSTMLayer::configure(const ITensor *input, + const ITensor *input_to_forget_weights, + const ITensor *input_to_cell_weights, + const ITensor *input_to_output_weights, + const ITensor *recurrent_to_forget_weights, + const ITensor *recurrent_to_cell_weights, + const ITensor *recurrent_to_output_weights, + const ITensor *forget_gate_bias, + const ITensor *cell_bias, + const ITensor *output_gate_bias, + const ITensor *cell_state_in, + ITensor *output_state_in, + ITensor *cell_state_out, + ITensor *output_state_out, + ITensor *output, const LSTMParams &lstm_params) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out); + forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, + cell_state_out, output_state_out); ARM_COMPUTE_LOG_PARAMS(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, - forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out); + forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, + cell_state_out, output_state_out); // Set lstm parameters LSTMParams lstm_params_info{}; build_lstm_params_tensor_info(lstm_params, &lstm_params_info); - _input_to_forget_weights_transposed.info()->set_quantization_info(input_to_forget_weights->info()->quantization_info()); + _input_to_forget_weights_transposed.info()->set_quantization_info( + input_to_forget_weights->info()->quantization_info()); _input_to_cell_weights_transposed.info()->set_quantization_info(input_to_cell_weights->info()->quantization_info()); - _input_to_output_weights_transposed.info()->set_quantization_info(input_to_output_weights->info()->quantization_info()); - _recurrent_to_forget_weights_transposed.info()->set_quantization_info(recurrent_to_forget_weights->info()->quantization_info()); - _recurrent_to_cell_weights_transposed.info()->set_quantization_info(recurrent_to_cell_weights->info()->quantization_info()); - _recurrent_to_output_weights_transposed.info()->set_quantization_info(recurrent_to_output_weights->info()->quantization_info()); - - if(input_to_forget_weights->info()->data_type() == DataType::QASYMM8_SIGNED) + _input_to_output_weights_transposed.info()->set_quantization_info( + input_to_output_weights->info()->quantization_info()); + _recurrent_to_forget_weights_transposed.info()->set_quantization_info( + recurrent_to_forget_weights->info()->quantization_info()); + _recurrent_to_cell_weights_transposed.info()->set_quantization_info( + recurrent_to_cell_weights->info()->quantization_info()); + _recurrent_to_output_weights_transposed.info()->set_quantization_info( + recurrent_to_output_weights->info()->quantization_info()); + + if (input_to_forget_weights->info()->data_type() == DataType::QASYMM8_SIGNED) { _convert_input_to_forget_weights_to_qsymm8 = true; // Setup dequantize output tensor to go from QASYMM8_SIGNED -> F32 - _input_to_forget_weights_f32.allocator()->init(TensorInfo(input_to_forget_weights->info()->tensor_shape(), 1, DataType::F32) - .set_data_layout(input_to_forget_weights->info()->data_layout())); + _input_to_forget_weights_f32.allocator()->init( + TensorInfo(input_to_forget_weights->info()->tensor_shape(), 1, DataType::F32) + .set_data_layout(input_to_forget_weights->info()->data_layout())); // Setup the quantize output tensor to go from F32 -> QSYMM8 - _input_to_forget_weights_symm8.allocator()->init((TensorInfo(input_to_forget_weights->info()->tensor_shape(), 1, DataType::QSYMM8) - .set_data_layout(input_to_forget_weights->info()->data_layout()) - .set_quantization_info(input_to_forget_weights->info()->quantization_info()))); + _input_to_forget_weights_symm8.allocator()->init( + (TensorInfo(input_to_forget_weights->info()->tensor_shape(), 1, DataType::QSYMM8) + .set_data_layout(input_to_forget_weights->info()->data_layout()) + .set_quantization_info(input_to_forget_weights->info()->quantization_info()))); _dequantize_input_to_forget_weights.configure(input_to_forget_weights, &_input_to_forget_weights_f32); _quantize_input_to_forget_weights.configure(&_input_to_forget_weights_f32, &_input_to_forget_weights_symm8); - ARM_COMPUTE_ERROR_THROW_ON(NEQLSTMLayer::validate(input->info(), _input_to_forget_weights_symm8.info(), input_to_cell_weights->info(), input_to_output_weights->info(), - recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), - forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), - cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(), output->info(), - lstm_params_info)); + ARM_COMPUTE_ERROR_THROW_ON(NEQLSTMLayer::validate( + input->info(), _input_to_forget_weights_symm8.info(), input_to_cell_weights->info(), + input_to_output_weights->info(), recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), + recurrent_to_output_weights->info(), forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), + cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(), + output->info(), lstm_params_info)); } else { - ARM_COMPUTE_ERROR_THROW_ON(NEQLSTMLayer::validate(input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(), - recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), - forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), - cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(), output->info(), - lstm_params_info)); + ARM_COMPUTE_ERROR_THROW_ON(NEQLSTMLayer::validate( + input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), + input_to_output_weights->info(), recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), + recurrent_to_output_weights->info(), forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), + cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(), + output->info(), lstm_params_info)); } const int batch_size = input->info()->dimension(1); @@ -277,7 +308,9 @@ void NEQLSTMLayer::configure(const ITensor *input, const UniformQuantizationInfo qoutput_state_in = output_state_in->info()->quantization_info().uniform(); _projection_bias = lstm_params.projection_bias(); - _input_to_forget_weights = (input_to_forget_weights->info()->data_type() == DataType::QASYMM8_SIGNED) ? &_input_to_forget_weights_symm8 : input_to_forget_weights; + _input_to_forget_weights = (input_to_forget_weights->info()->data_type() == DataType::QASYMM8_SIGNED) + ? &_input_to_forget_weights_symm8 + : input_to_forget_weights; _input_to_cell_weights = input_to_cell_weights; _input_to_output_weights = input_to_output_weights; _recurrent_to_forget_weights = recurrent_to_forget_weights; @@ -287,7 +320,7 @@ void NEQLSTMLayer::configure(const ITensor *input, // Layer normalization _has_layer_norm = lstm_params.use_layer_norm(); - if(_has_layer_norm) + if (_has_layer_norm) { set_layer_norm_weight(lstm_params.forget_layer_norm_weights(), LayerNormGate::Forget); set_layer_norm_weight(lstm_params.cell_layer_norm_weights(), LayerNormGate::Cell); @@ -309,22 +342,25 @@ void NEQLSTMLayer::configure(const ITensor *input, // Calculate quantized parameters for clipping. int16_t quantized_cell_clip = 0; - if(lstm_params.cell_clip() > 0.0f) + if (lstm_params.cell_clip() > 0.0f) { quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in); } _has_cell_clipping = quantized_cell_clip > 0; // Precompute effective bias for optimizing the matmul computations. - if(!_has_cifg) + if (!_has_cifg) { _input_to_input_weights = lstm_params.input_to_input_weights(); _recurrent_to_input_weights = lstm_params.recurrent_to_input_weights(); _input_to_input_reduction = std::make_unique(); _recurrent_to_input_reduction = std::make_unique(); - _input_to_input_reduction->configure(_input_to_input_weights->info(), _input_to_input_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_input_reduction->configure(_recurrent_to_input_weights->info(), _recurrent_to_input_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_input_reduction->configure(_input_to_input_weights->info(), _input_to_input_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_input_reduction->configure( + _recurrent_to_input_weights->info(), _recurrent_to_input_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); } _input_to_forget_reduction = std::make_unique(); @@ -334,19 +370,31 @@ void NEQLSTMLayer::configure(const ITensor *input, _input_to_output_reduction = std::make_unique(); _recurrent_to_output_reduction = std::make_unique(); - _input_to_forget_reduction->configure(input_to_forget_weights->info(), _input_to_forget_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_forget_reduction->configure(recurrent_to_forget_weights->info(), _recurrent_to_forget_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); - _input_to_cell_reduction->configure(input_to_cell_weights->info(), _input_to_cell_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_cell_reduction->configure(recurrent_to_cell_weights->info(), _recurrent_to_cell_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); - _input_to_output_reduction->configure(input_to_output_weights->info(), _input_to_output_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); - _recurrent_to_output_reduction->configure(recurrent_to_output_weights->info(), _recurrent_to_output_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); - if(_has_projection) + _input_to_forget_reduction->configure(input_to_forget_weights->info(), _input_to_forget_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_forget_reduction->configure( + recurrent_to_forget_weights->info(), _recurrent_to_forget_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_cell_reduction->configure(input_to_cell_weights->info(), _input_to_cell_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_cell_reduction->configure( + recurrent_to_cell_weights->info(), _recurrent_to_cell_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + _input_to_output_reduction->configure(input_to_output_weights->info(), _input_to_output_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)); + _recurrent_to_output_reduction->configure( + recurrent_to_output_weights->info(), _recurrent_to_output_eff_bias.info(), + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)); + if (_has_projection) { _projection_reduction = std::make_unique(); - _projection_reduction->configure(_projection_weights->info(), _projection_eff_bias.info(), GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true)); - if(_projection_bias != nullptr) + _projection_reduction->configure( + _projection_weights->info(), _projection_eff_bias.info(), + GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true)); + if (_projection_bias != nullptr) { - _projection_bias_add.configure(_projection_bias, &_projection_eff_bias, &_projection_eff_bias, ConvertPolicy::SATURATE); + _projection_bias_add.configure(_projection_bias, &_projection_eff_bias, &_projection_eff_bias, + ConvertPolicy::SATURATE); } } @@ -354,15 +402,19 @@ void NEQLSTMLayer::configure(const ITensor *input, _transpose_input_to_forget_weights.configure(input_to_forget_weights, &_input_to_forget_weights_transposed); _transpose_input_to_cell_weights.configure(input_to_cell_weights, &_input_to_cell_weights_transposed); _transpose_input_to_output_weights.configure(input_to_output_weights, &_input_to_output_weights_transposed); - _transpose_recurrent_to_forget_weights.configure(recurrent_to_forget_weights, &_recurrent_to_forget_weights_transposed); + _transpose_recurrent_to_forget_weights.configure(recurrent_to_forget_weights, + &_recurrent_to_forget_weights_transposed); _transpose_recurrent_to_cell_weights.configure(recurrent_to_cell_weights, &_recurrent_to_cell_weights_transposed); - _transpose_recurrent_to_output_weights.configure(recurrent_to_output_weights, &_recurrent_to_output_weights_transposed); - if(!_has_cifg) + _transpose_recurrent_to_output_weights.configure(recurrent_to_output_weights, + &_recurrent_to_output_weights_transposed); + if (!_has_cifg) { - _transpose_input_to_input_weights.configure(lstm_params.input_to_input_weights(), &_input_to_input_weights_transposed); - _transpose_recurrent_to_input_weights.configure(lstm_params.recurrent_to_input_weights(), &_recurrent_to_input_weights_transposed); + _transpose_input_to_input_weights.configure(lstm_params.input_to_input_weights(), + &_input_to_input_weights_transposed); + _transpose_recurrent_to_input_weights.configure(lstm_params.recurrent_to_input_weights(), + &_recurrent_to_input_weights_transposed); } - if(_has_projection) + if (_has_projection) { _transpose_projection_weights.configure(_projection_weights, &_projection_weights_transposed); } @@ -375,40 +427,52 @@ void NEQLSTMLayer::configure(const ITensor *input, const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32); // Forget gate. - const TensorInfo forget_gate_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)); - const float input_to_forget_scale = input_to_forget_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale(); - configure_mm(_mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info, - input, &_input_to_forget_weights_transposed, &_input_to_forget_eff_bias, - &_mm_input_to_forget_res, &_input_to_forget_outstage_res, input_to_forget_scale, - mm_out_info, forget_gate_outstage_info); - - const float recurrent_to_forget_scale = recurrent_to_forget_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); - configure_mm(_mm_recurrent_to_forget, _recurrent_to_forget_outstage, gemmlowp_info, - output_state_in, &_recurrent_to_forget_weights_transposed, &_recurrent_to_forget_eff_bias, - &_mm_recurrent_to_forget_res, &_recurrent_to_forget_outstage_res, recurrent_to_forget_scale, - mm_out_info, forget_gate_outstage_info); - - _accumulate_input_recurrent_forget.configure(&_input_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE); + const TensorInfo forget_gate_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)); + const float input_to_forget_scale = input_to_forget_weights->info()->quantization_info().uniform().scale * + qinput.scale / lstm_params.forget_intermediate_scale(); + configure_mm(_mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info, input, + &_input_to_forget_weights_transposed, &_input_to_forget_eff_bias, &_mm_input_to_forget_res, + &_input_to_forget_outstage_res, input_to_forget_scale, mm_out_info, forget_gate_outstage_info); + + const float recurrent_to_forget_scale = recurrent_to_forget_weights->info()->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); + configure_mm(_mm_recurrent_to_forget, _recurrent_to_forget_outstage, gemmlowp_info, output_state_in, + &_recurrent_to_forget_weights_transposed, &_recurrent_to_forget_eff_bias, &_mm_recurrent_to_forget_res, + &_recurrent_to_forget_outstage_res, recurrent_to_forget_scale, mm_out_info, forget_gate_outstage_info); + + _accumulate_input_recurrent_forget.configure(&_input_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, + &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE); _input_to_forget_outstage_res.allocator()->allocate(); - if(_has_peephole) + if (_has_peephole) { _mul_cell_to_forget_res.allocator()->init(TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32)); _memory_group.manage(&_mul_cell_to_forget_res); - _pixelwise_mul_cell_to_forget.configure(cell_state_in, lstm_params.cell_to_forget_weights(), &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - _cell_to_forget_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_forget_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0))); + _pixelwise_mul_cell_to_forget.configure(cell_state_in, lstm_params.cell_to_forget_weights(), + &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); + _cell_to_forget_outstage_res.allocator()->init( + TensorInfo(_mul_cell_to_forget_res.info()->tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.forget_intermediate_scale(), 0))); _memory_group.manage(&_cell_to_forget_outstage_res); - const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->info()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale(); - quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); - _cell_to_forget_outstage.configure(&_mul_cell_to_forget_res, nullptr, &_cell_to_forget_outstage_res, gemmlowp_info); + const float cell_to_forget_scale = + std::pow(2, cell_shift) * + lstm_params.cell_to_forget_weights()->info()->quantization_info().uniform().scale / + lstm_params.forget_intermediate_scale(); + quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); + _cell_to_forget_outstage.configure(&_mul_cell_to_forget_res, nullptr, &_cell_to_forget_outstage_res, + gemmlowp_info); _mul_cell_to_forget_res.allocator()->allocate(); - _accumulate_cell_forget.configure(&_recurrent_to_forget_outstage_res, &_cell_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE); + _accumulate_cell_forget.configure(&_recurrent_to_forget_outstage_res, &_cell_to_forget_outstage_res, + &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE); _cell_to_forget_outstage_res.allocator()->allocate(); } Tensor *forget_activation_input = &_recurrent_to_forget_outstage_res; - if(_has_layer_norm) + if (_has_layer_norm) { configure_layer_norm(LayerNormGate::Forget, forget_activation_input); forget_activation_input->allocator()->allocate(); @@ -417,33 +481,36 @@ void NEQLSTMLayer::configure(const ITensor *input, // Output quantization info of Sigmoid and Tanh activations const QuantizationInfo sigmoid_tanh_outqinfo(1.f / 32768.f, 0); - const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); + const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); _memory_group.manage(&_forget_gate); _forget_gate.allocator()->init(forget_gate_info); - _forget_gate_sigmoid.configure(forget_activation_input, &_forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _forget_gate_sigmoid.configure(forget_activation_input, &_forget_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); forget_activation_input->allocator()->allocate(); // Modulation gate. - const TensorInfo cell_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0)); - const float input_to_cell_scale = input_to_cell_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale(); - configure_mm(_mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info, - input, &_input_to_cell_weights_transposed, &_input_to_cell_eff_bias, - &_mm_input_to_cell_res, &_input_to_cell_outstage_res, input_to_cell_scale, + const TensorInfo cell_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.cell_intermediate_scale(), 0)); + const float input_to_cell_scale = input_to_cell_weights->info()->quantization_info().uniform().scale * + qinput.scale / lstm_params.cell_intermediate_scale(); + configure_mm(_mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info, input, &_input_to_cell_weights_transposed, + &_input_to_cell_eff_bias, &_mm_input_to_cell_res, &_input_to_cell_outstage_res, input_to_cell_scale, mm_out_info, cell_outstage_info); - const float recurrent_to_cell_scale = recurrent_to_cell_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale(); - configure_mm(_mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info, - output_state_in, &_recurrent_to_cell_weights_transposed, &_recurrent_to_cell_eff_bias, - &_mm_recurrent_to_cell_res, &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale, - mm_out_info, cell_outstage_info); + const float recurrent_to_cell_scale = recurrent_to_cell_weights->info()->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.cell_intermediate_scale(); + configure_mm(_mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info, output_state_in, + &_recurrent_to_cell_weights_transposed, &_recurrent_to_cell_eff_bias, &_mm_recurrent_to_cell_res, + &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale, mm_out_info, cell_outstage_info); - _accumulate_input_recurrent_modulation.configure(&_input_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, ConvertPolicy::SATURATE); + _accumulate_input_recurrent_modulation.configure(&_input_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, + &_recurrent_to_cell_outstage_res, ConvertPolicy::SATURATE); _input_to_cell_outstage_res.allocator()->allocate(); Tensor *cell_activation_input = &_recurrent_to_cell_outstage_res; - if(_has_layer_norm) + if (_has_layer_norm) { configure_layer_norm(LayerNormGate::Cell, cell_activation_input); cell_activation_input->allocator()->allocate(); @@ -454,14 +521,15 @@ void NEQLSTMLayer::configure(const ITensor *input, _memory_group.manage(&_cell_gate); _cell_gate.allocator()->init(cell_gate_info); - _cell_gate_tanh.configure(cell_activation_input, &_cell_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); + _cell_gate_tanh.configure(cell_activation_input, &_cell_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); cell_activation_input->allocator()->allocate(); // Input gate. const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); _input_gate.allocator()->init(input_gate_info); _memory_group.manage(&_input_gate); - if(_has_cifg) + if (_has_cifg) { _ones.allocator()->init(*_forget_gate.info()); _input_gate_sub.configure(&_ones, &_forget_gate, &_input_gate, ConvertPolicy::SATURATE); @@ -469,104 +537,137 @@ void NEQLSTMLayer::configure(const ITensor *input, } else { - const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0)); - const float input_to_input_scale = _input_to_input_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale(); - configure_mm(_mm_input_to_input, _input_to_input_outstage, gemmlowp_info, - input, &_input_to_input_weights_transposed, &_input_to_input_eff_bias, - &_mm_input_to_input_res, &_input_to_input_outstage_res, input_to_input_scale, - mm_out_info, input_outstage_info); - - const float recurrent_to_input_scale = _recurrent_to_input_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale(); - configure_mm(_mm_recurrent_to_input, _recurrent_to_input_outstage, gemmlowp_info, - output_state_in, &_recurrent_to_input_weights_transposed, &_recurrent_to_input_eff_bias, + const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.input_intermediate_scale(), 0)); + const float input_to_input_scale = _input_to_input_weights->info()->quantization_info().uniform().scale * + qinput.scale / lstm_params.input_intermediate_scale(); + configure_mm(_mm_input_to_input, _input_to_input_outstage, gemmlowp_info, input, + &_input_to_input_weights_transposed, &_input_to_input_eff_bias, &_mm_input_to_input_res, + &_input_to_input_outstage_res, input_to_input_scale, mm_out_info, input_outstage_info); + + const float recurrent_to_input_scale = + _recurrent_to_input_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / + lstm_params.input_intermediate_scale(); + configure_mm(_mm_recurrent_to_input, _recurrent_to_input_outstage, gemmlowp_info, output_state_in, + &_recurrent_to_input_weights_transposed, &_recurrent_to_input_eff_bias, &_mm_recurrent_to_input_res, &_recurrent_to_input_outstage_res, recurrent_to_input_scale, mm_out_info, input_outstage_info); - _accumulate_input_recurrent_input.configure(&_input_to_input_outstage_res, &_recurrent_to_input_outstage_res, &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE); + _accumulate_input_recurrent_input.configure(&_input_to_input_outstage_res, &_recurrent_to_input_outstage_res, + &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE); _input_to_input_outstage_res.allocator()->allocate(); - if(_has_peephole) + if (_has_peephole) { - _mul_cell_to_input_res.allocator()->init(TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32)); + _mul_cell_to_input_res.allocator()->init( + TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32)); _memory_group.manage(&_mul_cell_to_input_res); - _pixelwise_mul_cell_to_input.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->info()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale(); - quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); - _cell_to_input_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_input_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0))); + _pixelwise_mul_cell_to_input.configure(cell_state_in, lstm_params.cell_to_input_weights(), + &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); + const float cell_to_input_scale = + std::pow(2, cell_shift) * + lstm_params.cell_to_input_weights()->info()->quantization_info().uniform().scale / + lstm_params.input_intermediate_scale(); + quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); + _cell_to_input_outstage_res.allocator()->init( + TensorInfo(_mul_cell_to_input_res.info()->tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.input_intermediate_scale(), 0))); _memory_group.manage(&_cell_to_input_outstage_res); - _cell_to_input_outstage.configure(&_mul_cell_to_input_res, nullptr, &_cell_to_input_outstage_res, gemmlowp_info); + _cell_to_input_outstage.configure(&_mul_cell_to_input_res, nullptr, &_cell_to_input_outstage_res, + gemmlowp_info); _mul_cell_to_input_res.allocator()->allocate(); - _accumulate_cell_input.configure(&_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res, &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE); + _accumulate_cell_input.configure(&_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res, + &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE); _cell_to_input_outstage_res.allocator()->allocate(); } Tensor *input_activation_input = &_recurrent_to_input_outstage_res; - if(_has_layer_norm) + if (_has_layer_norm) { configure_layer_norm(LayerNormGate::Input, input_activation_input); input_activation_input->allocator()->allocate(); input_activation_input = &get_layer_norm_output(LayerNormGate::Input); } - _input_gate_sigmoid.configure(input_activation_input, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _input_gate_sigmoid.configure(input_activation_input, &_input_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); input_activation_input->allocator()->allocate(); } // Cell. // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication - _pixelwise_mul_forget_cell.configure(&_forget_gate, cell_state_in, &_forget_gate, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_forget_cell.configure(&_forget_gate, cell_state_in, &_forget_gate, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); const float cell_gate_scale = _cell_gate.info()->quantization_info().uniform().scale; const float mul_input_cell_scale = cell_gate_scale * std::pow(2, 15 + cell_shift); - const TensorInfo mul_input_cell_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(mul_input_cell_scale, 0)); + const TensorInfo mul_input_cell_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(mul_input_cell_scale, 0)); _memory_group.manage(&_mul_input_cell_res); _mul_input_cell_res.allocator()->init(mul_input_cell_info); - _pixelwise_mul_input_cell.configure(&_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_input_cell.configure(&_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); _cell_gate.allocator()->allocate(); _add_forget_cell.configure(&_forget_gate, &_mul_input_cell_res, cell_state_out, ConvertPolicy::SATURATE); _mul_input_cell_res.allocator()->allocate(); _forget_gate.allocator()->allocate(); - if(_has_cell_clipping) + if (_has_cell_clipping) { - _cell_clip.configure(cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip, quantized_cell_clip)); + _cell_clip.configure(cell_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -quantized_cell_clip, quantized_cell_clip)); } // Output gate. - const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0)); - const float input_to_output_scale = input_to_output_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale(); - configure_mm(_mm_input_to_output, _input_to_output_outstage, gemmlowp_info, - input, &_input_to_output_weights_transposed, &_input_to_output_eff_bias, - &_mm_input_to_output_res, &_input_to_output_outstage_res, input_to_output_scale, - mm_out_info, output_outstage_info); - - const float recurrent_to_output_scale = recurrent_to_output_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale(); - configure_mm(_mm_recurrent_to_output, _recurrent_to_output_outstage, gemmlowp_info, - output_state_in, &_recurrent_to_output_weights_transposed, &_recurrent_to_output_eff_bias, - &_mm_recurrent_to_output_res, &_recurrent_to_output_outstage_res, recurrent_to_output_scale, - mm_out_info, output_outstage_info); - - _accumulate_input_recurrent_output.configure(&_recurrent_to_output_outstage_res, &_input_to_output_outstage_res, &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE); + const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.output_intermediate_scale(), 0)); + const float input_to_output_scale = input_to_output_weights->info()->quantization_info().uniform().scale * + qinput.scale / lstm_params.output_intermediate_scale(); + configure_mm(_mm_input_to_output, _input_to_output_outstage, gemmlowp_info, input, + &_input_to_output_weights_transposed, &_input_to_output_eff_bias, &_mm_input_to_output_res, + &_input_to_output_outstage_res, input_to_output_scale, mm_out_info, output_outstage_info); + + const float recurrent_to_output_scale = recurrent_to_output_weights->info()->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.output_intermediate_scale(); + configure_mm(_mm_recurrent_to_output, _recurrent_to_output_outstage, gemmlowp_info, output_state_in, + &_recurrent_to_output_weights_transposed, &_recurrent_to_output_eff_bias, &_mm_recurrent_to_output_res, + &_recurrent_to_output_outstage_res, recurrent_to_output_scale, mm_out_info, output_outstage_info); + + _accumulate_input_recurrent_output.configure(&_recurrent_to_output_outstage_res, &_input_to_output_outstage_res, + &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE); _input_to_output_outstage_res.allocator()->allocate(); - if(_has_peephole) + if (_has_peephole) { // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication // Here we are not using the output stage because all operations are done in float _mul_cell_to_output_res.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::S32)); _memory_group.manage(&_mul_cell_to_output_res); - _pixelwise_mul_cell_to_output.configure(cell_state_out, lstm_params.cell_to_output_weights(), &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); - - const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->info()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale(); - quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift); - _cell_to_output_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_output_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0))); + _pixelwise_mul_cell_to_output.configure(cell_state_out, lstm_params.cell_to_output_weights(), + &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); + + const float cell_to_output_scale = + std::pow(2, cell_shift) * + lstm_params.cell_to_output_weights()->info()->quantization_info().uniform().scale / + lstm_params.output_intermediate_scale(); + quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift); + _cell_to_output_outstage_res.allocator()->init( + TensorInfo(_mul_cell_to_output_res.info()->tensor_shape(), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.output_intermediate_scale(), 0))); _memory_group.manage(&_cell_to_output_outstage_res); - _cell_to_output_outstage.configure(&_mul_cell_to_output_res, nullptr, &_cell_to_output_outstage_res, gemmlowp_info); + _cell_to_output_outstage.configure(&_mul_cell_to_output_res, nullptr, &_cell_to_output_outstage_res, + gemmlowp_info); _mul_cell_to_output_res.allocator()->allocate(); - _accumulate_cell_to_output.configure(&_recurrent_to_output_outstage_res, &_cell_to_output_outstage_res, &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE); + _accumulate_cell_to_output.configure(&_recurrent_to_output_outstage_res, &_cell_to_output_outstage_res, + &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE); _cell_to_output_outstage_res.allocator()->allocate(); } Tensor *output_activation_input = &_recurrent_to_output_outstage_res; - if(_has_layer_norm) + if (_has_layer_norm) { configure_layer_norm(LayerNormGate::Output, output_activation_input); output_activation_input->allocator()->allocate(); @@ -576,20 +677,24 @@ void NEQLSTMLayer::configure(const ITensor *input, _memory_group.manage(&_output_gate); _output_gate.allocator()->init(output_gate_info); - _output_gate_sigmoid.configure(output_activation_input, &_output_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); + _output_gate_sigmoid.configure(output_activation_input, &_output_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)); output_activation_input->allocator()->allocate(); // Hidden. - _hidden_tanh.configure(cell_state_out, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); + _hidden_tanh.configure(cell_state_out, &_input_gate, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)); // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication _memory_group.manage(&_hidden_mul_res); const TensorInfo hidden_mul_res(_input_gate.info()->tensor_shape(), 1, DataType::S32); _hidden_mul_res.allocator()->init(hidden_mul_res); - _pixelwise_mul_hidden.configure(&_output_gate, &_input_gate, &_hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO); + _pixelwise_mul_hidden.configure(&_output_gate, &_input_gate, &_hidden_mul_res, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO); _output_gate.allocator()->allocate(); _input_gate.allocator()->allocate(); const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15); - quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true); + quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true); gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero(); gemmlowp_info.output_data_type = output_state_in->info()->data_type(); @@ -598,7 +703,7 @@ void NEQLSTMLayer::configure(const ITensor *input, _memory_group.manage(&_hidden_gate); - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _hidden_gate.allocator()->init(*output_state_out->info()); _hidden_gate.info()->set_tensor_shape(_hidden_mul_res.info()->tensor_shape()); @@ -609,27 +714,26 @@ void NEQLSTMLayer::configure(const ITensor *input, _hidden_mul_res.allocator()->allocate(); // Projection. - if(_has_projection) + if (_has_projection) { const TensorInfo projection_outstage_info(*output_state_out->info()); - const UniformQuantizationInfo qprojection = _projection_weights->info()->quantization_info().uniform(); - const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale; - gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset; - gemmlowp_info.gemmlowp_min_bound = std::numeric_limits::lowest(); - gemmlowp_info.gemmlowp_max_bound = std::numeric_limits::max(); - gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED; - - TensorInfo projection_mm_out_info{ mm_out_info }; + const UniformQuantizationInfo qprojection = _projection_weights->info()->quantization_info().uniform(); + const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale; + gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset; + gemmlowp_info.gemmlowp_min_bound = std::numeric_limits::lowest(); + gemmlowp_info.gemmlowp_max_bound = std::numeric_limits::max(); + gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED; + + TensorInfo projection_mm_out_info{mm_out_info}; projection_mm_out_info.set_tensor_shape(TensorShape(output_size, batch_size)); - configure_mm(_mm_projection, _projection_outstage, gemmlowp_info, - hidden_gate_result, &_projection_weights_transposed, &_projection_eff_bias, - &_mm_projection_res, &_projection_outstage_res, projection_scale, - projection_mm_out_info, projection_outstage_info); + configure_mm(_mm_projection, _projection_outstage, gemmlowp_info, hidden_gate_result, + &_projection_weights_transposed, &_projection_eff_bias, &_mm_projection_res, + &_projection_outstage_res, projection_scale, projection_mm_out_info, projection_outstage_info); ITensor *accumulate_destination = output_state_out; - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _hidden_gate.allocator()->allocate(); _projection_accumulate_res.allocator()->init(*output_state_in->info()); @@ -638,30 +742,34 @@ void NEQLSTMLayer::configure(const ITensor *input, accumulate_destination = &_projection_accumulate_res; } - _accumulate_projection.configure(&_projection_outstage_res, accumulate_destination, accumulate_destination, ConvertPolicy::SATURATE); + _accumulate_projection.configure(&_projection_outstage_res, accumulate_destination, accumulate_destination, + ConvertPolicy::SATURATE); _projection_outstage_res.allocator()->allocate(); - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _projection_accumulate_to_output_copy.configure(_projection_accumulate_res, *output_state_out); _projection_accumulate_res.allocator()->allocate(); } - int8_t quantized_projection_clip{ 0 }; - if(lstm_params.projection_clip() > 0.0f) + int8_t quantized_projection_clip{0}; + if (lstm_params.projection_clip() > 0.0f) { - quantized_projection_clip = utility::clamp(lstm_params.projection_clip() / qprojection.scale, -128, 127); + quantized_projection_clip = + utility::clamp(lstm_params.projection_clip() / qprojection.scale, -128, 127); } - if(quantized_projection_clip > 0) + if (quantized_projection_clip > 0) { - _projection_clip.configure(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip, quantized_projection_clip)); + _projection_clip.configure(output_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -quantized_projection_clip, quantized_projection_clip)); _has_projection_clipping = true; } } else { - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _hidden_to_output_copy.configure(_hidden_gate, *output_state_out); _hidden_gate.allocator()->allocate(); @@ -672,17 +780,27 @@ void NEQLSTMLayer::configure(const ITensor *input, _copy_output.configure(output_state_out, output); } -Status NEQLSTMLayer::validate(const ITensorInfo *input, - const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights, - const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights, - const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias, - const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in, - const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out, const ITensorInfo *output, +Status NEQLSTMLayer::validate(const ITensorInfo *input, + const ITensorInfo *input_to_forget_weights, + const ITensorInfo *input_to_cell_weights, + const ITensorInfo *input_to_output_weights, + const ITensorInfo *recurrent_to_forget_weights, + const ITensorInfo *recurrent_to_cell_weights, + const ITensorInfo *recurrent_to_output_weights, + const ITensorInfo *forget_gate_bias, + const ITensorInfo *cell_bias, + const ITensorInfo *output_gate_bias, + const ITensorInfo *cell_state_in, + const ITensorInfo *output_state_in, + const ITensorInfo *cell_state_out, + const ITensorInfo *output_state_out, + const ITensorInfo *output, const LSTMParams &lstm_params) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, - recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, - cell_state_out, output_state_out, output); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, + recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, + cell_state_in, output_state_in, cell_state_out, output_state_out, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() != 2, "Input must have exactly 2 dimensions"); @@ -694,22 +812,27 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->num_dimensions() != 2); ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->dimension(0) != input_size); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_output_weights, input_to_forget_weights, input_to_cell_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_output_weights, input_to_forget_weights, + input_to_cell_weights); ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->num_dimensions() != 2); ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->dimension(1) != num_units); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_to_forget_weights, 1, DataType::QASYMM8_SIGNED, DataType::QSYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_output_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_to_forget_weights, 1, DataType::QASYMM8_SIGNED, + DataType::QSYMM8); // If the input_to_forget_weights data type is DataType::QSYMM8 then it can never match the other weights as they are all DataType::QASYMM8_SIGNED if (input_to_forget_weights->data_type() == DataType::QSYMM8) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_cell_weights, input_to_output_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); + recurrent_to_forget_weights, recurrent_to_cell_weights, + recurrent_to_output_weights); } else { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, input_to_cell_weights, input_to_output_weights, - recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, input_to_cell_weights, + input_to_output_weights, recurrent_to_forget_weights, + recurrent_to_cell_weights, recurrent_to_output_weights); } ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->num_dimensions() != 1); ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->dimension(0) != num_units); @@ -728,20 +851,25 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_in); // Check whether peephole weights are all there or none - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, + DataType::QSYMM16); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() != 1); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->dimension(0) != num_units); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), + lstm_params.cell_to_output_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), + lstm_params.cell_to_output_weights()); - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), + lstm_params.cell_to_input_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), + lstm_params.cell_to_input_weights()); } } @@ -755,7 +883,7 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, // Calculate quantized parameters for clipping. int16_t quantized_cell_clip = 0; - if(lstm_params.cell_clip() > 0.0f) + if (lstm_params.cell_clip() > 0.0f) { quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in); } @@ -763,60 +891,90 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, // Precompute effective bias for optimizing the matmul computations. const TensorInfo eff_bias_info(TensorShape(num_units), 1, DataType::S32); const TensorInfo projection_eff_bias_info(TensorShape(output_size), 1, DataType::S32); - if(!lstm_params.has_cifg_opt()) + if (!lstm_params.has_cifg_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(lstm_params.input_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, - -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(lstm_params.recurrent_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, - -qoutput_state_in.offset, - true))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + lstm_params.input_to_input_weights(), &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + lstm_params.recurrent_to_input_weights(), &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); } - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(recurrent_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, - -qoutput_state_in.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(recurrent_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, - true))); - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(recurrent_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, - -qoutput_state_in.offset, true))); - if(lstm_params.has_projection()) + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + recurrent_to_forget_weights, &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + recurrent_to_cell_weights, &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + recurrent_to_output_weights, &eff_bias_info, + GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true))); + if (lstm_params.has_projection()) { - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(lstm_params.projection_weights(), &projection_eff_bias_info, GEMMLowpReductionKernelInfo(output_size, false, - lstm_params.hidden_state_zero(), - true))); - if(lstm_params.projection_bias() != nullptr) + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate( + lstm_params.projection_weights(), &projection_eff_bias_info, + GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true))); + if (lstm_params.projection_bias() != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.projection_bias(), 1, DataType::S32); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(lstm_params.projection_bias(), &projection_eff_bias_info, &projection_eff_bias_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(lstm_params.projection_bias(), &projection_eff_bias_info, + &projection_eff_bias_info, ConvertPolicy::SATURATE)); } } - const TensorInfo input_weights_transposed(TensorShape(num_units, input_size), 1, input_to_cell_weights->data_type(), input_to_cell_weights->quantization_info()); - const TensorInfo input_to_output_weights_transposed(TensorShape(num_units, input_size), 1, input_to_output_weights->data_type(), input_to_output_weights->quantization_info()); - const TensorInfo recurrent_to_forget_weights_transposed(TensorShape(num_units, output_size), 1, recurrent_to_forget_weights->data_type(), recurrent_to_forget_weights->quantization_info()); - const TensorInfo recurrent_to_cell_weights_transposed(TensorShape(num_units, output_size), 1, recurrent_to_cell_weights->data_type(), recurrent_to_cell_weights->quantization_info()); - const TensorInfo recurrent_to_output_weights_transposed(TensorShape(num_units, output_size), 1, recurrent_to_output_weights->data_type(), recurrent_to_output_weights->quantization_info()); - const TensorInfo recurrent_weights_transposed(TensorShape(num_units, output_size), 1, recurrent_to_forget_weights->data_type(), recurrent_to_forget_weights->quantization_info()); + const TensorInfo input_weights_transposed(TensorShape(num_units, input_size), 1, input_to_cell_weights->data_type(), + input_to_cell_weights->quantization_info()); + const TensorInfo input_to_output_weights_transposed(TensorShape(num_units, input_size), 1, + input_to_output_weights->data_type(), + input_to_output_weights->quantization_info()); + const TensorInfo recurrent_to_forget_weights_transposed(TensorShape(num_units, output_size), 1, + recurrent_to_forget_weights->data_type(), + recurrent_to_forget_weights->quantization_info()); + const TensorInfo recurrent_to_cell_weights_transposed(TensorShape(num_units, output_size), 1, + recurrent_to_cell_weights->data_type(), + recurrent_to_cell_weights->quantization_info()); + const TensorInfo recurrent_to_output_weights_transposed(TensorShape(num_units, output_size), 1, + recurrent_to_output_weights->data_type(), + recurrent_to_output_weights->quantization_info()); + const TensorInfo recurrent_weights_transposed(TensorShape(num_units, output_size), 1, + recurrent_to_forget_weights->data_type(), + recurrent_to_forget_weights->quantization_info()); ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(input_to_cell_weights, &input_weights_transposed)); ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(input_to_output_weights, &input_to_output_weights_transposed)); - ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(recurrent_to_forget_weights, &recurrent_to_forget_weights_transposed)); - ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(recurrent_to_cell_weights, &recurrent_to_cell_weights_transposed)); - ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(recurrent_to_output_weights, &recurrent_to_output_weights_transposed)); - if(!lstm_params.has_cifg_opt()) + ARM_COMPUTE_RETURN_ON_ERROR( + NETranspose::validate(recurrent_to_forget_weights, &recurrent_to_forget_weights_transposed)); + ARM_COMPUTE_RETURN_ON_ERROR( + NETranspose::validate(recurrent_to_cell_weights, &recurrent_to_cell_weights_transposed)); + ARM_COMPUTE_RETURN_ON_ERROR( + NETranspose::validate(recurrent_to_output_weights, &recurrent_to_output_weights_transposed)); + if (!lstm_params.has_cifg_opt()) { - const TensorInfo recurrent_to_input_weights_transposed(TensorShape(num_units, output_size), 1, - recurrent_to_forget_weights->data_type(), lstm_params.recurrent_to_input_weights()->quantization_info()); + const TensorInfo recurrent_to_input_weights_transposed( + TensorShape(num_units, output_size), 1, recurrent_to_forget_weights->data_type(), + lstm_params.recurrent_to_input_weights()->quantization_info()); const TensorInfo input_to_input_weights_transposed(TensorShape(num_units, input_size), 1, - lstm_params.input_to_input_weights()->data_type(), lstm_params.input_to_input_weights()->quantization_info()); - ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(lstm_params.input_to_input_weights(), &input_to_input_weights_transposed)); - ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(lstm_params.recurrent_to_input_weights(), &recurrent_to_input_weights_transposed)); + lstm_params.input_to_input_weights()->data_type(), + lstm_params.input_to_input_weights()->quantization_info()); + ARM_COMPUTE_RETURN_ON_ERROR( + NETranspose::validate(lstm_params.input_to_input_weights(), &input_to_input_weights_transposed)); + ARM_COMPUTE_RETURN_ON_ERROR( + NETranspose::validate(lstm_params.recurrent_to_input_weights(), &recurrent_to_input_weights_transposed)); } - if(lstm_params.has_projection()) + if (lstm_params.has_projection()) { - const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, lstm_params.projection_weights()->data_type(), lstm_params.projection_weights()->quantization_info()); - ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(lstm_params.projection_weights(), &projection_weights_transposed)); + const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, + lstm_params.projection_weights()->data_type(), + lstm_params.projection_weights()->quantization_info()); + ARM_COMPUTE_RETURN_ON_ERROR( + NETranspose::validate(lstm_params.projection_weights(), &projection_weights_transposed)); } GEMMLowpOutputStageInfo gemmlowp_info; @@ -829,28 +987,42 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, // Forget gate. ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_intermediate_scale() == 0); - const TensorInfo forget_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)); + const TensorInfo forget_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)); const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32); - const float input_to_forget_scale = input_to_forget_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_forget_scale, &mm_out_info, &forget_outstage_info)); + const float input_to_forget_scale = input_to_forget_weights->quantization_info().uniform().scale * qinput.scale / + lstm_params.forget_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, + input_to_forget_scale, &mm_out_info, &forget_outstage_info)); - const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_forget_scale, &mm_out_info, &forget_outstage_info)); + const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.forget_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, + &eff_bias_info, recurrent_to_forget_scale, &mm_out_info, + &forget_outstage_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, + &forget_outstage_info, ConvertPolicy::SATURATE)); - if(lstm_params.has_peephole_opt()) + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, + DataType::QSYMM16); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f, + ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + const float cell_to_forget_scale = std::pow(2, cell_shift) * + lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale / + lstm_params.forget_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier( + cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, + &forget_outstage_info, ConvertPolicy::SATURATE)); } - if(has_layer_norm) + if (has_layer_norm) { const ITensorInfo *w_info = lstm_params.forget_layer_norm_weights(); const ITensorInfo *b_info = forget_gate_bias; @@ -859,22 +1031,31 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, // Output quantization info of Sigmoid and Tanh activations const QuantizationInfo sigmoid_tanh_outqinfo(1.f / 32768.f, 0); - const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); + const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&forget_outstage_info, &forget_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&forget_outstage_info, &forget_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // Modulation gate. ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_intermediate_scale() == 0); - const TensorInfo cell_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0)); - const float input_to_cell_scale = input_to_cell_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_cell_scale, &mm_out_info, &cell_outstage_info)); - - const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_cell_scale, &mm_out_info, &cell_outstage_info)); - - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_outstage_info, &cell_outstage_info, &cell_outstage_info, ConvertPolicy::SATURATE)); - - if(has_layer_norm) + const TensorInfo cell_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.cell_intermediate_scale(), 0)); + const float input_to_cell_scale = input_to_cell_weights->quantization_info().uniform().scale * qinput.scale / + lstm_params.cell_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, + input_to_cell_scale, &mm_out_info, &cell_outstage_info)); + + const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.cell_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, + &eff_bias_info, recurrent_to_cell_scale, &mm_out_info, + &cell_outstage_info)); + + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_outstage_info, &cell_outstage_info, + &cell_outstage_info, ConvertPolicy::SATURATE)); + + if (has_layer_norm) { const ITensorInfo *w_info = lstm_params.cell_layer_norm_weights(); const ITensorInfo *b_info = cell_bias; @@ -882,94 +1063,134 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, } const TensorInfo cell_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_outstage_info, &cell_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&cell_outstage_info, &cell_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); // Input gate. const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); - if(lstm_params.has_cifg_opt()) + if (lstm_params.has_cifg_opt()) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr, "Input gate bias must not be present when CIFG is used"); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticSubtraction::validate(&input_gate_info, &forget_gate_info, &forget_gate_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr, + "Input gate bias must not be present when CIFG is used"); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticSubtraction::validate(&input_gate_info, &forget_gate_info, + &forget_gate_info, ConvertPolicy::SATURATE)); } else { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias()); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), + lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias()); // If the input_to_forget_weights data type is DataType::QSYMM8 then it can never match the other weights as they are all DataType::QASYMM8_SIGNED if (input_to_forget_weights->data_type() == DataType::QSYMM8) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.input_to_input_weights(), + lstm_params.recurrent_to_input_weights()); } else { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, + lstm_params.input_to_input_weights(), + lstm_params.recurrent_to_input_weights()); } ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_forget_weights, lstm_params.input_to_input_weights()); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_forget_weights, lstm_params.recurrent_to_input_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_forget_weights, + lstm_params.recurrent_to_input_weights()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(forget_gate_bias, lstm_params.input_gate_bias()); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(forget_gate_bias, lstm_params.input_gate_bias()); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_intermediate_scale() == 0); - const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0)); - const float input_to_input_scale = lstm_params.input_to_input_weights()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_input_scale, &mm_out_info, &input_outstage_info)); - - const float recurrent_to_input_scale = lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_input_scale, &mm_out_info, &input_outstage_info)); - - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE)); - - if(lstm_params.has_peephole_opt()) + const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.input_intermediate_scale(), 0)); + const float input_to_input_scale = lstm_params.input_to_input_weights()->quantization_info().uniform().scale * + qinput.scale / lstm_params.input_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, + input_to_input_scale, &mm_out_info, &input_outstage_info)); + + const float recurrent_to_input_scale = + lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale / + lstm_params.input_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, + &eff_bias_info, recurrent_to_input_scale, &mm_out_info, + &input_outstage_info)); + + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, + &input_outstage_info, ConvertPolicy::SATURATE)); + + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info, + 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + const float cell_to_input_scale = std::pow(2, cell_shift) * + lstm_params.cell_to_input_weights()->quantization_info().uniform().scale / + lstm_params.input_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier( + cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, + &input_outstage_info, ConvertPolicy::SATURATE)); } - if(has_layer_norm) + if (has_layer_norm) { const ITensorInfo *w_info = lstm_params.input_layer_norm_weights(); const ITensorInfo *b_info = lstm_params.input_gate_bias(); ARM_COMPUTE_RETURN_ON_ERROR(validate_layer_norm(input_outstage_info, *w_info, *b_info)); } - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&input_outstage_info, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&input_outstage_info, &input_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); } // Cell. - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE)); - if(quantized_cell_clip > 0) + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate( + &forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate( + &input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE)); + if (quantized_cell_clip > 0) { - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip, - quantized_cell_clip))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(cell_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -quantized_cell_clip, quantized_cell_clip))); } // Output gate. ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_intermediate_scale() == 0); - const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0)); - const float input_to_output_scale = input_to_output_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_output_scale, &mm_out_info, &output_outstage_info)); - - const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale(); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_output_scale, &mm_out_info, &output_outstage_info)); - - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE)); - if(lstm_params.has_peephole_opt()) + const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, + QuantizationInfo(lstm_params.output_intermediate_scale(), 0)); + const float input_to_output_scale = input_to_output_weights->quantization_info().uniform().scale * qinput.scale / + lstm_params.output_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, + input_to_output_scale, &mm_out_info, &output_outstage_info)); + + const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale * + qoutput_state_in.scale / lstm_params.output_intermediate_scale(); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, + &eff_bias_info, recurrent_to_output_scale, &mm_out_info, + &output_outstage_info)); + + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, + &output_outstage_info, ConvertPolicy::SATURATE)); + if (lstm_params.has_peephole_opt()) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1, DataType::QSYMM16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1, + DataType::QSYMM16); // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication // Here we are not using the output stage because all operations are done in float // const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale(); // ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE, - RoundingPolicy::TO_ZERO)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate( + cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE, + RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, + &output_outstage_info, ConvertPolicy::SATURATE)); } - if(has_layer_norm) + if (has_layer_norm) { const ITensorInfo *w_info = lstm_params.output_layer_norm_weights(); const ITensorInfo *b_info = output_gate_bias; @@ -977,85 +1198,103 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input, } const TensorInfo output_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo); - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&output_outstage_info, &output_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(&output_outstage_info, &output_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC))); // Hidden. - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(cell_state_out, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); + ARM_COMPUTE_RETURN_ON_ERROR( + NEActivationLayer::validate(cell_state_out, &input_gate_info, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f))); const TensorInfo hidden_mul_res(TensorShape(num_units, batch_size), 1, DataType::S32); const TensorInfo hidden_out_info(TensorShape(num_units, batch_size), 1, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate( + &output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.hidden_state_scale() == 0); const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, + &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true)); gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero(); gemmlowp_info.output_data_type = hidden_out_info.data_type(); - ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&hidden_mul_res, nullptr, &hidden_out_info, gemmlowp_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEGEMMLowpOutputStage::validate(&hidden_mul_res, nullptr, &hidden_out_info, gemmlowp_info)); const bool projection_tensor_copy_required = num_units != output_size; // Projection. - if(lstm_params.has_projection()) + if (lstm_params.has_projection()) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_forget_weights, lstm_params.projection_weights()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_forget_weights, + lstm_params.projection_weights()); ARM_COMPUTE_RETURN_ERROR_ON(qoutput_state_in.scale == 0); - const UniformQuantizationInfo qprojection = lstm_params.projection_weights()->quantization_info().uniform(); - const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(projection_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); + const UniformQuantizationInfo qprojection = lstm_params.projection_weights()->quantization_info().uniform(); + const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale; + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier( + projection_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift)); gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset; gemmlowp_info.gemmlowp_min_bound = std::numeric_limits::lowest(); gemmlowp_info.gemmlowp_max_bound = std::numeric_limits::max(); gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED; const TensorInfo projection_outstage_info(*output_state_out); - const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, lstm_params.projection_weights()->data_type(), lstm_params.projection_weights()->quantization_info()); + const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, + lstm_params.projection_weights()->data_type(), + lstm_params.projection_weights()->quantization_info()); - TensorInfo projection_mm_out_info{ mm_out_info }; + TensorInfo projection_mm_out_info{mm_out_info}; projection_mm_out_info.set_tensor_shape(TensorShape(output_size, batch_size)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, &hidden_out_info, &projection_weights_transposed, &projection_eff_bias_info, projection_scale, &projection_mm_out_info, + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, &hidden_out_info, &projection_weights_transposed, + &projection_eff_bias_info, projection_scale, &projection_mm_out_info, &projection_outstage_info)); - if(projection_tensor_copy_required) + if (projection_tensor_copy_required) { - ARM_COMPUTE_RETURN_ON_ERROR(NEQLSTMLayer::TensorCopyKernel::validate(*output_state_in, projection_outstage_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEQLSTMLayer::TensorCopyKernel::validate(*output_state_in, projection_outstage_info)); } - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(output_state_out, output_state_out, output_state_out, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(output_state_out, output_state_out, output_state_out, + ConvertPolicy::SATURATE)); - if(projection_tensor_copy_required) + if (projection_tensor_copy_required) { - ARM_COMPUTE_RETURN_ON_ERROR(NEQLSTMLayer::TensorCopyKernel::validate(projection_outstage_info, *output_state_out)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEQLSTMLayer::TensorCopyKernel::validate(projection_outstage_info, *output_state_out)); } - int8_t quantized_projection_clip{ 0 }; - if(lstm_params.projection_clip() > 0.0f) + int8_t quantized_projection_clip{0}; + if (lstm_params.projection_clip() > 0.0f) { quantized_projection_clip = quantize_qasymm8_signed(lstm_params.projection_clip(), qprojection); } - if(quantized_projection_clip > 0) + if (quantized_projection_clip > 0) { - ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip, - quantized_projection_clip))); + ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate( + output_state_out, nullptr, + ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + -quantized_projection_clip, quantized_projection_clip))); } } else { - if(projection_tensor_copy_required) + if (projection_tensor_copy_required) { ARM_COMPUTE_RETURN_ON_ERROR(NEQLSTMLayer::TensorCopyKernel::validate(hidden_out_info, *output_state_out)); } } - if(cell_state_out->total_size() > 0) + if (cell_state_out->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(cell_state_in, cell_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(cell_state_in, cell_state_out); } - if(output_state_out->total_size() > 0) + if (output_state_out->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_out); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output_state_in, output_state_out); @@ -1080,14 +1319,14 @@ void NEQLSTMLayer::run() _recurrent_to_forget_outstage.run(); _accumulate_input_recurrent_forget.run(); - if(_has_peephole) + if (_has_peephole) { _pixelwise_mul_cell_to_forget.run(); _cell_to_forget_outstage.run(); _accumulate_cell_forget.run(); } - if(_has_layer_norm) + if (_has_layer_norm) { NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Forget).get(), Window::DimY); } @@ -1102,7 +1341,7 @@ void NEQLSTMLayer::run() _recurrent_to_cell_outstage.run(); _accumulate_input_recurrent_modulation.run(); - if(_has_layer_norm) + if (_has_layer_norm) { NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Cell).get(), Window::DimY); } @@ -1110,7 +1349,7 @@ void NEQLSTMLayer::run() _cell_gate_tanh.run(); // Input gate - if(_has_cifg) + if (_has_cifg) { _input_gate_sub.run(); } @@ -1122,14 +1361,14 @@ void NEQLSTMLayer::run() _recurrent_to_input_outstage.run(); _accumulate_input_recurrent_input.run(); - if(_has_peephole) + if (_has_peephole) { _pixelwise_mul_cell_to_input.run(); _cell_to_input_outstage.run(); _accumulate_cell_input.run(); } - if(_has_layer_norm) + if (_has_layer_norm) { NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Input).get(), Window::DimY); } @@ -1142,7 +1381,7 @@ void NEQLSTMLayer::run() _pixelwise_mul_input_cell.run(); _add_forget_cell.run(); - if(_has_cell_clipping) + if (_has_cell_clipping) { _cell_clip.run(); } @@ -1153,14 +1392,14 @@ void NEQLSTMLayer::run() _mm_recurrent_to_output.run(); _recurrent_to_output_outstage.run(); _accumulate_input_recurrent_output.run(); - if(_has_peephole) + if (_has_peephole) { _pixelwise_mul_cell_to_output.run(); _cell_to_output_outstage.run(); _accumulate_cell_to_output.run(); } - if(_has_layer_norm) + if (_has_layer_norm) { NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Output).get(), Window::DimY); } @@ -1173,31 +1412,31 @@ void NEQLSTMLayer::run() _hidden_outstage.run(); // Projection. - if(_has_projection) + if (_has_projection) { _mm_projection.run(); _projection_outstage.run(); - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _projection_output_to_accumulate_copy.run(); } _accumulate_projection.run(); - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _projection_accumulate_to_output_copy.run(); } - if(_has_projection_clipping) + if (_has_projection_clipping) { _projection_clip.run(); } } else { - if(_projection_tensor_copy_required) + if (_projection_tensor_copy_required) { _hidden_to_output_copy.run(); } @@ -1209,9 +1448,9 @@ void NEQLSTMLayer::run() void NEQLSTMLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { - if(_convert_input_to_forget_weights_to_qsymm8) + if (_convert_input_to_forget_weights_to_qsymm8) { _input_to_forget_weights_f32.allocator()->allocate(); _input_to_forget_weights_symm8.allocator()->allocate(); @@ -1234,28 +1473,25 @@ void NEQLSTMLayer::prepare() _transpose_recurrent_to_output_weights.run(); // Precompute effective biases - if(_has_cifg) + if (_has_cifg) { - std::fill_n(reinterpret_cast(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 32767); + std::fill_n(reinterpret_cast(_ones.buffer()), + _ones.info()->total_size() / _ones.info()->element_size(), 32767); } else { _input_to_input_eff_bias.allocator()->allocate(); _recurrent_to_input_eff_bias.allocator()->allocate(); - ITensorPack packII = - { - { TensorType::ACL_SRC, _input_to_input_weights }, - { TensorType::ACL_DST, &_input_to_input_eff_bias } - }; - NEScheduler::get().schedule_op(_input_to_input_reduction.get(), Window::DimY, _input_to_input_reduction->window(), packII); + ITensorPack packII = {{TensorType::ACL_SRC, _input_to_input_weights}, + {TensorType::ACL_DST, &_input_to_input_eff_bias}}; + NEScheduler::get().schedule_op(_input_to_input_reduction.get(), Window::DimY, + _input_to_input_reduction->window(), packII); - ITensorPack packRI = - { - { TensorType::ACL_SRC, _recurrent_to_input_weights }, - { TensorType::ACL_DST, &_recurrent_to_input_eff_bias } - }; - NEScheduler::get().schedule_op(_recurrent_to_input_reduction.get(), Window::DimY, _recurrent_to_input_reduction->window(), packRI); + ITensorPack packRI = {{TensorType::ACL_SRC, _recurrent_to_input_weights}, + {TensorType::ACL_DST, &_recurrent_to_input_eff_bias}}; + NEScheduler::get().schedule_op(_recurrent_to_input_reduction.get(), Window::DimY, + _recurrent_to_input_reduction->window(), packRI); _input_to_input_weights_transposed.allocator()->allocate(); _recurrent_to_input_weights_transposed.allocator()->allocate(); @@ -1271,58 +1507,44 @@ void NEQLSTMLayer::prepare() _input_to_output_eff_bias.allocator()->allocate(); _recurrent_to_output_eff_bias.allocator()->allocate(); - ITensorPack packIF = - { - { TensorType::ACL_SRC, _input_to_forget_weights }, - { TensorType::ACL_DST, &_input_to_forget_eff_bias } - }; - NEScheduler::get().schedule_op(_input_to_forget_reduction.get(), Window::DimY, _input_to_forget_reduction->window(), packIF); - - ITensorPack packRF = - { - { TensorType::ACL_SRC, _recurrent_to_forget_weights }, - { TensorType::ACL_DST, &_recurrent_to_forget_eff_bias } - }; - NEScheduler::get().schedule_op(_recurrent_to_forget_reduction.get(), Window::DimY, _recurrent_to_forget_reduction->window(), packRF); - - ITensorPack packIC = - { - { TensorType::ACL_SRC, _input_to_cell_weights }, - { TensorType::ACL_DST, &_input_to_cell_eff_bias } - }; - NEScheduler::get().schedule_op(_input_to_cell_reduction.get(), Window::DimY, _input_to_cell_reduction->window(), packIC); - - ITensorPack packRC = - { - { TensorType::ACL_SRC, _recurrent_to_cell_weights }, - { TensorType::ACL_DST, &_recurrent_to_cell_eff_bias } - }; - NEScheduler::get().schedule_op(_recurrent_to_cell_reduction.get(), Window::DimY, _recurrent_to_cell_reduction->window(), packRC); - - ITensorPack packIO = - { - { TensorType::ACL_SRC, _input_to_output_weights }, - { TensorType::ACL_DST, &_input_to_output_eff_bias } - }; - NEScheduler::get().schedule_op(_input_to_output_reduction.get(), Window::DimY, _input_to_output_reduction->window(), packIO); - - ITensorPack packRO = - { - { TensorType::ACL_SRC, _recurrent_to_output_weights }, - { TensorType::ACL_DST, &_recurrent_to_output_eff_bias } - }; - NEScheduler::get().schedule_op(_recurrent_to_output_reduction.get(), Window::DimY, _recurrent_to_output_reduction->window(), packRO); - - if(_has_projection) + ITensorPack packIF = {{TensorType::ACL_SRC, _input_to_forget_weights}, + {TensorType::ACL_DST, &_input_to_forget_eff_bias}}; + NEScheduler::get().schedule_op(_input_to_forget_reduction.get(), Window::DimY, + _input_to_forget_reduction->window(), packIF); + + ITensorPack packRF = {{TensorType::ACL_SRC, _recurrent_to_forget_weights}, + {TensorType::ACL_DST, &_recurrent_to_forget_eff_bias}}; + NEScheduler::get().schedule_op(_recurrent_to_forget_reduction.get(), Window::DimY, + _recurrent_to_forget_reduction->window(), packRF); + + ITensorPack packIC = {{TensorType::ACL_SRC, _input_to_cell_weights}, + {TensorType::ACL_DST, &_input_to_cell_eff_bias}}; + NEScheduler::get().schedule_op(_input_to_cell_reduction.get(), Window::DimY, _input_to_cell_reduction->window(), + packIC); + + ITensorPack packRC = {{TensorType::ACL_SRC, _recurrent_to_cell_weights}, + {TensorType::ACL_DST, &_recurrent_to_cell_eff_bias}}; + NEScheduler::get().schedule_op(_recurrent_to_cell_reduction.get(), Window::DimY, + _recurrent_to_cell_reduction->window(), packRC); + + ITensorPack packIO = {{TensorType::ACL_SRC, _input_to_output_weights}, + {TensorType::ACL_DST, &_input_to_output_eff_bias}}; + NEScheduler::get().schedule_op(_input_to_output_reduction.get(), Window::DimY, + _input_to_output_reduction->window(), packIO); + + ITensorPack packRO = {{TensorType::ACL_SRC, _recurrent_to_output_weights}, + {TensorType::ACL_DST, &_recurrent_to_output_eff_bias}}; + NEScheduler::get().schedule_op(_recurrent_to_output_reduction.get(), Window::DimY, + _recurrent_to_output_reduction->window(), packRO); + + if (_has_projection) { _projection_eff_bias.allocator()->allocate(); - ITensorPack pack = - { - { TensorType::ACL_SRC, _projection_weights }, - { TensorType::ACL_DST, &_projection_eff_bias } - }; - NEScheduler::get().schedule_op(_projection_reduction.get(), Window::DimY, _projection_reduction->window(), pack); - if(_projection_bias != nullptr) + ITensorPack pack = {{TensorType::ACL_SRC, _projection_weights}, + {TensorType::ACL_DST, &_projection_eff_bias}}; + NEScheduler::get().schedule_op(_projection_reduction.get(), Window::DimY, _projection_reduction->window(), + pack); + if (_projection_bias != nullptr) { _projection_bias_add.run(); _projection_bias->mark_as_unused(); @@ -1332,7 +1554,7 @@ void NEQLSTMLayer::prepare() _transpose_projection_weights.run(); _projection_weights->mark_as_unused(); - if(!_projection_tensor_copy_required) + if (!_projection_tensor_copy_required) { _hidden_gate.mark_as_unused(); _projection_accumulate_res.mark_as_unused(); diff --git a/src/runtime/NEON/functions/NEQuantizationLayer.cpp b/src/runtime/NEON/functions/NEQuantizationLayer.cpp index dad246ac89..9b72783c97 100644 --- a/src/runtime/NEON/functions/NEQuantizationLayer.cpp +++ b/src/runtime/NEON/functions/NEQuantizationLayer.cpp @@ -26,19 +26,19 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/Tensor.h" + #include "src/cpu/operators/CpuQuantize.h" namespace arm_compute { struct NEQuantizationLayer::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEQuantizationLayer::NEQuantizationLayer() - : _impl(std::make_unique()) +NEQuantizationLayer::NEQuantizationLayer() : _impl(std::make_unique()) { } NEQuantizationLayer::~NEQuantizationLayer() = default; diff --git a/src/runtime/NEON/functions/NERNNLayer.cpp b/src/runtime/NEON/functions/NERNNLayer.cpp index a66ef3d27a..2824693800 100644 --- a/src/runtime/NEON/functions/NERNNLayer.cpp +++ b/src/runtime/NEON/functions/NERNNLayer.cpp @@ -27,9 +27,10 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" namespace arm_compute @@ -37,13 +38,26 @@ namespace arm_compute NERNNLayer::~NERNNLayer() = default; NERNNLayer::NERNNLayer(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_f(), _activation(), _fully_connected(memory_manager), _copy_f(), _fully_connected_out(), _gemm_output(), _add_output(), + : _memory_group(std::move(memory_manager)), + _gemm_state_f(), + _add_f(), + _activation(), + _fully_connected(memory_manager), + _copy_f(), + _fully_connected_out(), + _gemm_output(), + _add_output(), _is_prepared(false) { } -Status NERNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *recurrent_weights, const ITensorInfo *bias, const ITensorInfo *hidden_state, - const ITensorInfo *output, const ActivationLayerInfo &info) +Status NERNNLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *recurrent_weights, + const ITensorInfo *bias, + const ITensorInfo *hidden_state, + const ITensorInfo *output, + const ActivationLayerInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32); @@ -60,24 +74,34 @@ Status NERNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), hidden_state->tensor_shape()); - auto shape_info = TensorInfo(misc::shape_calculator::compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, input->data_type()); + auto shape_info = + TensorInfo(misc::shape_calculator::compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, + input->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, weights, bias, &shape_info)); - ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE)); + ARM_COMPUTE_RETURN_ON_ERROR( + NEArithmeticAddition::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE)); ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&shape_info, &shape_info, info)); return Status{}; } -void NERNNLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *recurrent_weights, const ITensor *bias, ITensor *hidden_state, ITensor *output, +void NERNNLayer::configure(const ITensor *input, + const ITensor *weights, + const ITensor *recurrent_weights, + const ITensor *bias, + ITensor *hidden_state, + ITensor *output, ActivationLayerInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output); - ARM_COMPUTE_ERROR_THROW_ON(NERNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(), bias->info(), hidden_state->info(), output->info(), info)); + ARM_COMPUTE_ERROR_THROW_ON(NERNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(), + bias->info(), hidden_state->info(), output->info(), info)); ARM_COMPUTE_LOG_PARAMS(input, weights, recurrent_weights, bias, hidden_state, output, info); const int idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT); - TensorShape shape = misc::shape_calculator::compute_rnn_shape(recurrent_weights->info(), hidden_state->info()->dimension(idx_height)); + TensorShape shape = misc::shape_calculator::compute_rnn_shape(recurrent_weights->info(), + hidden_state->info()->dimension(idx_height)); _is_prepared = false; @@ -125,7 +149,7 @@ void NERNNLayer::run() void NERNNLayer::prepare() { - if(!_is_prepared) + if (!_is_prepared) { _fully_connected.prepare(); _gemm_state_f.prepare(); diff --git a/src/runtime/NEON/functions/NEROIAlignLayer.cpp b/src/runtime/NEON/functions/NEROIAlignLayer.cpp index a9bdb50d95..68bb5d5ef3 100644 --- a/src/runtime/NEON/functions/NEROIAlignLayer.cpp +++ b/src/runtime/NEON/functions/NEROIAlignLayer.cpp @@ -29,14 +29,20 @@ namespace arm_compute { -Status NEROIAlignLayer::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info) +Status NEROIAlignLayer::validate(const ITensorInfo *input, + const ITensorInfo *rois, + ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_RETURN_ON_ERROR(NEROIAlignLayerKernel::validate(input, rois, output, pool_info)); return Status{}; } -void NEROIAlignLayer::configure(const ITensor *input, const ITensor *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info) +void NEROIAlignLayer::configure(const ITensor *input, + const ITensor *rois, + ITensor *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_LOG_PARAMS(input, rois, output, pool_info); diff --git a/src/runtime/NEON/functions/NEROIPoolingLayer.cpp b/src/runtime/NEON/functions/NEROIPoolingLayer.cpp index a24f2aac50..babec4aa92 100644 --- a/src/runtime/NEON/functions/NEROIPoolingLayer.cpp +++ b/src/runtime/NEON/functions/NEROIPoolingLayer.cpp @@ -22,8 +22,10 @@ * SOFTWARE. */ #include "arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h" + #include "arm_compute/core/Helpers.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEROIPoolingLayerKernel.h" @@ -31,17 +33,22 @@ namespace arm_compute { NEROIPoolingLayer::~NEROIPoolingLayer() = default; -NEROIPoolingLayer::NEROIPoolingLayer() - : _roi_kernel() +NEROIPoolingLayer::NEROIPoolingLayer() : _roi_kernel() { } -Status NEROIPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info) +Status NEROIPoolingLayer::validate(const ITensorInfo *input, + const ITensorInfo *rois, + const ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info) { return NEROIPoolingLayerKernel::validate(input, rois, output, pool_info); } -void NEROIPoolingLayer::configure(const ITensor *input, const ITensor *rois, const ITensor *output, const ROIPoolingLayerInfo &pool_info) +void NEROIPoolingLayer::configure(const ITensor *input, + const ITensor *rois, + const ITensor *output, + const ROIPoolingLayerInfo &pool_info) { ARM_COMPUTE_LOG_PARAMS(input, rois, output, pool_info); @@ -53,4 +60,4 @@ void NEROIPoolingLayer::run() { NEScheduler::get().schedule(_roi_kernel.get(), Window::DimX); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NERange.cpp b/src/runtime/NEON/functions/NERange.cpp index a6f7be8be0..95492df126 100644 --- a/src/runtime/NEON/functions/NERange.cpp +++ b/src/runtime/NEON/functions/NERange.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NERange.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NERangeKernel.h" @@ -31,8 +32,7 @@ namespace arm_compute { NERange::~NERange() = default; -NERange::NERange() - : _kernel() +NERange::NERange() : _kernel() { } @@ -52,4 +52,4 @@ void NERange::run() { NEScheduler::get().schedule(_kernel.get(), Window::DimX); } -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEReduceMean.cpp b/src/runtime/NEON/functions/NEReduceMean.cpp index 9f96479295..d37cf4a8d0 100644 --- a/src/runtime/NEON/functions/NEReduceMean.cpp +++ b/src/runtime/NEON/functions/NEReduceMean.cpp @@ -25,21 +25,24 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/common/utils/Log.h" #include "src/core/CPP/Validate.h" -#include "src/core/NEON/kernels/NEReductionOperationKernel.h" #include "src/core/helpers/AutoConfiguration.h" +#include "src/core/NEON/kernels/NEReductionOperationKernel.h" namespace arm_compute { namespace { -Status validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output) +Status +validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output) { ARM_COMPUTE_UNUSED(keep_dims); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() < 1); ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions()); @@ -47,29 +50,29 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax const int input_dims = input->num_dimensions(); Coordinates axis_local = reduction_axis; - for(unsigned int i = 0; i < axis_local.num_dimensions(); ++i) + for (unsigned int i = 0; i < axis_local.num_dimensions(); ++i) { //axis: The dimensions to reduce. Must be in the range [-rank(input_tensor), rank(input_tensor)). ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] < (-static_cast(input->num_dimensions()))); ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] >= static_cast(input->num_dimensions())); } - if(output->tensor_shape().total_size() != 0) + if (output->tensor_shape().total_size() != 0) { // Only validate if not using auto_init for the output tensor TensorShape out_shape = input->tensor_shape(); // Validate output_shape only if not using auto_init convert_negative_axis(axis_local, input_dims); std::sort(axis_local.begin(), axis_local.begin() + reduction_ops); - for(unsigned int i = 0; i < reduction_ops; ++i) + for (unsigned int i = 0; i < reduction_ops; ++i) { ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3); ARM_COMPUTE_RETURN_ERROR_ON(static_cast(axis_local[i]) > input->num_dimensions() - 1); - if(output->total_size() > 0 && keep_dims) + if (output->total_size() > 0 && keep_dims) { ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1); } - if(keep_dims) + if (keep_dims) { out_shape.set(axis_local[i], 1); } @@ -91,11 +94,19 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax NEReduceMean::~NEReduceMean() = default; NEReduceMean::NEReduceMean(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), _reduction_ops(), _keep_dims() + : _memory_group(std::move(memory_manager)), + _reduction_kernels(), + _reduced_outs(), + _reshape(), + _reduction_ops(), + _keep_dims() { } -Status NEReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output) +Status NEReduceMean::validate(const ITensorInfo *input, + const Coordinates &reduction_axis, + bool keep_dims, + const ITensorInfo *output) { return validate_config(input, reduction_axis, keep_dims, output); } @@ -107,7 +118,8 @@ void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis, // Perform validate step ARM_COMPUTE_ERROR_THROW_ON(NEReduceMean::validate(input->info(), reduction_axis, keep_dims, output->info())); // Output auto inizialitation if not yet initialized - const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input->info(), reduction_axis, keep_dims); + const TensorShape output_shape = + arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input->info(), reduction_axis, keep_dims); auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); _reduction_ops = reduction_axis.num_dimensions(); @@ -124,37 +136,40 @@ void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis, convert_negative_axis(axis_local, input_dims); // Perform reduction for every axis - for(int i = 0; i < _reduction_ops; ++i) + for (int i = 0; i < _reduction_ops; ++i) { - TensorShape out_shape = i == 0 ? tmp_input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); + TensorShape out_shape = + i == 0 ? tmp_input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape(); out_shape.set(axis_local[i], 1); auto in = (i == 0) ? tmp_input : (&_reduced_outs[i - 1]); - if(i == _reduction_ops - 1 && keep_dims) + if (i == _reduction_ops - 1 && keep_dims) { _reduction_kernels[i].configure(in, tmp_output, axis_local[i], ReductionOperation::MEAN_SUM); } else { - _reduced_outs[i].allocator()->init(TensorInfo(out_shape, tmp_output->info()->num_channels(), tmp_output->info()->data_type(), tmp_output->info()->quantization_info())); + _reduced_outs[i].allocator()->init(TensorInfo(out_shape, tmp_output->info()->num_channels(), + tmp_output->info()->data_type(), + tmp_output->info()->quantization_info())); _memory_group.manage(&_reduced_outs[i]); _reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i], ReductionOperation::MEAN_SUM); } } // Allocate intermediate tensors - for(int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i) + for (int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i) { _reduced_outs[i].allocator()->allocate(); } // Configure reshape layer if we want to drop the dimensions - if(!keep_dims) + if (!keep_dims) { TensorShape out_shape = tmp_input->info()->tensor_shape(); // We have to sort the reduction axis vectors in order for remove_dimension // to work properly std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops); - for(int i = 0; i < _reduction_ops; ++i) + for (int i = 0; i < _reduction_ops; ++i) { out_shape.remove_dimension(axis_local[i] - i, false); } @@ -166,11 +181,11 @@ void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis, void NEReduceMean::run() { MemoryGroupResourceScope scope_mg(_memory_group); - for(auto &kernel : _reduction_kernels) + for (auto &kernel : _reduction_kernels) { kernel.run(); } - if(!_keep_dims) + if (!_keep_dims) { _reshape.run(); } diff --git a/src/runtime/NEON/functions/NEReductionOperation.cpp b/src/runtime/NEON/functions/NEReductionOperation.cpp index 9660347a16..8540d750fc 100644 --- a/src/runtime/NEON/functions/NEReductionOperation.cpp +++ b/src/runtime/NEON/functions/NEReductionOperation.cpp @@ -26,9 +26,10 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" -#include "src/core/NEON/kernels/NEReductionOperationKernel.h" #include "src/core/helpers/AutoConfiguration.h" +#include "src/core/NEON/kernels/NEReductionOperationKernel.h" namespace arm_compute { @@ -42,7 +43,7 @@ namespace */ size_t reduction_window_split_dimension(unsigned int axis) { - switch(axis) + switch (axis) { case 0: return Window::DimY; @@ -59,13 +60,21 @@ size_t reduction_window_split_dimension(unsigned int axis) NEReductionOperation::~NEReductionOperation() = default; NEReductionOperation::NEReductionOperation(std::shared_ptr memory_manager) - : _memory_group(memory_manager), _reduction_kernel(), _reshape(), _output_internal(), _window_split(0), _reduction_axis(), _is_reshape_required(false) + : _memory_group(memory_manager), + _reduction_kernel(), + _reshape(), + _output_internal(), + _window_split(0), + _reduction_axis(), + _is_reshape_required(false) { } -Status NEReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims) +Status NEReductionOperation::validate( + const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, + "Reduction axis greater than max number of dimensions"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); const auto is_reshape_required = !keep_dims; @@ -74,9 +83,10 @@ Status NEReductionOperation::validate(const ITensorInfo *input, const ITensorInf TensorInfo info_before_reshape; - if(is_reshape_required) + if (is_reshape_required) { - const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims)); + const TensorInfo expected_output_shape = output->clone()->set_tensor_shape( + arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output); auto shape_before_reshape = input->tensor_shape(); @@ -84,17 +94,20 @@ Status NEReductionOperation::validate(const ITensorInfo *input, const ITensorInf const auto input_num_channles = input->num_channels(); const auto input_qinfo = input->quantization_info(); - const auto is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN); - const auto output_data_type = is_arg_min_max ? DataType::S32 : output->data_type(); + const auto is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN); + const auto output_data_type = is_arg_min_max ? DataType::S32 : output->data_type(); - info_before_reshape.set_data_type(output_data_type).set_tensor_shape(shape_before_reshape).set_num_channels(input_num_channles).set_quantization_info(input_qinfo); + info_before_reshape.set_data_type(output_data_type) + .set_tensor_shape(shape_before_reshape) + .set_num_channels(input_num_channles) + .set_quantization_info(input_qinfo); output_internal = &info_before_reshape; } ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperationKernel::validate(input, output_internal, axis, op)); - if(is_reshape_required) + if (is_reshape_required) { ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayer::validate(output_internal, output)); } @@ -102,7 +115,8 @@ Status NEReductionOperation::validate(const ITensorInfo *input, const ITensorInf return Status{}; } -void NEReductionOperation::configure(ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op, bool keep_dims) +void NEReductionOperation::configure( + ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op, bool keep_dims) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_LOG_PARAMS(input, output, axis, op, keep_dims); @@ -112,19 +126,32 @@ void NEReductionOperation::configure(ITensor *input, ITensor *output, unsigned i auto *output_internal = output; const auto is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN); - if(_is_reshape_required) + if (_is_reshape_required) { - const auto output_internal_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis); - const auto output_external_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false); - const auto output_data_type = is_arg_min_max ? DataType::S32 : input->info()->data_type(); - const auto num_channels = input->info()->num_channels(); - const auto qinfo = input->info()->quantization_info(); - - _output_internal.allocator()->init(input->info()->clone()->set_data_type(output_data_type).set_tensor_shape(output_internal_shape).reset_padding().set_is_resizable(true).set_num_channels( - num_channels).set_quantization_info(qinfo)); + const auto output_internal_shape = + arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis); + const auto output_external_shape = + arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false); + const auto output_data_type = is_arg_min_max ? DataType::S32 : input->info()->data_type(); + const auto num_channels = input->info()->num_channels(); + const auto qinfo = input->info()->quantization_info(); + + _output_internal.allocator()->init(input->info() + ->clone() + ->set_data_type(output_data_type) + .set_tensor_shape(output_internal_shape) + .reset_padding() + .set_is_resizable(true) + .set_num_channels(num_channels) + .set_quantization_info(qinfo)); _memory_group.manage(&_output_internal); output_internal = &_output_internal; - auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(output_data_type).set_tensor_shape(output_external_shape).reset_padding().set_is_resizable(true)); + auto_init_if_empty(*output->info(), input->info() + ->clone() + ->set_data_type(output_data_type) + .set_tensor_shape(output_external_shape) + .reset_padding() + .set_is_resizable(true)); } ARM_COMPUTE_ERROR_THROW_ON(NEReductionOperation::validate(input->info(), output->info(), axis, op, keep_dims)); @@ -135,7 +162,7 @@ void NEReductionOperation::configure(ITensor *input, ITensor *output, unsigned i _window_split = reduction_window_split_dimension(axis); _reduction_axis = axis; - if(_is_reshape_required) + if (_is_reshape_required) { _reshape.configure(output_internal, output); _output_internal.allocator()->allocate(); @@ -146,7 +173,7 @@ void NEReductionOperation::run() { MemoryGroupResourceScope scope_mg(_memory_group); NEScheduler::get().schedule(_reduction_kernel.get(), _window_split); - if(_is_reshape_required) + if (_is_reshape_required) { _reshape.run(); } diff --git a/src/runtime/NEON/functions/NEReorderLayer.cpp b/src/runtime/NEON/functions/NEReorderLayer.cpp index 427bf8c501..89cf575f38 100644 --- a/src/runtime/NEON/functions/NEReorderLayer.cpp +++ b/src/runtime/NEON/functions/NEReorderLayer.cpp @@ -23,20 +23,24 @@ */ #if defined(__aarch64__) -#include "arm_compute/runtime/NEON/NEScheduler.h" #include "arm_compute/runtime/NEON/functions/NEReorderLayer.h" + +#include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/core/NEON/kernels/NEReorderKernel.h" namespace arm_compute { NEReorderLayer::~NEReorderLayer() = default; -NEReorderLayer::NEReorderLayer() - : _reorder_kernel(std::make_unique()) +NEReorderLayer::NEReorderLayer() : _reorder_kernel(std::make_unique()) { } -void NEReorderLayer::configure(const ITensor *input, ITensor *output, arm_compute::WeightFormat input_wf, arm_compute::WeightFormat output_wf) +void NEReorderLayer::configure(const ITensor *input, + ITensor *output, + arm_compute::WeightFormat input_wf, + arm_compute::WeightFormat output_wf) { auto k = std::make_unique(); k->configure(input, output, input_wf, output_wf); @@ -49,11 +53,14 @@ void NEReorderLayer::run() NEScheduler::get().schedule(_reorder_kernel.get(), Window::DimX); } -Status NEReorderLayer::validate(const ITensorInfo *input, const ITensorInfo *output, arm_compute::WeightFormat input_wf, arm_compute::WeightFormat output_wf) +Status NEReorderLayer::validate(const ITensorInfo *input, + const ITensorInfo *output, + arm_compute::WeightFormat input_wf, + arm_compute::WeightFormat output_wf) { return NEReorderKernel::validate(input, output, input_wf, output_wf); } } // namespace arm_compute -#endif // defined(__aarch64__) \ No newline at end of file +#endif // defined(__aarch64__) diff --git a/src/runtime/NEON/functions/NEReorgLayer.cpp b/src/runtime/NEON/functions/NEReorgLayer.cpp index 8ee73d7390..14e41d6df4 100644 --- a/src/runtime/NEON/functions/NEReorgLayer.cpp +++ b/src/runtime/NEON/functions/NEReorgLayer.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/NEON/functions/NEReorgLayer.h" -#include "src/core/NEON/kernels/NEReorgLayerKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEReorgLayerKernel.h" namespace arm_compute { diff --git a/src/runtime/NEON/functions/NEReshapeLayer.cpp b/src/runtime/NEON/functions/NEReshapeLayer.cpp index 3ccb42361e..bed70ff66c 100644 --- a/src/runtime/NEON/functions/NEReshapeLayer.cpp +++ b/src/runtime/NEON/functions/NEReshapeLayer.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h" #include "arm_compute/core/Validate.h" + #include "src/cpu/operators/CpuReshape.h" #include @@ -32,16 +33,15 @@ namespace arm_compute { struct NEReshapeLayer::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEReshapeLayer::NEReshapeLayer() - : _impl(std::make_unique()) +NEReshapeLayer::NEReshapeLayer() : _impl(std::make_unique()) { } -NEReshapeLayer::NEReshapeLayer(NEReshapeLayer &&) = default; +NEReshapeLayer::NEReshapeLayer(NEReshapeLayer &&) = default; NEReshapeLayer &NEReshapeLayer::operator=(NEReshapeLayer &&) = default; NEReshapeLayer::~NEReshapeLayer() = default; diff --git a/src/runtime/NEON/functions/NEReverse.cpp b/src/runtime/NEON/functions/NEReverse.cpp index e1988f2ab3..a90f8d2e76 100644 --- a/src/runtime/NEON/functions/NEReverse.cpp +++ b/src/runtime/NEON/functions/NEReverse.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/NEON/functions/NEReverse.h" -#include "src/core/NEON/kernels/NEReverseKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NEReverseKernel.h" namespace arm_compute { @@ -38,7 +37,10 @@ void NEReverse::configure(const ITensor *input, ITensor *output, const ITensor * _kernel = std::move(k); } -Status NEReverse::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis) +Status NEReverse::validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *axis, + bool use_inverted_axis) { return NEReverseKernel::validate(input, output, axis, use_inverted_axis); } diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp index 09f037334e..0d011064f6 100644 --- a/src/runtime/NEON/functions/NEScale.cpp +++ b/src/runtime/NEON/functions/NEScale.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NEScale.h" #include "arm_compute/runtime/Tensor.h" + #include "src/common/utils/Log.h" #include "src/core/utils/ScaleUtils.h" #include "src/cpu/operators/CpuScale.h" @@ -32,16 +33,16 @@ namespace arm_compute { struct NEScale::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - Tensor dx{ nullptr }; /**< Element's distance between the X real coordinate and the smallest X following integer */ - Tensor dy{ nullptr }; /**< Element's distance between the Y real coordinate and the smallest Y following integer */ - Tensor offsets{ nullptr }; /**< Offset to access the element with NEAREST interpolation or the top-left element with BILINEAR interpolation in the input tensor */ - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + Tensor dx{nullptr}; /**< Element's distance between the X real coordinate and the smallest X following integer */ + Tensor dy{nullptr}; /**< Element's distance between the Y real coordinate and the smallest Y following integer */ + Tensor offsets{ + nullptr}; /**< Offset to access the element with NEAREST interpolation or the top-left element with BILINEAR interpolation in the input tensor */ + std::unique_ptr op{nullptr}; }; -NEScale::NEScale() - : _impl(std::make_unique()) +NEScale::NEScale() : _impl(std::make_unique()) { } NEScale::~NEScale() = default; @@ -57,25 +58,33 @@ void NEScale::configure(ITensor *input, ITensor *output, const ScaleKernelInfo & // Configure for size of allocation of internal tensors // Get data layout and width/height indices - const DataLayout data_layout = info.data_layout == DataLayout::UNKNOWN ? input->info()->data_layout() : info.data_layout; - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const DataLayout data_layout = + info.data_layout == DataLayout::UNKNOWN ? input->info()->data_layout() : info.data_layout; + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); // Compute the ratio between source width/height and destination width/height - const bool is_align_corners_used = info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy); - const auto wr = arm_compute::scale_utils::calculate_resize_ratio(input->info()->dimension(idx_width), output->info()->dimension(idx_width), is_align_corners_used); - const auto hr = arm_compute::scale_utils::calculate_resize_ratio(input->info()->dimension(idx_height), output->info()->dimension(idx_height), is_align_corners_used); + const bool is_align_corners_used = + info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy); + const auto wr = arm_compute::scale_utils::calculate_resize_ratio( + input->info()->dimension(idx_width), output->info()->dimension(idx_width), is_align_corners_used); + const auto hr = arm_compute::scale_utils::calculate_resize_ratio( + input->info()->dimension(idx_height), output->info()->dimension(idx_height), is_align_corners_used); // Area interpolation behaves as Nearest Neighbour in case of up-sampling - InterpolationPolicy policy_to_use = (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ? InterpolationPolicy::NEAREST_NEIGHBOR : info.interpolation_policy; + InterpolationPolicy policy_to_use = + (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) + ? InterpolationPolicy::NEAREST_NEIGHBOR + : info.interpolation_policy; // Get the tensor shape TensorShape shape(output->info()->dimension(idx_width)); shape.set(1, output->info()->dimension(idx_height), false); - bool precompute_indices_weights = arm_compute::scale_utils::is_precomputation_required(data_layout, input->info()->data_type(), policy_to_use, info.border_mode); + bool precompute_indices_weights = arm_compute::scale_utils::is_precomputation_required( + data_layout, input->info()->data_type(), policy_to_use, info.border_mode); - if(precompute_indices_weights) + if (precompute_indices_weights) { const TensorInfo tensor_info_dxdy(shape, Format::F32); const TensorInfo tensor_info_offsets(shape, Format::S32); @@ -83,7 +92,7 @@ void NEScale::configure(ITensor *input, ITensor *output, const ScaleKernelInfo & _impl->dx.allocator()->init(tensor_info_dxdy); _impl->dy.allocator()->init(tensor_info_dxdy); _impl->offsets.allocator()->init(tensor_info_offsets); - switch(policy_to_use) + switch (policy_to_use) { case InterpolationPolicy::NEAREST_NEIGHBOR: { @@ -109,7 +118,8 @@ void NEScale::configure(ITensor *input, ITensor *output, const ScaleKernelInfo & } else { - if(policy_to_use != InterpolationPolicy::NEAREST_NEIGHBOR && policy_to_use != InterpolationPolicy::BILINEAR && policy_to_use != InterpolationPolicy::AREA) + if (policy_to_use != InterpolationPolicy::NEAREST_NEIGHBOR && policy_to_use != InterpolationPolicy::BILINEAR && + policy_to_use != InterpolationPolicy::AREA) { ARM_COMPUTE_ERROR("Unsupported interpolation mode"); } diff --git a/src/runtime/NEON/functions/NESelect.cpp b/src/runtime/NEON/functions/NESelect.cpp index 26c2eb8fe9..55cad2202b 100644 --- a/src/runtime/NEON/functions/NESelect.cpp +++ b/src/runtime/NEON/functions/NESelect.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NESelect.h" #include "arm_compute/core/Types.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NESelectKernel.h" diff --git a/src/runtime/NEON/functions/NESlice.cpp b/src/runtime/NEON/functions/NESlice.cpp index 4a8912bfe9..12d43adc84 100644 --- a/src/runtime/NEON/functions/NESlice.cpp +++ b/src/runtime/NEON/functions/NESlice.cpp @@ -25,8 +25,9 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/helpers/tensor_transform.h" +#include "arm_compute/core/Validate.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEStridedSliceKernel.h" @@ -34,7 +35,10 @@ namespace arm_compute { namespace experimental { -void NESlice::configure(const ITensorInfo *input, ITensorInfo *output, const Coordinates &starts, const Coordinates &ends) +void NESlice::configure(const ITensorInfo *input, + ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); ARM_COMPUTE_LOG_PARAMS(input, output, starts, ends); @@ -47,15 +51,16 @@ void NESlice::configure(const ITensorInfo *input, ITensorInfo *output, const Coo _kernel = std::move(k); } -Status NESlice::validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends) +Status NESlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); // Check start dimensions for being non-negative - ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(starts.cbegin(), starts.cbegin() + starts.num_dimensions(), [](int i) - { - return i < 0; - })); + ARM_COMPUTE_RETURN_ERROR_ON( + std::any_of(starts.cbegin(), starts.cbegin() + starts.num_dimensions(), [](int i) { return i < 0; })); // Get absolute end coordinates const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends); @@ -66,20 +71,22 @@ Status NESlice::validate(const ITensorInfo *input, const ITensorInfo *output, co struct NESlice::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NESlice::NESlice() - : _impl(std::make_unique()) +NESlice::NESlice() : _impl(std::make_unique()) { } -NESlice::NESlice(NESlice &&) = default; +NESlice::NESlice(NESlice &&) = default; NESlice &NESlice::operator=(NESlice &&) = default; NESlice::~NESlice() = default; -Status NESlice::validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends) +Status NESlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends) { return experimental::NESlice::validate(input, output, starts, ends); } diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp index 0947ff94a6..e3c2012d05 100644 --- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp +++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp @@ -22,9 +22,11 @@ * SOFTWARE. */ #include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h" + #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/Tensor.h" + #include "src/core/helpers/MemoryHelpers.h" #include "src/core/helpers/SoftmaxHelpers.h" #include "src/cpu/kernels/CpuSoftmaxKernel.h" @@ -35,10 +37,10 @@ namespace arm_compute template struct NESoftmaxLayerGeneric::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - Tensor max{ nullptr }; - std::unique_ptr> op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + Tensor max{nullptr}; + std::unique_ptr> op{nullptr}; MemoryGroup memory_group{}; ITensorPack run_pack{}; WorkspaceData workspace_tensors{}; @@ -53,9 +55,9 @@ NESoftmaxLayerGeneric::NESoftmaxLayerGeneric(std::shared_ptr NESoftmaxLayerGeneric::NESoftmaxLayerGeneric(NESoftmaxLayerGeneric &&) = default; -template +template NESoftmaxLayerGeneric &NESoftmaxLayerGeneric::operator=(NESoftmaxLayerGeneric &&) = default; -template +template NESoftmaxLayerGeneric::~NESoftmaxLayerGeneric() = default; template @@ -68,12 +70,13 @@ void NESoftmaxLayerGeneric::configure(ITensor *input, ITensor *output, f _impl->op = std::make_unique>(); _impl->op->configure(input->info(), output->info(), beta, axis); - _impl->run_pack = { { TensorType::ACL_SRC, _impl->src }, { TensorType::ACL_DST, _impl->dst } }; + _impl->run_pack = {{TensorType::ACL_SRC, _impl->src}, {TensorType::ACL_DST, _impl->dst}}; _impl->workspace_tensors = manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack); } template -Status NESoftmaxLayerGeneric::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis) +Status +NESoftmaxLayerGeneric::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuSoftmaxGeneric::validate(input, output, beta, axis)); @@ -81,7 +84,7 @@ Status NESoftmaxLayerGeneric::validate(const ITensorInfo *input, const I } template -void NESoftmaxLayerGeneric::run() +void NESoftmaxLayerGeneric::run() { // Acquire all the temporaries MemoryGroupResourceScope scope_mg(_impl->memory_group); diff --git a/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp b/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp index c4509510dc..556ebdd800 100644 --- a/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp +++ b/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp @@ -28,8 +28,9 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" #include "arm_compute/runtime/NEON/functions/NEFill.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NESpaceToBatchLayerKernel.h" @@ -37,17 +38,19 @@ namespace arm_compute { NESpaceToBatchLayer::~NESpaceToBatchLayer() = default; -NESpaceToBatchLayer::NESpaceToBatchLayer() - : _space_to_batch_kernel(), _fill_f(), _has_padding(false) +NESpaceToBatchLayer::NESpaceToBatchLayer() : _space_to_batch_kernel(), _fill_f(), _has_padding(false) { } -void NESpaceToBatchLayer::configure(const ITensor *input, const ITensor *block_shape, const ITensor *paddings, ITensor *output) +void NESpaceToBatchLayer::configure(const ITensor *input, + const ITensor *block_shape, + const ITensor *paddings, + ITensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output); ARM_COMPUTE_LOG_PARAMS(input, block_shape, paddings, output); - if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) + if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) { _has_padding = true; _fill_f = std::make_unique(); @@ -57,11 +60,16 @@ void NESpaceToBatchLayer::configure(const ITensor *input, const ITensor *block_s _space_to_batch_kernel->configure(input, block_shape, paddings, output); } -void NESpaceToBatchLayer::configure(const ITensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ITensor *output) +void NESpaceToBatchLayer::configure(const ITensor *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + ITensor *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) + if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size()) { _has_padding = true; _fill_f = std::make_unique(); @@ -71,17 +79,25 @@ void NESpaceToBatchLayer::configure(const ITensor *input, const int block_shape_ _space_to_batch_kernel->configure(input, block_shape_x, block_shape_y, padding_left, padding_right, output); } -Status NESpaceToBatchLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output) +Status NESpaceToBatchLayer::validate(const ITensorInfo *input, + const ITensorInfo *block_shape, + const ITensorInfo *paddings, + const ITensorInfo *output) { ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToBatchLayerKernel::validate(input, block_shape, paddings, output)); return Status{}; } -Status NESpaceToBatchLayer::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, +Status NESpaceToBatchLayer::validate(const ITensorInfo *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, const ITensorInfo *output) { - ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output)); + ARM_COMPUTE_RETURN_ON_ERROR( + NESpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output)); return Status{}; } @@ -89,7 +105,7 @@ Status NESpaceToBatchLayer::validate(const ITensorInfo *input, const int block_s void NESpaceToBatchLayer::run() { // Zero out output only if we have paddings - if(_has_padding) + if (_has_padding) { _fill_f->run(); } diff --git a/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp b/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp index b37bf0d20f..846b619429 100644 --- a/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp +++ b/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp @@ -29,6 +29,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NESpaceToDepthLayerKernel.h" @@ -36,8 +37,7 @@ namespace arm_compute { NESpaceToDepthLayer::~NESpaceToDepthLayer() = default; -NESpaceToDepthLayer::NESpaceToDepthLayer() - : _space_to_depth_kernel() +NESpaceToDepthLayer::NESpaceToDepthLayer() : _space_to_depth_kernel() { } diff --git a/src/runtime/NEON/functions/NESplit.cpp b/src/runtime/NEON/functions/NESplit.cpp index db19bbb824..53b09e9ae5 100644 --- a/src/runtime/NEON/functions/NESplit.cpp +++ b/src/runtime/NEON/functions/NESplit.cpp @@ -34,7 +34,7 @@ namespace arm_compute { void NESplit::run() { - for(unsigned i = 0; i < _num_outputs; ++i) + for (unsigned i = 0; i < _num_outputs; ++i) { _slice_functions[i].run(); } diff --git a/src/runtime/NEON/functions/NEStackLayer.cpp b/src/runtime/NEON/functions/NEStackLayer.cpp index 68554e0931..03e7026691 100644 --- a/src/runtime/NEON/functions/NEStackLayer.cpp +++ b/src/runtime/NEON/functions/NEStackLayer.cpp @@ -30,6 +30,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/NEON/NEScheduler.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEStackLayerKernel.h" @@ -38,9 +39,7 @@ namespace arm_compute NEStackLayer::~NEStackLayer() = default; NEStackLayer::NEStackLayer() // NOLINT - : _input(), - _stack_kernels(), - _num_inputs(0) + : _input(), _stack_kernels(), _num_inputs(0) { } @@ -54,7 +53,7 @@ void NEStackLayer::configure(const std::vector &input, int axis, ITen // Wrap around negative values const unsigned int axis_u = wrap_around(axis, static_cast(input[0]->info()->num_dimensions() + 1)); - for(unsigned int i = 0; i < _num_inputs; i++) + for (unsigned int i = 0; i < _num_inputs; i++) { _stack_kernels[i] = std::make_unique(); _stack_kernels[i]->configure(input[i], axis_u, i, _num_inputs, output); @@ -72,7 +71,7 @@ Status NEStackLayer::validate(const std::vector &input, int axis, const unsigned int num_inputs = input.size(); - for(unsigned int i = 0; i < num_inputs; i++) + for (unsigned int i = 0; i < num_inputs; i++) { // All the tensors must have the same rank ARM_COMPUTE_RETURN_ERROR_ON(input[i]->num_dimensions() != rank); @@ -85,7 +84,7 @@ Status NEStackLayer::validate(const std::vector &input, int axis, void NEStackLayer::run() { - for(unsigned i = 0; i < _num_inputs; i++) + for (unsigned i = 0; i < _num_inputs; i++) { NEScheduler::get().schedule(_stack_kernels[i].get(), Window::DimY); } diff --git a/src/runtime/NEON/functions/NEStridedSlice.cpp b/src/runtime/NEON/functions/NEStridedSlice.cpp index 4f50749a4f..6a3ac8be05 100644 --- a/src/runtime/NEON/functions/NEStridedSlice.cpp +++ b/src/runtime/NEON/functions/NEStridedSlice.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" + #include "src/common/utils/Log.h" #include "src/core/NEON/kernels/NEStridedSliceKernel.h" @@ -32,9 +33,14 @@ namespace arm_compute { namespace experimental { -void NEStridedSlice::configure(const ITensorInfo *input, ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +void NEStridedSlice::configure(const ITensorInfo *input, + ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { ARM_COMPUTE_LOG_PARAMS(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); @@ -43,9 +49,14 @@ void NEStridedSlice::configure(const ITensorInfo *input, ITensorInfo *output, _kernel = std::move(k); } -Status NEStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +Status NEStridedSlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { return NEStridedSliceKernel::validate(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); } @@ -53,22 +64,26 @@ Status NEStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *out struct NEStridedSlice::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NEStridedSlice::NEStridedSlice() - : _impl(std::make_unique()) +NEStridedSlice::NEStridedSlice() : _impl(std::make_unique()) { } -NEStridedSlice::NEStridedSlice(NEStridedSlice &&) = default; +NEStridedSlice::NEStridedSlice(NEStridedSlice &&) = default; NEStridedSlice &NEStridedSlice::operator=(NEStridedSlice &&) = default; NEStridedSlice::~NEStridedSlice() = default; -void NEStridedSlice::configure(const ITensor *input, ITensor *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +void NEStridedSlice::configure(const ITensor *input, + ITensor *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { _impl->src = input; _impl->dst = output; @@ -84,10 +99,16 @@ void NEStridedSlice::run() _impl->op->run(pack); } -Status NEStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask) +Status NEStridedSlice::validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask, + int32_t end_mask, + int32_t shrink_axis_mask) { - return experimental::NEStridedSlice::validate(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask); + return experimental::NEStridedSlice::validate(input, output, starts, ends, strides, begin_mask, end_mask, + shrink_axis_mask); } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NETile.cpp b/src/runtime/NEON/functions/NETile.cpp index 526603f1a3..d10b1c8e95 100644 --- a/src/runtime/NEON/functions/NETile.cpp +++ b/src/runtime/NEON/functions/NETile.cpp @@ -23,9 +23,8 @@ */ #include "arm_compute/runtime/NEON/functions/NETile.h" -#include "src/core/NEON/kernels/NETileKernel.h" - #include "src/common/utils/Log.h" +#include "src/core/NEON/kernels/NETileKernel.h" namespace arm_compute { diff --git a/src/runtime/NEON/functions/NETranspose.cpp b/src/runtime/NEON/functions/NETranspose.cpp index 78c7ea202a..0144a85e8c 100644 --- a/src/runtime/NEON/functions/NETranspose.cpp +++ b/src/runtime/NEON/functions/NETranspose.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NETranspose.h" #include "arm_compute/core/Validate.h" + #include "src/common/utils/Log.h" #include "src/cpu/operators/CpuTranspose.h" @@ -31,13 +32,12 @@ namespace arm_compute { struct NETranspose::Impl { - const ITensor *src{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr op{ nullptr }; + const ITensor *src{nullptr}; + ITensor *dst{nullptr}; + std::unique_ptr op{nullptr}; }; -NETranspose::NETranspose() - : _impl(std::make_unique()) +NETranspose::NETranspose() : _impl(std::make_unique()) { } diff --git a/src/runtime/NEON/functions/NEUnstack.cpp b/src/runtime/NEON/functions/NEUnstack.cpp index 0ffab5e92a..2f7ed2bb1f 100644 --- a/src/runtime/NEON/functions/NEUnstack.cpp +++ b/src/runtime/NEON/functions/NEUnstack.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/common/utils/Log.h" namespace arm_compute @@ -39,13 +40,15 @@ inline unsigned int wrap_axis(int axis, const ITensorInfo *const tensor) return wrap_around(axis, static_cast(tensor->num_dimensions())); } -inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, int32_t &slice_end_mask, const unsigned int input_num_dimensions) +inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, + int32_t &slice_end_mask, + const unsigned int input_num_dimensions) { // Setups up coordinates to slice the input tensor: start coordinates to all 0s and the unstacking axis of both Start/End to slice just one 2d tensor at a time. Coordinates slice_end; slice_start.set_num_dimensions(input_num_dimensions); slice_end.set_num_dimensions(input_num_dimensions); - for(size_t k = 0; k < input_num_dimensions; ++k) + for (size_t k = 0; k < input_num_dimensions; ++k) { slice_start.set(k, 0); slice_end.set(k, -1); @@ -55,19 +58,19 @@ inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, int32_t & } // namespace NEUnstack::NEUnstack() // NOLINT - : _num_slices(0), - _strided_slice_vector() + : _num_slices(0), _strided_slice_vector() { } void NEUnstack::configure(const ITensor *input, const std::vector &output_vector, int axis) { std::vector outputs_vector_info(output_vector.size()); - std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(), [](ITensor * t) - { - ARM_COMPUTE_ERROR_ON_NULLPTR(t); - return t->info(); - }); + std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(), + [](ITensor *t) + { + ARM_COMPUTE_ERROR_ON_NULLPTR(t); + return t->info(); + }); ARM_COMPUTE_ERROR_ON_NULLPTR(input); ARM_COMPUTE_ERROR_THROW_ON(NEUnstack::validate(input->info(), outputs_vector_info, axis)); @@ -81,11 +84,12 @@ void NEUnstack::configure(const ITensor *input, const std::vector &ou Coordinates slice_start; int32_t slice_end_mask; setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->info()->tensor_shape().num_dimensions()); - for(unsigned int slice = 0; slice < _num_slices; ++slice) + for (unsigned int slice = 0; slice < _num_slices; ++slice) { // Adjusts start and end coordinates to take a 2D slice at a time slice_start.set(axis_u, slice); - _strided_slice_vector[slice].configure(input, output_vector[slice], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << axis_u)); + _strided_slice_vector[slice].configure(input, output_vector[slice], slice_start, Coordinates(), BiStrides(), 0, + slice_end_mask, (1 << axis_u)); } } @@ -102,18 +106,20 @@ Status NEUnstack::validate(const ITensorInfo *input, const std::vectortensor_shape().num_dimensions()); - ARM_COMPUTE_RETURN_ON_ERROR(NEStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << wrap_axis(axis, input)))); + ARM_COMPUTE_RETURN_ON_ERROR(NEStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(), + BiStrides(), 0, slice_end_mask, + (1 << wrap_axis(axis, input)))); } return Status{}; } void NEUnstack::run() { - for(unsigned i = 0; i < _num_slices; ++i) + for (unsigned i = 0; i < _num_slices; ++i) { _strided_slice_vector[i].run(); } diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp index a8eded29ff..8d77abcfc7 100644 --- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp @@ -26,15 +26,15 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + #include "src/core/CPP/Validate.h" #include "src/core/helpers/MemoryHelpers.h" +#include "src/core/NEON/kernels/convolution/common/utils.hpp" #include "src/cpu/kernels/CpuWinogradConv2dKernel.h" #include "src/cpu/operators/CpuWinogradConv2d.h" -#include "src/core/NEON/kernels/convolution/common/utils.hpp" - namespace arm_compute { using namespace arm_compute::experimental; @@ -42,14 +42,14 @@ using namespace arm_compute::experimental; struct NEWinogradConvolutionLayer::Impl { MemoryGroup memory_group{}; - std::unique_ptr op{ nullptr }; + std::unique_ptr op{nullptr}; ITensorPack run_pack{}; ITensorPack prep_pack{}; WorkspaceData workspace{}; experimental::MemoryRequirements aux_mem_req{}; - const ITensor *original_weights{ nullptr }; - bool is_prepared{ false }; - bool is_activationlayer_enabled{ false }; + const ITensor *original_weights{nullptr}; + bool is_prepared{false}; + bool is_activationlayer_enabled{false}; DataLayout data_layout{}; }; @@ -61,17 +61,24 @@ NEWinogradConvolutionLayer::NEWinogradConvolutionLayer(const std::shared_ptroriginal_weights = weights; _impl->op = std::make_unique(); - _impl->op->configure(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), conv_info, act_info, enable_fast_math); + _impl->op->configure(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), + conv_info, act_info, enable_fast_math); _impl->aux_mem_req = _impl->op->workspace(); - _impl->run_pack = { { ACL_SRC_0, input }, { ACL_SRC_1, weights }, { ACL_SRC_2, biases }, { ACL_DST, output } }; - _impl->prep_pack = { { ACL_SRC_1, weights }, { ACL_SRC_2, biases } }; - _impl->workspace = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); + _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}}; + _impl->prep_pack = {{ACL_SRC_1, weights}, {ACL_SRC_2, biases}}; + _impl->workspace = + manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); } void NEWinogradConvolutionLayer::run() @@ -82,15 +89,20 @@ void NEWinogradConvolutionLayer::run() _impl->op->run(_impl->run_pack); } -Status NEWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info, bool enable_fast_math) +Status NEWinogradConvolutionLayer::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { return cpu::CpuWinogradConv2d::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math); } void NEWinogradConvolutionLayer::prepare() { - if(!_impl->is_prepared) + if (!_impl->is_prepared) { _impl->op->prepare(_impl->prep_pack); _impl->original_weights->mark_as_unused(); -- cgit v1.2.1