Diffstat (limited to 'src/runtime/NEON/functions')
-rw-r--r--  src/runtime/NEON/functions/NEActivationLayer.cpp | 19
-rw-r--r--  src/runtime/NEON/functions/NEAddMulAdd.cpp | 89
-rw-r--r--  src/runtime/NEON/functions/NEArgMinMaxLayer.cpp | 55
-rw-r--r--  src/runtime/NEON/functions/NEArithmeticAddition.cpp | 28
-rw-r--r--  src/runtime/NEON/functions/NEArithmeticSubtraction.cpp | 28
-rw-r--r--  src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp | 29
-rw-r--r--  src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp | 21
-rw-r--r--  src/runtime/NEON/functions/NEBitwiseAnd.cpp | 4
-rw-r--r--  src/runtime/NEON/functions/NEBitwiseNot.cpp | 4
-rw-r--r--  src/runtime/NEON/functions/NEBitwiseOr.cpp | 4
-rw-r--r--  src/runtime/NEON/functions/NEBitwiseXor.cpp | 4
-rw-r--r--  src/runtime/NEON/functions/NEBoundingBoxTransform.cpp | 15
-rw-r--r--  src/runtime/NEON/functions/NECast.cpp | 23
-rw-r--r--  src/runtime/NEON/functions/NEChannelShuffleLayer.cpp | 5
-rw-r--r--  src/runtime/NEON/functions/NEConcatenateLayer.cpp | 30
-rw-r--r--  src/runtime/NEON/functions/NEConv3D.cpp | 87
-rw-r--r--  src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp | 27
-rw-r--r--  src/runtime/NEON/functions/NEConvolutionLayer.cpp | 281
-rw-r--r--  src/runtime/NEON/functions/NECopy.cpp | 14
-rw-r--r--  src/runtime/NEON/functions/NECropResize.cpp | 56
-rw-r--r--  src/runtime/NEON/functions/NEDeconvolutionLayer.cpp | 197
-rw-r--r--  src/runtime/NEON/functions/NEDepthConvertLayer.cpp | 19
-rw-r--r--  src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp | 21
-rw-r--r--  src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp | 205
-rw-r--r--  src/runtime/NEON/functions/NEDequantizationLayer.cpp | 12
-rw-r--r--  src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp | 62
-rw-r--r--  src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp | 29
-rw-r--r--  src/runtime/NEON/functions/NEElementwiseOperations.cpp | 152
-rw-r--r--  src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp | 17
-rw-r--r--  src/runtime/NEON/functions/NEFFT1D.cpp | 33
-rw-r--r--  src/runtime/NEON/functions/NEFFT2D.cpp | 15
-rw-r--r--  src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp | 107
-rw-r--r--  src/runtime/NEON/functions/NEFill.cpp | 12
-rw-r--r--  src/runtime/NEON/functions/NEFillBorder.cpp | 13
-rw-r--r--  src/runtime/NEON/functions/NEFlattenLayer.cpp | 24
-rw-r--r--  src/runtime/NEON/functions/NEFloor.cpp | 14
-rw-r--r--  src/runtime/NEON/functions/NEFullyConnectedLayer.cpp | 495
-rw-r--r--  src/runtime/NEON/functions/NEFuseBatchNormalization.cpp | 44
-rw-r--r--  src/runtime/NEON/functions/NEGEMM.cpp | 418
-rw-r--r--  src/runtime/NEON/functions/NEGEMMConv2d.cpp | 79
-rw-r--r--  src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp | 658
-rw-r--r--  src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp | 631
-rw-r--r--  src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp | 178
-rw-r--r--  src/runtime/NEON/functions/NEGather.cpp | 4
-rw-r--r--  src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp | 189
-rw-r--r--  src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp | 31
-rw-r--r--  src/runtime/NEON/functions/NEL2NormalizeLayer.cpp | 9
-rw-r--r--  src/runtime/NEON/functions/NELSTMLayer.cpp | 518
-rw-r--r--  src/runtime/NEON/functions/NELSTMLayerQuantized.cpp | 404
-rw-r--r--  src/runtime/NEON/functions/NELogical.cpp | 16
-rw-r--r--  src/runtime/NEON/functions/NEMatMul.cpp | 85
-rw-r--r--  src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp | 54
-rw-r--r--  src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp | 5
-rw-r--r--  src/runtime/NEON/functions/NENormalizationLayer.cpp | 14
-rw-r--r--  src/runtime/NEON/functions/NEPReluLayer.cpp | 16
-rw-r--r--  src/runtime/NEON/functions/NEPadLayer.cpp | 95
-rw-r--r--  src/runtime/NEON/functions/NEPermute.cpp | 12
-rw-r--r--  src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp | 52
-rw-r--r--  src/runtime/NEON/functions/NEPooling3dLayer.cpp | 76
-rw-r--r--  src/runtime/NEON/functions/NEPoolingLayer.cpp | 23
-rw-r--r--  src/runtime/NEON/functions/NEPriorBoxLayer.cpp | 18
-rw-r--r--  src/runtime/NEON/functions/NEQLSTMLayer.cpp | 1168
-rw-r--r--  src/runtime/NEON/functions/NEQuantizationLayer.cpp | 12
-rw-r--r--  src/runtime/NEON/functions/NERNNLayer.cpp | 55
-rw-r--r--  src/runtime/NEON/functions/NEROIAlignLayer.cpp | 15
-rw-r--r--  src/runtime/NEON/functions/NEROIPoolingLayer.cpp | 20
-rw-r--r--  src/runtime/NEON/functions/NERange.cpp | 10
-rw-r--r--  src/runtime/NEON/functions/NEReduceMean.cpp | 117
-rw-r--r--  src/runtime/NEON/functions/NEReductionOperation.cpp | 79
-rw-r--r--  src/runtime/NEON/functions/NERemap.cpp | 49
-rw-r--r--  src/runtime/NEON/functions/NEReorderLayer.cpp | 66
-rw-r--r--  src/runtime/NEON/functions/NEReorgLayer.cpp | 5
-rw-r--r--  src/runtime/NEON/functions/NEReshapeLayer.cpp | 14
-rw-r--r--  src/runtime/NEON/functions/NEReverse.cpp | 16
-rw-r--r--  src/runtime/NEON/functions/NEScale.cpp | 105
-rw-r--r--  src/runtime/NEON/functions/NESelect.cpp | 6
-rw-r--r--  src/runtime/NEON/functions/NESlice.cpp | 39
-rw-r--r--  src/runtime/NEON/functions/NESoftmaxLayer.cpp | 37
-rw-r--r--  src/runtime/NEON/functions/NESpaceToBatchLayer.cpp | 40
-rw-r--r--  src/runtime/NEON/functions/NESpaceToDepthLayer.cpp | 9
-rw-r--r--  src/runtime/NEON/functions/NESplit.cpp | 2
-rw-r--r--  src/runtime/NEON/functions/NEStackLayer.cpp | 35
-rw-r--r--  src/runtime/NEON/functions/NEStridedSlice.cpp | 64
-rw-r--r--  src/runtime/NEON/functions/NETile.cpp | 5
-rw-r--r--  src/runtime/NEON/functions/NETranspose.cpp | 14
-rw-r--r--  src/runtime/NEON/functions/NEUnstack.cpp | 38
-rw-r--r--  src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp | 778
87 files changed, 4057 insertions(+), 4620 deletions(-)
diff --git a/src/runtime/NEON/functions/NEActivationLayer.cpp b/src/runtime/NEON/functions/NEActivationLayer.cpp
index 2b5c51fa5a..59199452ce 100644
--- a/src/runtime/NEON/functions/NEActivationLayer.cpp
+++ b/src/runtime/NEON/functions/NEActivationLayer.cpp
@@ -24,24 +24,24 @@
#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
#include "arm_compute/core/Validate.h"
-#include "src/runtime/cpu/operators/CpuActivation.h"
+
+#include "src/cpu/operators/CpuActivation.h"
namespace arm_compute
{
struct NEActivationLayer::Impl
{
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- IRuntimeContext *ctx{ nullptr };
- std::unique_ptr<cpu::CpuActivation> op{ nullptr };
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ IRuntimeContext *ctx{nullptr};
+ std::unique_ptr<cpu::CpuActivation> op{nullptr};
};
-NEActivationLayer::NEActivationLayer(IRuntimeContext *ctx)
- : _impl(std::make_unique<Impl>())
+NEActivationLayer::NEActivationLayer(IRuntimeContext *ctx) : _impl(std::make_unique<Impl>())
{
_impl->ctx = ctx;
}
-NEActivationLayer::NEActivationLayer(NEActivationLayer &&) = default;
+NEActivationLayer::NEActivationLayer(NEActivationLayer &&) = default;
NEActivationLayer &NEActivationLayer::operator=(NEActivationLayer &&) = default;
NEActivationLayer::~NEActivationLayer() = default;
@@ -56,7 +56,8 @@ void NEActivationLayer::configure(ITensor *input, ITensor *output, ActivationLay
_impl->op->configure(_impl->src->info(), _impl->dst->info(), activation_info);
}
-Status NEActivationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status
+NEActivationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info)
{
return cpu::CpuActivation::validate(input, output, act_info);
}
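
For orientation (illustrative, not part of the patch): the public interface is unchanged by the move to the relocated cpu::CpuActivation operator, so existing callers keep working. A minimal usage sketch, assuming 2D F32 tensors:

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void activation_example()
{
    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));

    NEActivationLayer act;
    // configure() forwards the ITensorInfo pair to cpu::CpuActivation
    act.configure(&src, &dst, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));

    src.allocator()->allocate();
    dst.allocator()->allocate();
    act.run(); // run() wraps the cached src/dst pointers in an ITensorPack
}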
diff --git a/src/runtime/NEON/functions/NEAddMulAdd.cpp b/src/runtime/NEON/functions/NEAddMulAdd.cpp
new file mode 100644
index 0000000000..a72364791c
--- /dev/null
+++ b/src/runtime/NEON/functions/NEAddMulAdd.cpp
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/NEON/functions/NEAddMulAdd.h"
+
+#include "arm_compute/runtime/Tensor.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/cpu/operators/CpuAddMulAdd.h"
+
+namespace arm_compute
+{
+struct NEAddMulAdd::Impl
+{
+ std::unique_ptr<cpu::CpuAddMulAdd> op{nullptr};
+ WorkspaceData<Tensor> workspace_tensors{};
+ ITensorPack run_pack{};
+ MemoryGroup memory_group{};
+};
+
+NEAddMulAdd::NEAddMulAdd(std::shared_ptr<IMemoryManager> memory_manager) : _impl(std::make_unique<Impl>())
+{
+ _impl->memory_group = MemoryGroup(std::move(memory_manager));
+}
+
+NEAddMulAdd::~NEAddMulAdd() = default;
+
+void NEAddMulAdd::configure(ITensor *input1,
+ ITensor *input2,
+ ITensor *bn_mul,
+ ITensor *bn_add,
+ ITensor *add_output,
+ ITensor *final_output,
+ const ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_LOG_PARAMS(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info);
+
+ _impl->op = std::make_unique<cpu::CpuAddMulAdd>();
+ _impl->op->configure(input1->info(), input2->info(), bn_mul->info(), bn_add->info(),
+ add_output != nullptr ? add_output->info() : nullptr, final_output->info(), policy, act_info);
+
+ _impl->run_pack = {
+ {TensorType::ACL_SRC_0, input1}, {TensorType::ACL_SRC_1, input2}, {TensorType::ACL_SRC_2, bn_mul},
+ {TensorType::ACL_SRC_3, bn_add}, {TensorType::ACL_DST_0, add_output}, {TensorType::ACL_DST_1, final_output},
+ };
+
+ _impl->workspace_tensors = manage_workspace<Tensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack);
+}
+
+Status NEAddMulAdd::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *bn_mul,
+ const ITensorInfo *bn_add,
+ const ITensorInfo *add_output,
+ const ITensorInfo *final_output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
+{
+ return cpu::CpuAddMulAdd::validate(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info);
+}
+
+void NEAddMulAdd::run()
+{
+ _impl->op->run(_impl->run_pack);
+}
+} // namespace arm_compute
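
A usage sketch for the new fused function (illustrative, not from the patch), assuming pre-initialized tensors whose shapes and types pass cpu::CpuAddMulAdd::validate():

#include "arm_compute/runtime/NEON/functions/NEAddMulAdd.h"

using namespace arm_compute;

void add_mul_add_example(ITensor *in0, ITensor *in1, ITensor *bn_mul, ITensor *bn_add, ITensor *out)
{
    NEAddMulAdd fused;
    // add_output may be nullptr when the intermediate sum is not needed;
    // configure() above guards this case with "add_output != nullptr".
    fused.configure(in0, in1, bn_mul, bn_add, /*add_output=*/nullptr, out,
                    ConvertPolicy::SATURATE, ActivationLayerInfo());
    fused.run(); // workspace tensors were allocated via manage_workspace() at configure time
}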
diff --git a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
index 7bca20d46c..fbaf1a96e7 100644
--- a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
+++ b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,31 +29,68 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/functions/NECast.h"
+#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEReductionOperationKernel.h"
namespace arm_compute
{
+struct NEArgMinMaxLayer::Impl
+{
+ MemoryGroup memory_group{};
+ std::shared_ptr<IMemoryManager> memory_manager{};
+ std::unique_ptr<NEReductionOperation> reduction_function{};
+ std::unique_ptr<NECast> cast_function{};
+ std::unique_ptr<Tensor> tmp_reduction_result{};
+};
+
NEArgMinMaxLayer::~NEArgMinMaxLayer() = default;
-NEArgMinMaxLayer::NEArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _reduction_function(std::make_unique<NEReductionOperation>())
+NEArgMinMaxLayer::NEArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manager) : _impl(std::make_unique<Impl>())
{
- ARM_COMPUTE_UNUSED(memory_manager);
+ _impl->memory_manager = std::move(memory_manager);
}
+
void NEArgMinMaxLayer::configure(ITensor *input, int axis, ITensor *output, const ReductionOperation &op)
{
- _reduction_function->configure(input, output, axis, op, false);
+ ARM_COMPUTE_LOG_PARAMS(input, axis, output, op);
+ _impl->reduction_function = std::make_unique<NEReductionOperation>();
+ if (output->info() &&
+ (output->info()->data_type() == DataType::S64 || output->info()->data_type() == DataType::U64))
+ {
+ _impl->memory_group = MemoryGroup(std::move(_impl->memory_manager));
+ _impl->cast_function = std::make_unique<NECast>();
+ _impl->tmp_reduction_result = std::make_unique<Tensor>();
+ _impl->reduction_function->configure(input, _impl->tmp_reduction_result.get(), axis, op, false);
+ _impl->cast_function->configure(_impl->tmp_reduction_result.get(), output, ConvertPolicy::SATURATE);
+ _impl->memory_group.manage(_impl->tmp_reduction_result.get());
+ _impl->tmp_reduction_result->allocator()->allocate();
+ }
+ else
+ {
+ _impl->reduction_function->configure(input, output, axis, op, false);
+ }
}
-Status NEArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op)
+Status
+NEArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Invalid operation");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN,
+ "Invalid operation");
return NEReductionOperation::validate(input, output, axis, op, false);
}
void NEArgMinMaxLayer::run()
{
- _reduction_function->run();
+ MemoryGroupResourceScope scope_mg(_impl->memory_group);
+ _impl->reduction_function->run();
+ if (_impl->tmp_reduction_result != nullptr)
+ {
+ _impl->cast_function->run();
+ }
}
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
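
The interesting new path is the 64-bit index output: configure() reduces into a temporary tensor and appends an NECast. A sketch (illustrative, not from the patch), assuming the intermediate reduction result defaults to a 32-bit index type:

#include "arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void argminmax_example()
{
    Tensor input, output;
    input.allocator()->init(TensorInfo(TensorShape(32U, 8U), 1, DataType::F32));
    // Requesting S64 (or U64) indices triggers the internal reduction + NECast pipeline
    output.allocator()->init(TensorInfo(TensorShape(8U), 1, DataType::S64));

    NEArgMinMaxLayer argmax;
    argmax.configure(&input, /*axis=*/0, &output, ReductionOperation::ARG_IDX_MAX);

    input.allocator()->allocate();
    output.allocator()->allocate();
    argmax.run(); // reduction first, then the cast (tmp_reduction_result != nullptr)
}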
diff --git a/src/runtime/NEON/functions/NEArithmeticAddition.cpp b/src/runtime/NEON/functions/NEArithmeticAddition.cpp
index 2e4755b949..aff16ae9d1 100644
--- a/src/runtime/NEON/functions/NEArithmeticAddition.cpp
+++ b/src/runtime/NEON/functions/NEArithmeticAddition.cpp
@@ -24,7 +24,8 @@
#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
#include "arm_compute/core/Validate.h"
-#include "src/runtime/cpu/operators/CpuAdd.h"
+
+#include "src/cpu/operators/CpuAdd.h"
#include <utility>
@@ -32,26 +33,33 @@ namespace arm_compute
{
struct NEArithmeticAddition::Impl
{
- const ITensor *src_0{ nullptr };
- const ITensor *src_1{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuAdd> op{ nullptr };
+ const ITensor *src_0{nullptr};
+ const ITensor *src_1{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuAdd> op{nullptr};
};
-NEArithmeticAddition::NEArithmeticAddition()
- : _impl(std::make_unique<Impl>())
+NEArithmeticAddition::NEArithmeticAddition() : _impl(std::make_unique<Impl>())
{
}
-NEArithmeticAddition::NEArithmeticAddition(NEArithmeticAddition &&) = default;
+NEArithmeticAddition::NEArithmeticAddition(NEArithmeticAddition &&) = default;
NEArithmeticAddition &NEArithmeticAddition::operator=(NEArithmeticAddition &&) = default;
NEArithmeticAddition::~NEArithmeticAddition() = default;
-Status NEArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+Status NEArithmeticAddition::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
{
return cpu::CpuAdd::validate(input1, input2, output, policy, act_info);
}
-void NEArithmeticAddition::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+void NEArithmeticAddition::configure(const ITensor *input1,
+ const ITensor *input2,
+ ITensor *output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
{
_impl->src_0 = input1;
_impl->src_1 = input2;
diff --git a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp
index 0263d4cbb6..097525c1a8 100644
--- a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp
+++ b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp
@@ -24,7 +24,8 @@
#include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"
#include "arm_compute/core/ITensor.h"
-#include "src/runtime/cpu/operators/CpuSub.h"
+
+#include "src/cpu/operators/CpuSub.h"
#include <utility>
@@ -32,26 +33,33 @@ namespace arm_compute
{
struct NEArithmeticSubtraction::Impl
{
- const ITensor *src_0{ nullptr };
- const ITensor *src_1{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuSub> op{ nullptr };
+ const ITensor *src_0{nullptr};
+ const ITensor *src_1{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuSub> op{nullptr};
};
-NEArithmeticSubtraction::NEArithmeticSubtraction()
- : _impl(std::make_unique<Impl>())
+NEArithmeticSubtraction::NEArithmeticSubtraction() : _impl(std::make_unique<Impl>())
{
}
-NEArithmeticSubtraction::NEArithmeticSubtraction(NEArithmeticSubtraction &&) = default;
+NEArithmeticSubtraction::NEArithmeticSubtraction(NEArithmeticSubtraction &&) = default;
NEArithmeticSubtraction &NEArithmeticSubtraction::operator=(NEArithmeticSubtraction &&) = default;
NEArithmeticSubtraction::~NEArithmeticSubtraction() = default;
-Status NEArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+Status NEArithmeticSubtraction::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
{
return cpu::CpuSub::validate(input1, input2, output, policy, act_info);
}
-void NEArithmeticSubtraction::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+void NEArithmeticSubtraction::configure(const ITensor *input1,
+ const ITensor *input2,
+ ITensor *output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
{
_impl->src_0 = input1;
_impl->src_1 = input2;
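
Both element-wise functions share the same reshuffled signature; a combined sketch (illustrative, not from the patch):

#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
#include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"

using namespace arm_compute;

void add_sub_example(const ITensor *a, const ITensor *b, ITensor *sum, ITensor *diff)
{
    NEArithmeticAddition    add;
    NEArithmeticSubtraction sub;
    // SATURATE clamps on overflow for integer types; WRAP would let values wrap around
    add.configure(a, b, sum, ConvertPolicy::SATURATE, ActivationLayerInfo());
    sub.configure(a, b, diff, ConvertPolicy::SATURATE, ActivationLayerInfo());
    add.run();
    sub.run();
}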
diff --git a/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
index b90a38b47f..d491f0aafc 100644
--- a/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,29 +29,44 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h"
namespace arm_compute
{
NEBatchNormalizationLayer::~NEBatchNormalizationLayer() = default;
-NEBatchNormalizationLayer::NEBatchNormalizationLayer()
- : _norm_kernel()
+NEBatchNormalizationLayer::NEBatchNormalizationLayer() : _norm_kernel()
{
}
-void NEBatchNormalizationLayer::configure(ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon,
+void NEBatchNormalizationLayer::configure(ITensor *input,
+ ITensor *output,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta,
+ const ITensor *gamma,
+ float epsilon,
ActivationLayerInfo act_info)
{
+ ARM_COMPUTE_LOG_PARAMS(input, output, mean, var, beta, gamma, epsilon, act_info);
// Configure kernel
_norm_kernel = std::make_unique<NEBatchNormalizationLayerKernel>();
_norm_kernel->configure(input, output, mean, var, beta, gamma, epsilon, act_info);
}
-Status NEBatchNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *var, const ITensorInfo *beta, const ITensorInfo *gamma,
- float epsilon, ActivationLayerInfo act_info)
+Status NEBatchNormalizationLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *mean,
+ const ITensorInfo *var,
+ const ITensorInfo *beta,
+ const ITensorInfo *gamma,
+ float epsilon,
+ ActivationLayerInfo act_info)
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEBatchNormalizationLayerKernel::validate(input, output, mean, var, beta, gamma, epsilon, act_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEBatchNormalizationLayerKernel::validate(input, output, mean, var, beta, gamma, epsilon, act_info));
return Status{};
}
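
A usage sketch (illustrative, not from the patch); mean/var/beta/gamma are per-channel tensors, and epsilon stabilizes the 1/sqrt(var + epsilon) term:

#include "arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h"

using namespace arm_compute;

void batch_norm_example(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var,
                        const ITensor *beta, const ITensor *gamma)
{
    NEBatchNormalizationLayer bn;
    // The kernel's validate() is called internally through NEBatchNormalizationLayer::validate()
    bn.configure(src, dst, mean, var, beta, gamma, /*epsilon=*/0.001f,
                 ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
    bn.run();
}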
diff --git a/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp b/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp
index 8f537a650a..5d711c5ddf 100644
--- a/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp
+++ b/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,31 +28,40 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h"
namespace arm_compute
{
void NEBatchToSpaceLayer::configure(const ITensor *input, const ITensor *block_shape, ITensor *output)
{
+ ARM_COMPUTE_LOG_PARAMS(input, block_shape, output);
auto k = std::make_unique<NEBatchToSpaceLayerKernel>();
k->configure(input, block_shape, output);
_kernel = std::move(k);
}
-void NEBatchToSpaceLayer::configure(const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output)
+void NEBatchToSpaceLayer::configure(
+ const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output, const CropInfo &crop_info)
{
auto k = std::make_unique<NEBatchToSpaceLayerKernel>();
- k->configure(input, block_shape_x, block_shape_y, output);
+ k->configure(input, block_shape_x, block_shape_y, output, crop_info);
_kernel = std::move(k);
}
-Status NEBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
+Status
+NEBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
{
return NEBatchToSpaceLayerKernel::validate(input, block_shape, output);
}
-Status NEBatchToSpaceLayer::validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output)
+Status NEBatchToSpaceLayer::validate(const ITensorInfo *input,
+ int32_t block_shape_x,
+ int32_t block_shape_y,
+ const ITensorInfo *output,
+ const CropInfo &crop_info)
{
- return NEBatchToSpaceLayerKernel::validate(input, block_shape_x, block_shape_y, output);
+ return NEBatchToSpaceLayerKernel::validate(input, block_shape_x, block_shape_y, output, crop_info);
}
} // namespace arm_compute
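
The second configure()/validate() overload gains a CropInfo parameter that is now threaded through to the kernel to trim the borders of the rearranged output. A sketch (illustrative, not from the patch), assuming a default-constructed CropInfo means no cropping:

#include "arm_compute/runtime/NEON/functions/NEBatchToSpaceLayer.h"

using namespace arm_compute;

void batch_to_space_example(ITensor *src, ITensor *dst)
{
    NEBatchToSpaceLayer b2s;
    // Static 2x2 block shape; CropInfo{} (assumption: zero left/right/top/bottom crop)
    // leaves the spatial output untrimmed
    b2s.configure(src, /*block_shape_x=*/2, /*block_shape_y=*/2, dst, CropInfo{});
    b2s.run();
}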
diff --git a/src/runtime/NEON/functions/NEBitwiseAnd.cpp b/src/runtime/NEON/functions/NEBitwiseAnd.cpp
index 81c087988a..89ce2087be 100644
--- a/src/runtime/NEON/functions/NEBitwiseAnd.cpp
+++ b/src/runtime/NEON/functions/NEBitwiseAnd.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include "arm_compute/runtime/NEON/functions/NEBitwiseAnd.h"
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEBitwiseAndKernel.h"
#include <utility>
@@ -31,6 +32,7 @@ using namespace arm_compute;
void NEBitwiseAnd::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
{
+ ARM_COMPUTE_LOG_PARAMS(input1, input2, output);
auto k = std::make_unique<NEBitwiseAndKernel>();
k->configure(input1, input2, output);
_kernel = std::move(k);
diff --git a/src/runtime/NEON/functions/NEBitwiseNot.cpp b/src/runtime/NEON/functions/NEBitwiseNot.cpp
index 3155df5db3..eda59cd3e9 100644
--- a/src/runtime/NEON/functions/NEBitwiseNot.cpp
+++ b/src/runtime/NEON/functions/NEBitwiseNot.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include "arm_compute/runtime/NEON/functions/NEBitwiseNot.h"
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEBitwiseNotKernel.h"
#include <utility>
@@ -31,6 +32,7 @@ using namespace arm_compute;
void NEBitwiseNot::configure(const ITensor *input, ITensor *output)
{
+ ARM_COMPUTE_LOG_PARAMS(input, output);
auto k = std::make_unique<NEBitwiseNotKernel>();
k->configure(input, output);
_kernel = std::move(k);
diff --git a/src/runtime/NEON/functions/NEBitwiseOr.cpp b/src/runtime/NEON/functions/NEBitwiseOr.cpp
index 793eb25d80..3d6f30b0fe 100644
--- a/src/runtime/NEON/functions/NEBitwiseOr.cpp
+++ b/src/runtime/NEON/functions/NEBitwiseOr.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include "arm_compute/runtime/NEON/functions/NEBitwiseOr.h"
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEBitwiseOrKernel.h"
#include <utility>
@@ -31,6 +32,7 @@ using namespace arm_compute;
void NEBitwiseOr::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
{
+ ARM_COMPUTE_LOG_PARAMS(input1, input2, output);
auto k = std::make_unique<NEBitwiseOrKernel>();
k->configure(input1, input2, output);
_kernel = std::move(k);
diff --git a/src/runtime/NEON/functions/NEBitwiseXor.cpp b/src/runtime/NEON/functions/NEBitwiseXor.cpp
index 2d0af63e35..f0cf3d3e5c 100644
--- a/src/runtime/NEON/functions/NEBitwiseXor.cpp
+++ b/src/runtime/NEON/functions/NEBitwiseXor.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include "arm_compute/runtime/NEON/functions/NEBitwiseXor.h"
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEBitwiseXorKernel.h"
#include <utility>
@@ -31,6 +32,7 @@ using namespace arm_compute;
void NEBitwiseXor::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
{
+ ARM_COMPUTE_LOG_PARAMS(input1, input2, output);
auto k = std::make_unique<NEBitwiseXorKernel>();
k->configure(input1, input2, output);
_kernel = std::move(k);
diff --git a/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp b/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp
index cfd14faca0..adf891e417 100644
--- a/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp
+++ b/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,19 +22,28 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEBoundingBoxTransformKernel.h"
namespace arm_compute
{
-void NEBoundingBoxTransform::configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info)
+void NEBoundingBoxTransform::configure(const ITensor *boxes,
+ ITensor *pred_boxes,
+ const ITensor *deltas,
+ const BoundingBoxTransformInfo &info)
{
+ ARM_COMPUTE_LOG_PARAMS(boxes, pred_boxes, deltas, info);
// Configure Bounding Box kernel
auto k = std::make_unique<NEBoundingBoxTransformKernel>();
k->configure(boxes, pred_boxes, deltas, info);
_kernel = std::move(k);
}
-Status NEBoundingBoxTransform::validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info)
+Status NEBoundingBoxTransform::validate(const ITensorInfo *boxes,
+ const ITensorInfo *pred_boxes,
+ const ITensorInfo *deltas,
+ const BoundingBoxTransformInfo &info)
{
return NEBoundingBoxTransformKernel::validate(boxes, pred_boxes, deltas, info);
}
diff --git a/src/runtime/NEON/functions/NECast.cpp b/src/runtime/NEON/functions/NECast.cpp
index b519576ad5..1fd172a730 100644
--- a/src/runtime/NEON/functions/NECast.cpp
+++ b/src/runtime/NEON/functions/NECast.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,22 +24,23 @@
#include "arm_compute/runtime/NEON/functions/NECast.h"
#include "arm_compute/core/Validate.h"
-#include "src/runtime/cpu/operators/CpuCast.h"
+
+#include "src/common/utils/Log.h"
+#include "src/cpu/operators/CpuCast.h"
namespace arm_compute
{
struct NECast::Impl
{
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuCast> op{ nullptr };
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuCast> op{nullptr};
};
-NECast::NECast()
- : _impl(std::make_unique<Impl>())
+NECast::NECast() : _impl(std::make_unique<Impl>())
{
}
-NECast::NECast(NECast &&) = default;
+NECast::NECast(NECast &&) = default;
NECast &NECast::operator=(NECast &&) = default;
NECast::~NECast() = default;
@@ -49,19 +50,19 @@ void NECast::configure(ITensor *input, ITensor *output, ConvertPolicy policy)
_impl->dst = output;
ARM_COMPUTE_ERROR_ON_NULLPTR(_impl->src, _impl->dst);
-
+ ARM_COMPUTE_LOG_PARAMS(input, output, policy);
_impl->op = std::make_unique<cpu::CpuCast>();
_impl->op->configure(_impl->src->info(), _impl->dst->info(), policy);
}
-Status NECast::validate(ITensorInfo *input, ITensorInfo *output, ConvertPolicy policy)
+Status NECast::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy)
{
return cpu::CpuCast::validate(input, output, policy);
}
void NECast::run()
{
- ITensorPack pack = { { ACL_SRC, _impl->src }, { ACL_DST, _impl->dst } };
+ ITensorPack pack = {{ACL_SRC, _impl->src}, {ACL_DST, _impl->dst}};
_impl->op->run(pack);
}
} // namespace arm_compute
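
Besides the logging hook, validate() is now const-correct, so it can be called on tensor infos the caller is not allowed to mutate. A sketch (illustrative, not from the patch):

#include "arm_compute/runtime/NEON/functions/NECast.h"

using namespace arm_compute;

void cast_example(ITensor *src_f32, ITensor *dst_u8)
{
    // Status is explicitly convertible to bool (the same idiom the old
    // convolution heuristics in this patch used)
    if (bool(NECast::validate(src_f32->info(), dst_u8->info(), ConvertPolicy::SATURATE)))
    {
        NECast cast;
        cast.configure(src_f32, dst_u8, ConvertPolicy::SATURATE);
        cast.run();
    }
}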
diff --git a/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp b/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp
index bf4af83a0d..86bee4dd43 100644
--- a/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp
+++ b/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,12 +24,15 @@
#include "arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h"
#include "arm_compute/core/Types.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEChannelShuffleLayerKernel.h"
namespace arm_compute
{
void NEChannelShuffleLayer::configure(const ITensor *input, ITensor *output, unsigned int num_groups)
{
+ ARM_COMPUTE_LOG_PARAMS(input, output, num_groups);
auto k = std::make_unique<NEChannelShuffleLayerKernel>();
k->configure(input, output, num_groups);
_kernel = std::move(k);
diff --git a/src/runtime/NEON/functions/NEConcatenateLayer.cpp b/src/runtime/NEON/functions/NEConcatenateLayer.cpp
index dcc5cd3a64..59a0892f1f 100644
--- a/src/runtime/NEON/functions/NEConcatenateLayer.cpp
+++ b/src/runtime/NEON/functions/NEConcatenateLayer.cpp
@@ -23,33 +23,31 @@
*/
#include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h"
-#include "src/runtime/cpu/operators/CpuConcatenate.h"
-
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-
#include "arm_compute/core/Error.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
#include "src/core/helpers/AutoConfiguration.h"
+#include "src/cpu/operators/CpuConcatenate.h"
namespace arm_compute
{
struct NEConcatenateLayer::Impl
{
std::vector<const ITensor *> srcs{};
- ITensor *dst{ nullptr };
- unsigned int num_inputs{ 0 };
- unsigned int axis{ 0 };
- std::unique_ptr<cpu::CpuConcatenate> op{ nullptr };
+ ITensor *dst{nullptr};
+ unsigned int num_inputs{0};
+ unsigned int axis{0};
+ std::unique_ptr<cpu::CpuConcatenate> op{nullptr};
};
-NEConcatenateLayer::NEConcatenateLayer()
- : _impl(std::make_unique<Impl>())
+NEConcatenateLayer::NEConcatenateLayer() : _impl(std::make_unique<Impl>())
{
}
-NEConcatenateLayer::NEConcatenateLayer(NEConcatenateLayer &&) = default;
+NEConcatenateLayer::NEConcatenateLayer(NEConcatenateLayer &&) = default;
NEConcatenateLayer &NEConcatenateLayer::operator=(NEConcatenateLayer &&) = default;
NEConcatenateLayer::~NEConcatenateLayer() = default;
@@ -64,7 +62,7 @@ void NEConcatenateLayer::configure(std::vector<const ITensor *> inputs_vector, I
_impl->op = std::make_unique<cpu::CpuConcatenate>();
std::vector<const ITensorInfo *> inputs_vector_info;
- for(unsigned int i = 0; i < inputs_vector.size(); ++i)
+ for (unsigned int i = 0; i < inputs_vector.size(); ++i)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(inputs_vector.at(i));
inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
@@ -72,7 +70,9 @@ void NEConcatenateLayer::configure(std::vector<const ITensor *> inputs_vector, I
_impl->op->configure(inputs_vector_info, _impl->dst->info(), axis);
}
-Status NEConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis)
+Status NEConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inputs_vector,
+ const ITensorInfo *output,
+ size_t axis)
{
return cpu::CpuConcatenate::validate(inputs_vector, output, axis);
}
@@ -80,7 +80,7 @@ Status NEConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inpu
void NEConcatenateLayer::run()
{
ITensorPack pack;
- for(unsigned i = 0; i < _impl->num_inputs; ++i)
+ for (unsigned i = 0; i < _impl->num_inputs; ++i)
{
pack.add_tensor(TensorType::ACL_SRC_VEC + i, _impl->srcs.at(i));
}
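
A usage sketch (illustrative, not from the patch); run() rebuilds the pack on every call, registering input i as ACL_SRC_VEC + i:

#include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h"

#include <vector>

using namespace arm_compute;

void concat_example(const ITensor *a, const ITensor *b, ITensor *out)
{
    NEConcatenateLayer concat;
    std::vector<const ITensor *> inputs{a, b};
    concat.configure(inputs, out, /*axis=*/0); // axis 0 concatenates along the innermost dimension
    concat.run();
}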
diff --git a/src/runtime/NEON/functions/NEConv3D.cpp b/src/runtime/NEON/functions/NEConv3D.cpp
new file mode 100644
index 0000000000..8f41151d6c
--- /dev/null
+++ b/src/runtime/NEON/functions/NEConv3D.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEConv3D.h"
+
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/common/utils/Log.h"
+#include "src/cpu/operators/CpuDirectConv3d.h"
+
+namespace arm_compute
+{
+using namespace arm_compute::experimental;
+
+struct NEConv3D::Impl
+{
+ std::unique_ptr<cpu::ICpuOperator> op{nullptr};
+ ITensorPack run_pack{};
+};
+
+NEConv3D::NEConv3D() : _impl(std::make_unique<Impl>())
+{
+}
+
+NEConv3D::~NEConv3D() = default;
+
+void NEConv3D::configure(
+ ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv3dInfo &conv_info)
+{
+ // Perform validate step
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuDirectConv3d::validate(
+ input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info));
+ ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info);
+
+ auto f = std::make_unique<cpu::CpuDirectConv3d>();
+ f->configure(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(),
+ conv_info);
+ _impl->op = std::move(f);
+
+ if (_impl->op != nullptr)
+ {
+ _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}};
+ }
+}
+
+Status NEConv3D::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const Conv3dInfo &conv_info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuDirectConv3d::validate(input, weights, biases, output, conv_info));
+
+ return Status{};
+}
+
+void NEConv3D::run()
+{
+ if (_impl->op != nullptr)
+ {
+ _impl->op->run(_impl->run_pack);
+ }
+}
+} // namespace arm_compute
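
A usage sketch for the new function (illustrative, not from the patch). Conv3dInfo bundles stride, padding, dilation, and activation; the sketch assumes a default-constructed value means unit stride with zero padding:

#include "arm_compute/runtime/NEON/functions/NEConv3D.h"

using namespace arm_compute;

void conv3d_example(ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst)
{
    Conv3dInfo info{}; // assumption: defaults to stride (1,1,1) and no padding
    NEConv3D conv;
    conv.configure(src, weights, biases, dst, info); // throws through validate() on bad arguments
    conv.run(); // dispatches the stored ITensorPack to cpu::CpuDirectConv3d
}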
diff --git a/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp b/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp
index f2253d8be4..84e8565aaf 100644
--- a/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp
+++ b/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp
@@ -23,24 +23,27 @@
*/
#include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h"
-#include "src/runtime/cpu/operators/CpuConvertFullyConnectedWeights.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/cpu/operators/CpuConvertFullyConnectedWeights.h"
namespace arm_compute
{
struct NEConvertFullyConnectedWeights::Impl
{
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuConvertFullyConnectedWeights> op{ nullptr };
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuConvertFullyConnectedWeights> op{nullptr};
};
-NEConvertFullyConnectedWeights::NEConvertFullyConnectedWeights()
- : _impl(std::make_unique<Impl>())
+NEConvertFullyConnectedWeights::NEConvertFullyConnectedWeights() : _impl(std::make_unique<Impl>())
{
}
NEConvertFullyConnectedWeights::~NEConvertFullyConnectedWeights() = default;
-void NEConvertFullyConnectedWeights::configure(const ITensor *input, ITensor *output, const TensorShape &original_input_shape,
- DataLayout data_layout)
+void NEConvertFullyConnectedWeights::configure(const ITensor *input,
+ ITensor *output,
+ const TensorShape &original_input_shape,
+ DataLayout data_layout)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
@@ -50,8 +53,10 @@ void NEConvertFullyConnectedWeights::configure(const ITensor *input, ITensor *ou
_impl->op->configure(_impl->src->info(), _impl->dst->info(), original_input_shape, data_layout);
}
-Status NEConvertFullyConnectedWeights::validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape,
- DataLayout data_layout)
+Status NEConvertFullyConnectedWeights::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const TensorShape &original_input_shape,
+ DataLayout data_layout)
{
return cpu::CpuConvertFullyConnectedWeights::validate(input, output, original_input_shape, data_layout);
}
@@ -63,4 +68,4 @@ void NEConvertFullyConnectedWeights::run()
pack.add_tensor(TensorType::ACL_DST, _impl->dst);
_impl->op->run(pack);
}
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index e43d976944..8efebbbb1a 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,235 +25,184 @@
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/DataTypeUtils.h"
#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
#include "arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMMConv2d.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h"
-#include <cmath>
-#include <tuple>
-#include <utility>
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/cpu/operators/CpuConv2d.h"
+#include "src/cpu/operators/CpuDirectConv2d.h"
+#include "src/cpu/operators/CpuGemmConv2d.h"
+#include "src/cpu/operators/CpuGemmDirectConv2d.h"
+#include "src/cpu/operators/CpuWinogradConv2d.h"
namespace arm_compute
{
-NEConvolutionLayer::NEConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) //NOLINT
- : _memory_manager(std::move(memory_manager)),
- _function()
+using namespace arm_compute::experimental;
+
+struct NEConvolutionLayer::Impl
+{
+ MemoryGroup memory_group{};
+ std::shared_ptr<IMemoryManager> memory_manager{};
+ std::unique_ptr<cpu::ICpuOperator> op{nullptr};
+ ITensorPack run_pack{};
+ ITensorPack prep_pack{};
+ WorkspaceData<Tensor> workspace{};
+ experimental::MemoryRequirements aux_mem_req{};
+ std::unique_ptr<IFunction> func{nullptr};
+};
+
+NEConvolutionLayer::NEConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) : _impl(std::make_unique<Impl>())
{
+ _impl->memory_manager = std::move(memory_manager);
}
-void NEConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
- const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
+NEConvolutionLayer::~NEConvolutionLayer() = default;
+
+void NEConvolutionLayer::configure(ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math,
+ unsigned int num_groups)
{
// Perform validate step
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_UNUSED(num_groups);
- ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info,
- enable_fast_math, num_groups));
+ ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayer::validate(
+ input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info,
+ weights_info, dilation, act_info, enable_fast_math, num_groups));
+ ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info,
+ enable_fast_math, num_groups);
const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups);
- switch(NEConvolutionLayer::get_convolution_method(input->info(), weights->info(), output->info(), conv_info, weights_info, dilation, act_info, enable_fast_math))
+ switch (cpu::CpuConv2d::get_convolution_method(input->info(), weights->info(), output->info(), conv_info,
+ weights_info, dilation, act_info, enable_fast_math))
{
case ConvolutionMethod::WINOGRAD:
- {
- auto f = std::make_unique<NEWinogradConvolutionLayer>(_memory_manager);
- f->configure(input, weights, biases, output, conv_info, act_info, enable_fast_math);
- _function = std::move(f);
- break;
- }
case ConvolutionMethod::GEMM:
- {
- auto f = std::make_unique<NEGEMMConvolutionLayer>(_memory_manager);
- f->configure(input, weights, biases, output, conv_info, weights_info, dilation, act_info);
- _function = std::move(f);
- break;
- }
case ConvolutionMethod::GEMM_CONV2D:
- {
- auto f = std::make_unique<NEGEMMConv2d>(_memory_manager);
- f->configure(input, weights, biases, output, info);
- _function = std::move(f);
- break;
- }
case ConvolutionMethod::DIRECT:
{
- auto f = std::make_unique<NEDirectConvolutionLayer>(_memory_manager);
- f->configure(input, weights, biases, output, conv_info, act_info);
- _function = std::move(f);
+ auto f = std::make_unique<cpu::CpuConv2d>();
+ f->configure(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr),
+ output->info(), conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups);
+ _impl->op = std::move(f);
break;
}
case ConvolutionMethod::FFT:
{
- auto f = std::make_unique<NEFFTConvolutionLayer>(_memory_manager);
+ auto f = std::make_unique<NEFFTConvolutionLayer>(_impl->memory_manager);
f->configure(input, weights, biases, output, conv_info, act_info);
- _function = std::move(f);
+ _impl->func = std::move(f);
break;
}
default:
ARM_COMPUTE_ERROR("Not supported.");
break;
}
+
+ if (_impl->op)
+ {
+ _impl->memory_group = MemoryGroup(std::move(_impl->memory_manager));
+ _impl->aux_mem_req = _impl->op->workspace();
+ _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}};
+ _impl->prep_pack = {{ACL_SRC_1, weights}, {ACL_SRC_2, biases}};
+ _impl->workspace =
+ manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
+ }
}
-Status NEConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
+Status NEConvolutionLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math,
+ unsigned int num_groups)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1), "Grouping (num_groups != 1) is not supported on Neon");
-
const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups);
- switch(NEConvolutionLayer::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, enable_fast_math))
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(!weights->are_values_constant(), "Dynamic weights are not supported");
+
+ // Biases with dynamic values are not supported with quantized inputs.
+ if (biases)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((!biases->are_values_constant() && is_data_type_quantized(input->data_type())),
+ "Dynamic Biases are not supported with quantized input data.");
+ }
+
+ switch (cpu::CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info,
+ enable_fast_math))
{
case ConvolutionMethod::WINOGRAD:
- ARM_COMPUTE_RETURN_ON_ERROR(NEWinogradConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math));
- break;
case ConvolutionMethod::GEMM:
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMConvolutionLayer::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info));
- break;
case ConvolutionMethod::GEMM_CONV2D:
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMConv2d::validate(input, weights, biases, output, info));
- break;
case ConvolutionMethod::DIRECT:
- ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuConv2d::validate(input, weights, biases, output, conv_info,
+ weights_info, dilation, act_info, enable_fast_math,
+ num_groups));
break;
case ConvolutionMethod::FFT:
- ARM_COMPUTE_RETURN_ON_ERROR(NEFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEFFTConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info));
break;
default:
ARM_COMPUTE_ERROR("Not supported.");
break;
}
-
return Status{};
}
-ConvolutionMethod NEConvolutionLayer::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights,
- const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math)
+ConvolutionMethod NEConvolutionLayer::get_convolution_method(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, weights);
- ARM_COMPUTE_UNUSED(weights_info);
-
- const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
- const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
- const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
-
- const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, 1);
+ return cpu::CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info,
+ enable_fast_math);
+}
- /* Input spatial dims, kernel size, IFM/OFM, conv info*/
- using ConvolutionConfiguration = std::tuple<Size2D, Size2D, Size2D, PadStrideInfo>;
- using ConfigurationMethod = std::pair<ConvolutionConfiguration, ConvolutionMethod>;
+void NEConvolutionLayer::run()
+{
+ prepare();
- const std::vector<ConfigurationMethod> known_configs =
- {
- // Alexnet
- ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U), PadStrideInfo(1U, 1U, 2U, 2U)), ConvolutionMethod::GEMM),
- // VGG16 / VGG19
- ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U), PadStrideInfo(1U, 1U, 1U, 1U)), ConvolutionMethod::GEMM),
- // Mobilenet 224
- ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)), ConvolutionMethod::GEMM),
- // Mobilenet 160
- ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)), ConvolutionMethod::GEMM)
- };
+ MemoryGroupResourceScope scope_mg(_impl->memory_group);
- const auto find_config = [&](ConfigurationMethod c)
+ if (_impl->func)
{
- const ConvolutionConfiguration config = c.first;
- const PadStrideInfo info = std::get<3>(config);
-
- return std::get<0>(config) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) && std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h))
- && std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right()
- && info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && info.stride() == conv_info.stride();
- };
-
- std::vector<ConfigurationMethod>::const_iterator found;
- if((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end())
+ _impl->func->run();
+ }
+ else
{
- return (*found).second;
+ _impl->op->run(_impl->run_pack);
}
+}
- if(dilation != Size2D(1U, 1U))
+void NEConvolutionLayer::prepare()
+{
+ if (_impl->func)
{
- return ConvolutionMethod::GEMM;
+ _impl->func->prepare();
}
else
{
- // SRGAN
- // Output might not be initialized when it is an internal tensor of the layer using the convolution
- if(input->total_size() > 1e7 && (weights->dimension(idx_h) > 7)
- && (NEDirectConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info)))
- {
- return ConvolutionMethod::DIRECT;
- }
- if((weights->dimension(idx_h) > 7) && (input->dimension(idx_c) > output->dimension(idx_c)) && (NEFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info)))
- {
- return ConvolutionMethod::FFT;
- }
- if(input->dimension(idx_c) < 16)
- {
- return ConvolutionMethod::GEMM;
- }
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- // This heuristics only applies to F16 data type on A55r1
- if(NEScheduler::get().cpu_info().get_cpu_model() == CPUModel::A55r1 && enable_fast_math && input->data_type() == DataType::F16)
- {
- // Exclude known bad winograd configs (and defaults to GEMM)
- const std::vector<ConvolutionConfiguration> known_bad_winograd_f16_with_fastmath_configs =
- {
- // Squeezenet_V1_1 fire2 and fire3
- ConvolutionConfiguration(Size2D(56U, 56U), Size2D(3U, 3U), Size2D(16U, 64U), PadStrideInfo(1U, 1U, 1U, 1U)),
- // Squeezenet_V1_1 fire6 and fire7
- ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(48U, 192U), PadStrideInfo(1U, 1U, 1U, 1U)),
- // Squeezenet_V1_1 fire8 and fire9
- ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(64U, 256U), PadStrideInfo(1U, 1U, 1U, 1U)),
- };
- const auto find_conv_config = [&](ConvolutionConfiguration c)
- {
- const PadStrideInfo info = std::get<3>(c);
-
- return std::get<0>(c) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) && std::get<1>(c) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h))
- && std::get<2>(c) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right()
- && info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && info.stride() == conv_info.stride();
- };
-
- bool found_bad = std::find_if(known_bad_winograd_f16_with_fastmath_configs.begin(), known_bad_winograd_f16_with_fastmath_configs.end(),
- find_conv_config)
- != known_bad_winograd_f16_with_fastmath_configs.end();
- if(found_bad)
- {
- return ConvolutionMethod::GEMM;
- }
- }
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- // For 1x1 convolutions run the default GEMM
- if(weights->dimension(idx_w) == 1 && weights->dimension(idx_h) == 1)
- {
- return ConvolutionMethod::GEMM;
- }
+ _impl->op->prepare(_impl->prep_pack);
- if(bool(NEWinogradConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)))
- {
- return ConvolutionMethod::WINOGRAD;
- }
- if(bool(NEGEMMConv2d::validate(input, weights, nullptr, output, info)))
- {
- return ConvolutionMethod::GEMM_CONV2D;
- }
- return ConvolutionMethod::GEMM;
+ // Release temporary tensors that are only used in prepare stage
+ release_temporaries<Tensor>(_impl->aux_mem_req, _impl->workspace);
}
}
-
-void NEConvolutionLayer::run()
-{
- prepare();
- _function->run();
-}
-
-void NEConvolutionLayer::prepare()
-{
- _function->prepare();
-}
} // namespace arm_compute
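
From the caller's perspective the rewrite is transparent: all non-FFT methods now funnel through a single cpu::CpuConv2d operator whose workspace lives in the Impl struct, while FFT keeps the function-based path. A sketch (illustrative, not from the patch), assuming initialized F32 tensors:

#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"

using namespace arm_compute;

void conv2d_example(ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst)
{
    NEConvolutionLayer conv; // no external memory manager
    conv.configure(src, weights, biases, dst, PadStrideInfo(1, 1, 1, 1)); // stride 1, pad 1
    conv.prepare(); // one-off weight transforms; prepare-only workspace is released afterwards
    conv.run();     // run() also calls prepare(), so the explicit call above is optional
}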
diff --git a/src/runtime/NEON/functions/NECopy.cpp b/src/runtime/NEON/functions/NECopy.cpp
index 20642b5eed..c975d3a5b5 100644
--- a/src/runtime/NEON/functions/NECopy.cpp
+++ b/src/runtime/NEON/functions/NECopy.cpp
@@ -24,7 +24,8 @@
#include "arm_compute/runtime/NEON/functions/NECopy.h"
#include "arm_compute/core/Validate.h"
-#include "src/runtime/cpu/operators/CpuCopy.h"
+
+#include "src/cpu/operators/CpuCopy.h"
#include <utility>
@@ -32,16 +33,15 @@ namespace arm_compute
{
struct NECopy::Impl
{
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuCopy> op{ nullptr };
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuCopy> op{nullptr};
};
-NECopy::NECopy()
- : _impl(std::make_unique<Impl>())
+NECopy::NECopy() : _impl(std::make_unique<Impl>())
{
}
-NECopy::NECopy(NECopy &&) = default;
+NECopy::NECopy(NECopy &&) = default;
NECopy &NECopy::operator=(NECopy &&) = default;
NECopy::~NECopy() = default;
diff --git a/src/runtime/NEON/functions/NECropResize.cpp b/src/runtime/NEON/functions/NECropResize.cpp
index 1e1070d961..a94b0882da 100644
--- a/src/runtime/NEON/functions/NECropResize.cpp
+++ b/src/runtime/NEON/functions/NECropResize.cpp
@@ -21,10 +21,12 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-
#include "arm_compute/runtime/NEON/functions/NECropResize.h"
+
+#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NECropKernel.h"
#include <cstddef>
@@ -34,18 +36,32 @@ namespace arm_compute
NECropResize::~NECropResize() = default;
NECropResize::NECropResize()
- : _output(nullptr), _num_boxes(0), _method(), _extrapolation_value(0), _crop(), _scale(), _crop_results(), _scaled_results()
+ : _output(nullptr),
+ _num_boxes(0),
+ _method(),
+ _extrapolation_value(0),
+ _crop(),
+ _scale(),
+ _crop_results(),
+ _scaled_results()
{
}
-Status NECropResize::validate(const ITensorInfo *input, const ITensorInfo *boxes, const ITensorInfo *box_ind, const ITensorInfo *output,
- Coordinates2D crop_size, InterpolationPolicy method, float extrapolation_value)
+Status NECropResize::validate(const ITensorInfo *input,
+ const ITensorInfo *boxes,
+ const ITensorInfo *box_ind,
+ const ITensorInfo *output,
+ Coordinates2D crop_size,
+ InterpolationPolicy method,
+ float extrapolation_value)
{
ARM_COMPUTE_RETURN_ERROR_ON(crop_size.x <= 0 || crop_size.y <= 0);
ARM_COMPUTE_RETURN_ERROR_ON(method == InterpolationPolicy::AREA);
TensorInfo temp_info;
- ARM_COMPUTE_RETURN_ON_ERROR(NECropKernel::validate(input->clone().get(), boxes->clone().get(), box_ind->clone().get(), &temp_info, boxes->tensor_shape()[1] - 1, extrapolation_value));
- if(output->total_size() > 0)
+ ARM_COMPUTE_RETURN_ON_ERROR(NECropKernel::validate(input->clone().get(), boxes->clone().get(),
+ box_ind->clone().get(), &temp_info, boxes->tensor_shape()[1] - 1,
+ extrapolation_value));
+ if (output->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
@@ -55,11 +71,18 @@ Status NECropResize::validate(const ITensorInfo *input, const ITensorInfo *boxes
return Status{};
}
-void NECropResize::configure(const ITensor *input, const ITensor *boxes, const ITensor *box_ind, ITensor *output, Coordinates2D crop_size,
- InterpolationPolicy method, float extrapolation_value)
+void NECropResize::configure(const ITensor *input,
+ const ITensor *boxes,
+ const ITensor *box_ind,
+ ITensor *output,
+ Coordinates2D crop_size,
+ InterpolationPolicy method,
+ float extrapolation_value)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(NECropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(), crop_size, method, extrapolation_value));
+ ARM_COMPUTE_ERROR_THROW_ON(NECropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(),
+ crop_size, method, extrapolation_value));
+ ARM_COMPUTE_LOG_PARAMS(input, boxes, box_ind, output, crop_size, method, extrapolation_value);
_num_boxes = boxes->info()->tensor_shape()[1];
TensorShape out_shape(input->info()->tensor_shape()[0], crop_size.x, crop_size.y);
@@ -79,7 +102,7 @@ void NECropResize::configure(const ITensor *input, const ITensor *boxes, const I
_scaled_results.reserve(_num_boxes);
_scale.reserve(_num_boxes);
- for(unsigned int i = 0; i < _num_boxes; ++i)
+ for (unsigned int i = 0; i < _num_boxes; ++i)
{
auto crop_tensor = std::make_unique<Tensor>();
TensorInfo crop_result_info(1, DataType::F32);
@@ -106,7 +129,7 @@ void NECropResize::run()
{
ARM_COMPUTE_ERROR_ON_MSG(_output == nullptr, "Unconfigured function");
- for(unsigned int i = 0; i < _num_boxes; ++i)
+ for (unsigned int i = 0; i < _num_boxes; ++i)
{
// Size of the crop box in _boxes and thus the shape of _crop_results[i]
// may not be known until run-time and so the kernels cannot be configured until then.
@@ -115,12 +138,15 @@ void NECropResize::run()
NEScheduler::get().schedule(_crop[i].get(), Window::DimZ);
// Scale the cropped image.
- _scale[i]->configure(_crop_results[i].get(), _scaled_results[i].get(), ScaleKernelInfo{ _method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), SamplingPolicy::TOP_LEFT, false });
+ _scale[i]->configure(_crop_results[i].get(), _scaled_results[i].get(),
+ ScaleKernelInfo{_method, BorderMode::CONSTANT, PixelValue(_extrapolation_value),
+ SamplingPolicy::TOP_LEFT, false});
_scaled_results[i]->allocator()->allocate();
_scale[i]->run();
// Copy scaled image into output.
- std::copy_n(_scaled_results[i]->buffer(), _scaled_results[i]->info()->total_size(), _output->ptr_to_element(Coordinates(0, 0, 0, i)));
+ std::copy_n(_scaled_results[i]->buffer(), _scaled_results[i]->info()->total_size(),
+ _output->ptr_to_element(Coordinates(0, 0, 0, i)));
}
}
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
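NECropResize::run() above configures its kernels at run time because the crop box sizes are tensor data, unknown until then; each scaled result is finally copied into batch slot i of the output. A standalone sketch of that packing step (batch_results is a hypothetical helper):

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // Pack per-box results of identical size into one batched buffer; box i lands
    // at offset i * box_size, as with ptr_to_element(Coordinates(0, 0, 0, i)).
    void batch_results(const std::vector<std::vector<float>> &per_box, std::vector<float> &out)
    {
        const std::size_t box_size = per_box.empty() ? 0 : per_box.front().size();
        out.resize(box_size * per_box.size());
        for (std::size_t i = 0; i < per_box.size(); ++i)
        {
            std::copy_n(per_box[i].data(), box_size, out.data() + i * box_size);
        }
    }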
diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
index 5bd61b4074..081c7cc538 100644
--- a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,10 +25,11 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/NEON/kernels/NEWeightsReshapeKernel.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/helpers/AutoConfiguration.h"
using namespace arm_compute::misc::shape_calculator;
@@ -61,9 +62,9 @@ PadStrideInfo compute_upsample_info(const PadStrideInfo &info, uint32_t deconv_p
deconv_pad_top += deconv_pad_y / 2;
deconv_pad_bottom += deconv_pad_y / 2;
- return PadStrideInfo(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top, deconv_pad_bottom, DimensionRoundingType::FLOOR);
+ return PadStrideInfo(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top, deconv_pad_bottom,
+ DimensionRoundingType::FLOOR);
}
-
} // namespace
NEDeconvolutionLayer::NEDeconvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
@@ -77,20 +78,29 @@ NEDeconvolutionLayer::NEDeconvolutionLayer(std::shared_ptr<IMemoryManager> memor
_original_weights(nullptr),
_input(nullptr),
_info(),
- _is_prepared(false)
+ _is_prepared(false),
+ _do_upsampling(true)
{
}
-Status NEDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &info)
+Status NEDeconvolutionLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ const PadStrideInfo &info,
+ bool enable_fast_math,
+ const WeightsInfo &weights_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
- const unsigned int width_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH);
- const unsigned int height_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT);
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != weights->dimension(height_idx));
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8,
+ DataType::QASYMM8_SIGNED);
+ const unsigned int width_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH);
+ const unsigned int height_idx =
+ get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT);
ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) < 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(height_idx) < 1);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(weights, input);
- if(is_data_type_quantized_per_channel(weights->data_type()) && is_data_type_quantized(input->data_type()))
+ if (is_data_type_quantized_per_channel(weights->data_type()) && is_data_type_quantized(input->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL);
}
@@ -99,11 +109,23 @@ Status NEDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInf
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
}
- auto out_dims = deconvolution_output_dimensions(input->dimension(width_idx), input->dimension(height_idx), weights->dimension(width_idx), weights->dimension(height_idx), info);
+ const unsigned int pad_left = info.pad_left();
+ const unsigned int pad_top = info.pad_top();
+ const unsigned int pad_right = info.pad_right();
+ const unsigned int pad_bottom = info.pad_bottom();
+
+ ARM_COMPUTE_RETURN_ERROR_ON(((input->dimension(width_idx) - 1) * info.stride().first +
+ weights->dimension(width_idx)) < (pad_left + pad_right));
+ ARM_COMPUTE_RETURN_ERROR_ON(((input->dimension(height_idx) - 1) * info.stride().second +
+ weights->dimension(height_idx)) < (pad_top + pad_bottom));
- if(bias != nullptr)
+ auto out_dims =
+ deconvolution_output_dimensions(input->dimension(width_idx), input->dimension(height_idx),
+ weights->dimension(width_idx), weights->dimension(height_idx), info);
+
+ if (bias != nullptr)
{
- if(is_data_type_quantized_asymmetric(input->data_type()))
+ if (is_data_type_quantized_asymmetric(input->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
}
@@ -113,57 +135,84 @@ Status NEDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInf
}
}
- if(output->tensor_shape().total_size() > 0)
+ if (output->tensor_shape().total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input, *weights);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) != output_shape.x(), "Output's width is invalid.");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) != output_shape.y(), "Output's height is invalid.");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) != output_shape.z(), "Output's depth is invalid.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) != output_shape.x(),
+ "Output's width is invalid.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) != output_shape.y(),
+ "Output's height is invalid.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) != output_shape.z(),
+ "Output's depth is invalid.");
}
- uint32_t deconv_pad_x = 0;
- uint32_t deconv_pad_y = 0;
- const unsigned int stride_x = info.stride().first;
- const unsigned int stride_y = info.stride().second;
- // Guard against overflows in compute_deconvolution_upsampled_shape()
- const DataLayout data_layout = input->data_layout();
- const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const unsigned int out_x = (input->dimension(idx_w) - 1) * stride_x + 1;
- const unsigned int out_y = (input->dimension(idx_h) - 1) * stride_y + 1;
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) > out_x);
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) > out_y);
- ARM_COMPUTE_RETURN_ERROR_ON((out_x - weights->dimension(idx_w) + 1) > out_dims.first);
- ARM_COMPUTE_RETURN_ERROR_ON((out_y - weights->dimension(idx_h) + 1) > out_dims.second);
-
- const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y);
- TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape));
- const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
-
- const unsigned int batches_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES);
- const unsigned int channel_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL);
+ uint32_t deconv_pad_x = 0;
+ uint32_t deconv_pad_y = 0;
+ const uint32_t stride_x = info.stride().first;
+ const uint32_t stride_y = info.stride().second;
+ const auto deconv_padding = compute_deconvolution_padding(*input, *weights, static_cast<int32_t>(stride_x),
+ static_cast<int32_t>(stride_y), out_dims);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(deconv_padding.first < 0 || deconv_padding.second < 0,
+ "Negative padding not supported");
+
+ const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y,
+ out_dims, deconv_pad_x, deconv_pad_y);
+ TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape));
+ const PadStrideInfo upsample_info = compute_upsample_info(info, deconv_pad_x, deconv_pad_y);
+
+ // Do not perform upsampling when the operation uses unit stride in all dimensions
+ const bool do_upsampling = stride_x != 1 || stride_y != 1;
+
+ const unsigned int batches_idx =
+ get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES);
+ const unsigned int channel_idx =
+ get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL);
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(batches_idx) != scale_out_info.dimension(batches_idx));
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(channel_idx) != scale_out_info.dimension(channel_idx));
- ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, WeightsInfo()));
+ if (do_upsampling)
+ {
+ const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+ ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info,
+ weights_info, Size2D(1U, 1U), ActivationLayerInfo(),
+ enable_fast_math));
+ }
+ else
+ {
+ const PadStrideInfo conv_info(1, 1, upsample_info.pad_left(), upsample_info.pad_right(),
+ upsample_info.pad_top(), upsample_info.pad_bottom(), DimensionRoundingType::CEIL);
+ ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(input, weights, bias, output, conv_info, weights_info,
+ Size2D(1U, 1U), ActivationLayerInfo(),
+ enable_fast_math));
+ }
return Status{};
}
-void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &info)
+void NEDeconvolutionLayer::configure(ITensor *input,
+ const ITensor *weights,
+ const ITensor *bias,
+ ITensor *output,
+ const PadStrideInfo &info,
+ bool enable_fast_math,
+ const WeightsInfo &weights_info)
{
// Perform validation step
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_ERROR_THROW_ON(NEDeconvolutionLayer::validate(input->info(), weights->info(), (bias == nullptr) ? nullptr : bias->info(), output->info(), info));
+ ARM_COMPUTE_ERROR_THROW_ON(NEDeconvolutionLayer::validate(input->info(), weights->info(),
+ (bias == nullptr) ? nullptr : bias->info(),
+ output->info(), info, enable_fast_math, weights_info));
+ ARM_COMPUTE_LOG_PARAMS(input, weights, bias, output, info, enable_fast_math, weights_info);
const DataLayout data_layout = input->info()->data_layout();
const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- auto out_dims = deconvolution_output_dimensions(input->info()->dimension(width_idx), input->info()->dimension(height_idx),
- weights->info()->dimension(width_idx), weights->info()->dimension(height_idx), info);
+ auto out_dims = deconvolution_output_dimensions(
+ input->info()->dimension(width_idx), input->info()->dimension(height_idx),
+ weights->info()->dimension(width_idx), weights->info()->dimension(height_idx), info);
const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input->info(), *weights->info());
@@ -176,32 +225,24 @@ void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, con
const unsigned int stride_y = info.stride().second;
// Output auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->quantization_info());
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(),
+ input->info()->quantization_info());
_flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32));
- _memory_group.manage(&_scaled_output);
_weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
_flip_weights.configure(weights, &_weights_flipped, &_flip_axis);
// setup the function to convolve the upscaled output
- const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
- uint32_t deconv_pad_x = 0;
- uint32_t deconv_pad_y = 0;
-
- const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input->info(), *weights->info(),
- stride_x, stride_y,
- out_dims, deconv_pad_x, deconv_pad_y);
+ uint32_t deconv_pad_x = 0;
+ uint32_t deconv_pad_y = 0;
+ const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(
+ *input->info(), *weights->info(), stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y);
const PadStrideInfo upsample_info = compute_upsample_info(info, deconv_pad_x, deconv_pad_y);
- TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info());
- scale_out_info.set_data_layout(data_layout);
- _scaled_output.allocator()->init(scale_out_info);
-
- _upsample_f.configure(input, &_scaled_output, upsample_info);
-
- _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info);
+ // Do not perform upsampling when the operation uses unit stride in all dimensions
+ _do_upsampling = stride_x != 1 || stride_y != 1;
// Setup flip axis data
_flip_axis.allocator()->allocate();
@@ -209,7 +250,32 @@ void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, con
axis_data[0] = static_cast<uint32_t>(width_idx);
axis_data[1] = static_cast<uint32_t>(height_idx);
- _scaled_output.allocator()->allocate();
+ // Setup convolution and upsampling, if needed
+ if (_do_upsampling)
+ {
+ _memory_group.manage(&_scaled_output);
+
+ const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+ TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info());
+ scale_out_info.set_data_layout(data_layout);
+ _scaled_output.allocator()->init(scale_out_info);
+
+ // Minor optimization: In the upsampling step, we do not need to allocate space for the padding in the upsampled image.
+ // The padding amount can be given as input to the convolution layer.
+ _upsample_f.configure(input, &_scaled_output, upsample_info);
+
+ _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info, weights_info, Size2D(1U, 1U),
+ ActivationLayerInfo(), enable_fast_math);
+
+ _scaled_output.allocator()->allocate();
+ }
+ else
+ {
+ const PadStrideInfo conv_info(1, 1, upsample_info.pad_left(), upsample_info.pad_right(),
+ upsample_info.pad_top(), upsample_info.pad_bottom(), DimensionRoundingType::CEIL);
+ _conv_f.configure(input, &_weights_flipped, bias, output, conv_info, weights_info, Size2D(1U, 1U),
+ ActivationLayerInfo(), enable_fast_math);
+ }
}
void NEDeconvolutionLayer::run()
@@ -218,13 +284,16 @@ void NEDeconvolutionLayer::run()
MemoryGroupResourceScope scope_mg(_memory_group);
- _upsample_f.run();
+ if (_do_upsampling)
+ {
+ _upsample_f.run();
+ }
_conv_f.run();
}
void NEDeconvolutionLayer::prepare()
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
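The new validate() guards and the _do_upsampling flag above both fall out of the transposed-convolution size arithmetic. A sketch using the standard formula (deconv_out_dim is a hypothetical helper, not ACL code):

    #include <cassert>

    // out = (in - 1) * stride + kernel - pad_begin - pad_end
    int deconv_out_dim(int in, int kernel, int stride, int pad_begin, int pad_end)
    {
        // Mirrors the ARM_COMPUTE_RETURN_ERROR_ON guards above: the expanded input
        // must cover the total padding, otherwise the padding would be negative.
        assert((in - 1) * stride + kernel >= pad_begin + pad_end);
        return (in - 1) * stride + kernel - pad_begin - pad_end;
    }

    // Example: in = 4, kernel = 3, stride = 2, pads = 1/1 -> (4-1)*2 + 3 - 2 = 7.
    // With stride 1 no zero-insertion upsampling is needed, which is exactly the
    // case the new _do_upsampling == false path handles with a plain convolution.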
diff --git a/src/runtime/NEON/functions/NEDepthConvertLayer.cpp b/src/runtime/NEON/functions/NEDepthConvertLayer.cpp
index 07e985c25e..766635dfa1 100644
--- a/src/runtime/NEON/functions/NEDepthConvertLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthConvertLayer.cpp
@@ -24,7 +24,8 @@
#include "arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h"
#include "arm_compute/core/Validate.h"
-#include "src/runtime/cpu/operators/CpuCast.h"
+
+#include "src/cpu/operators/CpuCast.h"
#include <utility>
@@ -32,16 +33,15 @@ namespace arm_compute
{
struct NEDepthConvertLayer::Impl
{
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuCast> op{ nullptr };
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuCast> op{nullptr};
};
-NEDepthConvertLayer::NEDepthConvertLayer()
- : _impl(std::make_unique<Impl>())
+NEDepthConvertLayer::NEDepthConvertLayer() : _impl(std::make_unique<Impl>())
{
}
-NEDepthConvertLayer::NEDepthConvertLayer(NEDepthConvertLayer &&) = default;
+NEDepthConvertLayer::NEDepthConvertLayer(NEDepthConvertLayer &&) = default;
NEDepthConvertLayer &NEDepthConvertLayer::operator=(NEDepthConvertLayer &&) = default;
NEDepthConvertLayer::~NEDepthConvertLayer() = default;
@@ -59,7 +59,8 @@ void NEDepthConvertLayer::configure(const ITensor *input, ITensor *output, Conve
_impl->op->configure(_impl->src->info(), _impl->dst->info(), policy);
}
-Status NEDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
+Status
+NEDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
{
ARM_COMPUTE_RETURN_ERROR_ON(shift != 0);
return cpu::CpuCast::validate(input, output, policy);
@@ -67,7 +68,7 @@ Status NEDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo
void NEDepthConvertLayer::run()
{
- ITensorPack pack = { { ACL_SRC, _impl->src }, { ACL_DST, _impl->dst } };
+ ITensorPack pack = {{ACL_SRC, _impl->src}, {ACL_DST, _impl->dst}};
_impl->op->run(pack);
}
} // namespace arm_compute
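For reference, an assumed-typical call site of the reworked function; tensor shapes, allocation and error handling are elided, and the sketch relies only on the signatures visible in this patch:

    #include "arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void cast_to_f32(const Tensor &src, Tensor &dst)
    {
        NEDepthConvertLayer cast;
        // shift must be 0: validate() above rejects anything else now that the
        // function is backed by cpu::CpuCast.
        cast.configure(&src, &dst, ConvertPolicy::SATURATE, /* shift */ 0);
        cast.run();
    }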
diff --git a/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp b/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp
index 2793c3f27e..5eea4dca65 100644
--- a/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,15 +25,24 @@
#include "arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h"
namespace arm_compute
{
+NEDepthToSpaceLayer::NEDepthToSpaceLayer() : _kernel{}
+{
+}
+
+NEDepthToSpaceLayer::~NEDepthToSpaceLayer() = default;
+
void NEDepthToSpaceLayer::configure(const ITensor *input, ITensor *output, int32_t block_shape)
{
+ ARM_COMPUTE_LOG_PARAMS(input, output, block_shape);
+
auto k = std::make_unique<NEDepthToSpaceLayerKernel>();
k->configure(input, output, block_shape);
_kernel = std::move(k);
@@ -43,4 +52,10 @@ Status NEDepthToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo
{
return NEDepthToSpaceLayerKernel::validate(input, output, block_shape);
}
+
+void NEDepthToSpaceLayer::run()
+{
+ NEScheduler::get().schedule(_kernel.get(), _kernel->get_split_dimension());
+}
+
} // namespace arm_compute
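The newly explicit run() above schedules NEDepthToSpaceLayerKernel along its split dimension. What the kernel computes can be sketched as standalone index math; DCR block ordering (as in e.g. TensorFlow) is assumed here for illustration and may differ from the kernel's actual ordering:

    #include <cassert>
    #include <cstddef>
    #include <vector>

    // NCHW input [c_out * b * b, h_in, w_in] -> output [c_out, h_in * b, w_in * b]
    std::vector<float> depth_to_space(const std::vector<float> &in, int c_out, int h_in, int w_in, int b)
    {
        const int c_in = c_out * b * b;
        assert(in.size() == static_cast<std::size_t>(c_in) * h_in * w_in);
        std::vector<float> out(in.size());
        for (int c = 0; c < c_out; ++c)
            for (int y = 0; y < h_in; ++y)
                for (int x = 0; x < w_in; ++x)
                    for (int i = 0; i < b; ++i)
                        for (int j = 0; j < b; ++j)
                        {
                            const int         src_c = (i * b + j) * c_out + c;
                            const std::size_t src   = (static_cast<std::size_t>(src_c) * h_in + y) * w_in + x;
                            const std::size_t dst   = (static_cast<std::size_t>(c) * h_in * b + y * b + i) * (w_in * b) + (x * b + j);
                            out[dst] = in[src];
                        }
        return out;
    }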
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index a561b88058..6c085645db 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,7 +27,9 @@
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/runtime/cpu/operators/CpuDepthwiseConv2d.h"
+
+#include "src/common/utils/Log.h"
+#include "src/cpu/operators/CpuDepthwiseConv2d.h"
using namespace arm_compute::misc;
using namespace arm_compute::misc::shape_calculator;
@@ -38,38 +40,35 @@ NEDepthwiseConvolutionLayer::~NEDepthwiseConvolutionLayer() = default;
struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::Impl
{
- ITensor *src{ nullptr }; // SRC_0
- ITensor *dst{ nullptr }; // DST_0
- const ITensor *weights
- {
- nullptr
- }; // SRC_1
- const ITensor *biases
- {
- nullptr
- }; // SRC_2
+ ITensor *src{nullptr}; // SRC_0
+ ITensor *dst{nullptr}; // DST_0
+ const ITensor *weights{nullptr}; // SRC_1
+ const ITensor *biases{nullptr}; // SRC_2
Tensor permuted_input{}; // INT_0
Tensor permuted_weights{}; // INT_1
Tensor permuted_output{}; // INT_2
Tensor workspace{}; // INT_3
Tensor packed_weights{}; // INT_4
- std::shared_ptr<cpu::CpuDepthwiseConv2d> op{ nullptr };
- bool is_prepared{ false };
- bool permute{ false };
+ std::shared_ptr<cpu::CpuDepthwiseConv2d> op{nullptr};
+ bool is_prepared{false};
+ bool permute{false};
};
-NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::NEDepthwiseConvolutionLayerOptimizedInternal(std::shared_ptr<IMemoryManager> memory_manager)
+NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::NEDepthwiseConvolutionLayerOptimizedInternal(
+ std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(memory_manager), _impl(std::make_unique<Impl>())
{
}
-void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::configure(ITensor *input,
- const ITensor *weights,
- const ITensor *biases,
- ITensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier,
- const ActivationLayerInfo &act_info,
- const Size2D &dilation)
+void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::configure(
+ ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier,
+ const ActivationLayerInfo &act_info,
+ const Size2D &dilation)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
@@ -81,9 +80,9 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::
_impl->permute = is_nhwc;
_impl->op = std::make_unique<cpu::CpuDepthwiseConv2d>();
- ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
- _impl->op->configure(_impl->src->info(), _impl->weights->info(), _impl->biases == nullptr ? nullptr : _impl->biases->info(),
- _impl->dst->info(), info);
+ ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation};
+ _impl->op->configure(_impl->src->info(), _impl->weights->info(),
+ _impl->biases == nullptr ? nullptr : _impl->biases->info(), _impl->dst->info(), info);
// Configure pipeline
ActivationLayerInfo act_info_to_use = ActivationLayerInfo();
@@ -91,15 +90,15 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::
const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(act_info);
bool is_activationlayer_enabled = act_info.enabled() && !(is_relu || is_relu6);
- if(!is_activationlayer_enabled)
+ if (!is_activationlayer_enabled)
{
act_info_to_use = act_info;
}
- info = ConvolutionInfo{ conv_info, depth_multiplier, act_info_to_use, dilation };
+ info = ConvolutionInfo{conv_info, depth_multiplier, act_info_to_use, dilation};
auto dwc_optimized_func = std::make_unique<cpu::CpuDepthwiseConv2dAssemblyDispatch>();
- if(is_nhwc)
+ if (is_nhwc)
{
auto permute_input = std::make_unique<cpu::CpuPermute>();
auto permute_weights = std::make_unique<cpu::CpuPermute>();
@@ -121,7 +120,9 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::
_impl->permuted_output.info()->set_quantization_info(output->info()->quantization_info());
// Configure optimized depthwise
- dwc_optimized_func->configure(_impl->permuted_input.info(), _impl->permuted_weights.info(), biases == nullptr ? nullptr : biases->info(), _impl->permuted_output.info(), info);
+ dwc_optimized_func->configure(_impl->permuted_input.info(), _impl->permuted_weights.info(),
+ biases == nullptr ? nullptr : biases->info(), _impl->permuted_output.info(),
+ info);
// Configure the function to transform the convolved output to ACL's native ordering format NCHW
_impl->permuted_output.info()->set_data_layout(DataLayout::NHWC);
@@ -132,28 +133,33 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::
}
else
{
- dwc_optimized_func->configure(_impl->src->info(), _impl->weights->info(), biases == nullptr ? nullptr : biases->info(), _impl->dst->info(), info);
+ dwc_optimized_func->configure(_impl->src->info(), _impl->weights->info(),
+ biases == nullptr ? nullptr : biases->info(), _impl->dst->info(), info);
}
// Allocate memory based on the internal memory requirements
experimental::MemoryRequirements mem_req = dwc_optimized_func->workspace();
- _impl->workspace.allocator()->init(TensorInfo(TensorShape{ mem_req[0].size }, 1, DataType::S8), mem_req[0].alignment);
- _impl->packed_weights.allocator()->init(TensorInfo(TensorShape{ mem_req[1].size }, 1, DataType::S8), mem_req[1].alignment);
-
+ _impl->workspace.allocator()->init(TensorInfo(TensorShape{mem_req[0].size + mem_req[0].alignment}, 1, DataType::S8),
+ mem_req[0].alignment);
+ _impl->packed_weights.allocator()->init(
+ TensorInfo(TensorShape{mem_req[1].size + mem_req[1].alignment}, 1, DataType::S8), mem_req[1].alignment);
+ _memory_group.manage(&_impl->workspace);
+ _memory_group.manage(&_impl->packed_weights);
_impl->workspace.allocator()->allocate();
_impl->packed_weights.allocator()->allocate();
}
-Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::validate(const ITensorInfo *input,
- const ITensorInfo *weights,
- const ITensorInfo *biases,
- const ITensorInfo *output,
- const PadStrideInfo &conv_info,
- unsigned int depth_multiplier,
- const ActivationLayerInfo &act_info,
- const Size2D &dilation)
+Status
+NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier,
+ const ActivationLayerInfo &act_info,
+ const Size2D &dilation)
{
- ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
+ ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation};
return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info);
}
@@ -178,15 +184,15 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::
void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::prepare()
{
- if(!_impl->is_prepared)
+ if (!_impl->is_prepared)
{
// Permute weights
- if(_impl->permute)
+ if (_impl->permute)
{
_impl->permuted_weights.allocator()->allocate();
}
- if(!_impl->permuted_weights.is_used())
+ if (!_impl->permuted_weights.is_used())
{
_impl->permuted_weights.allocator()->free();
}
@@ -200,14 +206,14 @@ struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::Impl
Tensor permuted_input{};
Tensor permuted_weights{};
Tensor permuted_output{};
- bool is_prepared{ false };
- bool is_nchw{ false };
- bool is_activationlayer_enabled{ false };
- const ITensor *weights{ nullptr };
- const ITensor *biases{ nullptr };
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- std::shared_ptr<cpu::CpuDepthwiseConv2d> op{ nullptr };
+ bool is_prepared{false};
+ bool is_nchw{false};
+ bool is_activationlayer_enabled{false};
+ const ITensor *weights{nullptr};
+ const ITensor *biases{nullptr};
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ std::shared_ptr<cpu::CpuDepthwiseConv2d> op{nullptr};
};
NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::NEDepthwiseConvolutionLayerGeneric()
@@ -215,16 +221,21 @@ NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::NEDepthwiseConv
{
}
-void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
+void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure(ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier,
+ const ActivationLayerInfo &act_info,
+ const Size2D &dilation)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionLayer::validate(input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(),
- output->info(), conv_info, depth_multiplier, act_info, dilation));
- const ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
+ const ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation};
_impl->op = std::make_unique<cpu::CpuDepthwiseConv2d>();
- _impl->op->configure(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), info);
+ _impl->op->configure(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(),
+ info);
_impl->src = input;
_impl->dst = output;
@@ -236,7 +247,7 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure(
ITensor *input_to_use = input;
const ITensor *weights_to_use = weights;
ITensor *output_to_use = output;
- if(_impl->is_nchw)
+ if (_impl->is_nchw)
{
auto permute_input = std::make_unique<cpu::CpuPermute>();
auto permute_weights = std::make_unique<cpu::CpuPermute>();
@@ -249,14 +260,16 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure(
_impl->permuted_weights.info()->set_data_layout(DataLayout::NHWC);
weights_to_use = &_impl->permuted_weights;
- _impl->permuted_output.allocator()->init(output->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape()));
+ _impl->permuted_output.allocator()->init(
+ output->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape()));
output_to_use = &_impl->permuted_output;
}
auto depthwise_conv_kernel = std::make_unique<cpu::kernels::CpuDepthwiseConv2dNativeKernel>();
- depthwise_conv_kernel->configure(input_to_use->info(), weights_to_use->info(), biases == nullptr ? nullptr : biases->info(), output_to_use->info(), info);
+ depthwise_conv_kernel->configure(input_to_use->info(), weights_to_use->info(),
+ biases == nullptr ? nullptr : biases->info(), output_to_use->info(), info);
- if(_impl->is_nchw)
+ if (_impl->is_nchw)
{
auto permute_output = std::make_unique<cpu::CpuPermute>();
permute_output->configure(_impl->permuted_output.info(), output->info(), PermutationVector(1U, 2U, 0U));
@@ -268,11 +281,16 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure(
}
}
-Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
+Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
+ unsigned int depth_multiplier,
+ const ActivationLayerInfo &act_info,
+ const Size2D &dilation)
{
- ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
+ ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation};
return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info);
}
@@ -298,43 +316,64 @@ NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer(std::shared_ptr<IMemory
#ifndef DOXYGEN_SKIP_THIS
struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer::Impl
{
- DepthwiseConvolutionFunction depth_conv_func{ DepthwiseConvolutionFunction::OPTIMIZED };
- NEDepthwiseConvolutionLayerOptimizedInternal func_optimized{ nullptr };
+ DepthwiseConvolutionFunction depth_conv_func{DepthwiseConvolutionFunction::OPTIMIZED};
+ NEDepthwiseConvolutionLayerOptimizedInternal func_optimized{nullptr};
NEDepthwiseConvolutionLayerGeneric func_generic{};
- std::shared_ptr<cpu::CpuDepthwiseConv2d> op{ nullptr };
+ std::shared_ptr<cpu::CpuDepthwiseConv2d> op{nullptr};
};
#endif // DOXYGEN_SKIP_THIS
-void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
- const ActivationLayerInfo &act_info, const Size2D &dilation)
+void NEDepthwiseConvolutionLayer::configure(ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier,
+ const ActivationLayerInfo &act_info,
+ const Size2D &dilation)
{
- const ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+
+ ARM_COMPUTE_LOG_PARAMS(input, weights, output, conv_info, depth_multiplier, biases, act_info, dilation);
+ ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionLayer::validate(
+ input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(), output->info(), conv_info,
+ depth_multiplier, act_info, dilation));
+
+ const ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation};
_impl->op = std::make_shared<cpu::CpuDepthwiseConv2d>();
- _impl->depth_conv_func = _impl->op->get_depthwiseconvolution_function(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(),
- info);
- switch(_impl->depth_conv_func)
+ _impl->depth_conv_func = _impl->op->get_depthwiseconvolution_function(
+ input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), info);
+ switch (_impl->depth_conv_func)
{
case DepthwiseConvolutionFunction::OPTIMIZED:
- _impl->func_optimized.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+ _impl->func_optimized.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info,
+ dilation);
break;
case DepthwiseConvolutionFunction::GENERIC:
- _impl->func_generic.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+ _impl->func_generic.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info,
+ dilation);
break;
default:
ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction");
}
}
-Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
+Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier,
+ const ActivationLayerInfo &act_info,
+ const Size2D &dilation)
{
- ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
+ ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation};
return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info);
}
void NEDepthwiseConvolutionLayer::run()
{
- switch(_impl->depth_conv_func)
+ switch (_impl->depth_conv_func)
{
case DepthwiseConvolutionFunction::OPTIMIZED:
_impl->func_optimized.run();
@@ -349,7 +388,7 @@ void NEDepthwiseConvolutionLayer::run()
void NEDepthwiseConvolutionLayer::prepare()
{
- switch(_impl->depth_conv_func)
+ switch (_impl->depth_conv_func)
{
case DepthwiseConvolutionFunction::OPTIMIZED:
_impl->func_optimized.prepare();
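The workspace and packed-weights tensors above are now sized as mem_req.size + mem_req.alignment. That over-allocation guarantees an aligned sub-region of the requested size always fits inside the raw buffer; a sketch, assuming power-of-two alignment (aligned_region is a hypothetical helper):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Given a requirement (size, alignment), a raw buffer of size + alignment
    // bytes always contains an aligned region of `size` bytes: the worst-case
    // adjustment is alignment - 1.
    uint8_t *aligned_region(std::vector<uint8_t> &raw, std::size_t alignment)
    {
        const std::uintptr_t base = reinterpret_cast<std::uintptr_t>(raw.data());
        const std::uintptr_t up   = (base + alignment - 1) & ~(std::uintptr_t(alignment) - 1);
        return reinterpret_cast<uint8_t *>(up);
    }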
diff --git a/src/runtime/NEON/functions/NEDequantizationLayer.cpp b/src/runtime/NEON/functions/NEDequantizationLayer.cpp
index 91e37594af..28d19d2950 100644
--- a/src/runtime/NEON/functions/NEDequantizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEDequantizationLayer.cpp
@@ -26,19 +26,19 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/Tensor.h"
-#include "src/runtime/cpu/operators/CpuDequantize.h"
+
+#include "src/cpu/operators/CpuDequantize.h"
namespace arm_compute
{
struct NEDequantizationLayer::Impl
{
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuDequantize> op{ nullptr };
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuDequantize> op{nullptr};
};
-NEDequantizationLayer::NEDequantizationLayer()
- : _impl(std::make_unique<Impl>())
+NEDequantizationLayer::NEDequantizationLayer() : _impl(std::make_unique<Impl>())
{
}
NEDequantizationLayer::~NEDequantizationLayer() = default;
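NEDequantizationLayer now forwards to cpu::CpuDequantize, which applies the usual affine mapping. As scalar math, for the QASYMM8 case (dequantize_qasymm8 is a hypothetical helper):

    #include <cstdint>

    // real_value = scale * (quantized_value - zero_point)
    float dequantize_qasymm8(uint8_t q, float scale, int32_t zero_point)
    {
        return scale * static_cast<float>(static_cast<int32_t>(q) - zero_point);
    }

    // Example: scale = 0.5f, zero_point = 128 -> q = 130 dequantizes to 1.0f.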
diff --git a/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp b/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp
index 9e63800728..b347390162 100644
--- a/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp
+++ b/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,6 +27,8 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Validate.h"
+#include "src/common/utils/Log.h"
+
#include <cstddef>
#include <ios>
#include <list>
@@ -34,23 +36,36 @@
namespace arm_compute
{
NEDetectionPostProcessLayer::NEDetectionPostProcessLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _dequantize(), _detection_post_process(), _decoded_scores(), _run_dequantize(false)
+ : _memory_group(std::move(memory_manager)),
+ _dequantize(),
+ _detection_post_process(),
+ _decoded_scores(),
+ _run_dequantize(false)
{
}
-void NEDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, const ITensor *input_scores, const ITensor *input_anchors,
- ITensor *output_boxes, ITensor *output_classes, ITensor *output_scores, ITensor *num_detection, DetectionPostProcessLayerInfo info)
+void NEDetectionPostProcessLayer::configure(const ITensor *input_box_encoding,
+ const ITensor *input_scores,
+ const ITensor *input_anchors,
+ ITensor *output_boxes,
+ ITensor *output_classes,
+ ITensor *output_scores,
+ ITensor *num_detection,
+ DetectionPostProcessLayerInfo info)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores);
- ARM_COMPUTE_ERROR_THROW_ON(NEDetectionPostProcessLayer::validate(input_box_encoding->info(), input_scores->info(), input_anchors->info(), output_boxes->info(), output_classes->info(),
- output_scores->info(),
- num_detection->info(), info));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes,
+ output_scores);
+ ARM_COMPUTE_ERROR_THROW_ON(NEDetectionPostProcessLayer::validate(
+ input_box_encoding->info(), input_scores->info(), input_anchors->info(), output_boxes->info(),
+ output_classes->info(), output_scores->info(), num_detection->info(), info));
+ ARM_COMPUTE_LOG_PARAMS(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores,
+ num_detection, info);
const ITensor *input_scores_to_use = input_scores;
DetectionPostProcessLayerInfo info_to_use = info;
_run_dequantize = is_data_type_quantized(input_box_encoding->info()->data_type());
- if(_run_dequantize)
+ if (_run_dequantize)
{
_memory_group.manage(&_decoded_scores);
@@ -59,26 +74,37 @@ void NEDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, c
input_scores_to_use = &_decoded_scores;
// Create a new info struct to avoid dequantizing in the CPP layer
- std::array<float, 4> scales_values{ info.scale_value_y(), info.scale_value_x(), info.scale_value_h(), info.scale_value_w() };
- DetectionPostProcessLayerInfo info_quantized(info.max_detections(), info.max_classes_per_detection(), info.nms_score_threshold(), info.iou_threshold(), info.num_classes(),
- scales_values, info.use_regular_nms(), info.detection_per_class(), false);
+ std::array<float, 4> scales_values{info.scale_value_y(), info.scale_value_x(), info.scale_value_h(),
+ info.scale_value_w()};
+ DetectionPostProcessLayerInfo info_quantized(
+ info.max_detections(), info.max_classes_per_detection(), info.nms_score_threshold(), info.iou_threshold(),
+ info.num_classes(), scales_values, info.use_regular_nms(), info.detection_per_class(), false);
info_to_use = info_quantized;
}
- _detection_post_process.configure(input_box_encoding, input_scores_to_use, input_anchors, output_boxes, output_classes, output_scores, num_detection, info_to_use);
+ _detection_post_process.configure(input_box_encoding, input_scores_to_use, input_anchors, output_boxes,
+ output_classes, output_scores, num_detection, info_to_use);
_decoded_scores.allocator()->allocate();
}
-Status NEDetectionPostProcessLayer::validate(const ITensorInfo *input_box_encoding, const ITensorInfo *input_scores, const ITensorInfo *input_anchors,
- ITensorInfo *output_boxes, ITensorInfo *output_classes, ITensorInfo *output_scores, ITensorInfo *num_detection, DetectionPostProcessLayerInfo info)
+Status NEDetectionPostProcessLayer::validate(const ITensorInfo *input_box_encoding,
+ const ITensorInfo *input_scores,
+ const ITensorInfo *input_anchors,
+ ITensorInfo *output_boxes,
+ ITensorInfo *output_classes,
+ ITensorInfo *output_scores,
+ ITensorInfo *num_detection,
+ DetectionPostProcessLayerInfo info)
{
bool run_dequantize = is_data_type_quantized(input_box_encoding->data_type());
- if(run_dequantize)
+ if (run_dequantize)
{
TensorInfo decoded_classes_info = input_scores->clone()->set_is_resizable(true).set_data_type(DataType::F32);
ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayer::validate(input_scores, &decoded_classes_info));
}
- ARM_COMPUTE_RETURN_ON_ERROR(CPPDetectionPostProcessLayer::validate(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores, num_detection, info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CPPDetectionPostProcessLayer::validate(input_box_encoding, input_scores, input_anchors,
+ output_boxes, output_classes, output_scores,
+ num_detection, info));
return Status{};
}
@@ -88,7 +114,7 @@ void NEDetectionPostProcessLayer::run()
MemoryGroupResourceScope scope_mg(_memory_group);
// Decode scores if necessary
- if(_run_dequantize)
+ if (_run_dequantize)
{
_dequantize.run();
}
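run() above opens a MemoryGroupResourceScope so the managed _decoded_scores tensor only occupies memory while the function executes. A generic sketch of that acquire/release RAII idiom (ResourceScope is an illustrative stand-in, not the ACL class):

    #include <functional>
    #include <utility>

    class ResourceScope
    {
    public:
        explicit ResourceScope(std::function<void(bool)> acquire_release) : _f(std::move(acquire_release))
        {
            _f(true); // acquire the group's managed buffers on entry
        }
        ~ResourceScope()
        {
            _f(false); // release them on every exit path, including exceptions
        }

    private:
        std::function<void(bool)> _f;
    };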
diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
index 58530e4a8f..f1c2cf969f 100644
--- a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
@@ -27,17 +27,18 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/runtime/cpu/operators/CpuDirectConv2d.h"
+
+#include "src/cpu/operators/CpuDirectConv2d.h"
namespace arm_compute
{
struct NEDirectConvolutionLayer::Impl
{
- ITensor *src{ nullptr };
- const ITensor *weights{ nullptr };
- const ITensor *bias{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuDirectConv2d> op{ nullptr };
+ ITensor *src{nullptr};
+ const ITensor *weights{nullptr};
+ const ITensor *bias{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuDirectConv2d> op{nullptr};
};
NEDirectConvolutionLayer::NEDirectConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
@@ -46,17 +47,27 @@ NEDirectConvolutionLayer::NEDirectConvolutionLayer(std::shared_ptr<IMemoryManage
}
NEDirectConvolutionLayer::~NEDirectConvolutionLayer() = default;
-void NEDirectConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+void NEDirectConvolutionLayer::configure(ITensor *input,
+ const ITensor *weights,
+ const ITensor *bias,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info)
{
_impl->src = input;
_impl->weights = weights;
_impl->bias = bias;
_impl->dst = output;
_impl->op = std::make_unique<cpu::CpuDirectConv2d>(_memory_manager);
- _impl->op->configure(input->info(), weights->info(), (bias != nullptr ? bias->info() : nullptr), output->info(), conv_info, act_info);
+ _impl->op->configure(input->info(), weights->info(), (bias != nullptr ? bias->info() : nullptr), output->info(),
+ conv_info, act_info);
}
-Status NEDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &conv_info,
+Status NEDirectConvolutionLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
const ActivationLayerInfo &act_info)
{
return cpu::CpuDirectConv2d::validate(input, weights, bias, output, conv_info, act_info);
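Like the other functions in this patch, the static validate() mirrors configure() but operates on ITensorInfo only, so support can be probed before any tensor is allocated. A hedged usage sketch, relying on the Status-to-bool conversion already used by the method-selection code earlier in this patch:

    #include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"

    using namespace arm_compute;

    bool direct_conv_supported(const ITensorInfo         *src,
                               const ITensorInfo         *weights,
                               const ITensorInfo         *bias,
                               const ITensorInfo         *dst,
                               const PadStrideInfo       &conv_info,
                               const ActivationLayerInfo &act_info)
    {
        return bool(NEDirectConvolutionLayer::validate(src, weights, bias, dst, conv_info, act_info));
    }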
diff --git a/src/runtime/NEON/functions/NEElementwiseOperations.cpp b/src/runtime/NEON/functions/NEElementwiseOperations.cpp
index 946bbb24b8..685ef2d4d7 100644
--- a/src/runtime/NEON/functions/NEElementwiseOperations.cpp
+++ b/src/runtime/NEON/functions/NEElementwiseOperations.cpp
@@ -22,10 +22,11 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/NEON/functions/NEElementwiseOperations.h"
-#include "arm_compute/core/Validate.h"
-#include "src/runtime/cpu/operators/CpuElementwise.h"
#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/cpu/operators/CpuElementwise.h"
#include <utility>
@@ -33,17 +34,16 @@ namespace arm_compute
{
struct NEElementwiseMax::Impl
{
- const ITensor *src_0{ nullptr };
- const ITensor *src_1{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuElementwiseMax> op{ nullptr };
+ const ITensor *src_0{nullptr};
+ const ITensor *src_1{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuElementwiseMax> op{nullptr};
};
-NEElementwiseMax::NEElementwiseMax()
- : _impl(std::make_unique<Impl>())
+NEElementwiseMax::NEElementwiseMax() : _impl(std::make_unique<Impl>())
{
}
-NEElementwiseMax::NEElementwiseMax(NEElementwiseMax &&) = default;
+NEElementwiseMax::NEElementwiseMax(NEElementwiseMax &&) = default;
NEElementwiseMax &NEElementwiseMax::operator=(NEElementwiseMax &&) = default;
NEElementwiseMax::~NEElementwiseMax() = default;
@@ -57,7 +57,10 @@ void NEElementwiseMax::configure(ITensor *input1, ITensor *input2, ITensor *outp
_impl->op->configure(input1->info(), input2->info(), output->info());
}
-Status NEElementwiseMax::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status NEElementwiseMax::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
return cpu::CpuElementwiseMax::validate(input1, input2, output);
@@ -74,17 +77,16 @@ void NEElementwiseMax::run()
struct NEElementwiseMin::Impl
{
- const ITensor *src_0{ nullptr };
- const ITensor *src_1{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuElementwiseMin> op{ nullptr };
+ const ITensor *src_0{nullptr};
+ const ITensor *src_1{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuElementwiseMin> op{nullptr};
};
-NEElementwiseMin::NEElementwiseMin()
- : _impl(std::make_unique<Impl>())
+NEElementwiseMin::NEElementwiseMin() : _impl(std::make_unique<Impl>())
{
}
-NEElementwiseMin::NEElementwiseMin(NEElementwiseMin &&) = default;
+NEElementwiseMin::NEElementwiseMin(NEElementwiseMin &&) = default;
NEElementwiseMin &NEElementwiseMin::operator=(NEElementwiseMin &&) = default;
NEElementwiseMin::~NEElementwiseMin() = default;
@@ -98,7 +100,10 @@ void NEElementwiseMin::configure(ITensor *input1, ITensor *input2, ITensor *outp
_impl->op->configure(input1->info(), input2->info(), output->info());
}
-Status NEElementwiseMin::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status NEElementwiseMin::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
return cpu::CpuElementwiseMin::validate(input1, input2, output);
@@ -115,21 +120,23 @@ void NEElementwiseMin::run()
struct NEElementwiseSquaredDiff::Impl
{
- const ITensor *src_0{ nullptr };
- const ITensor *src_1{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuElementwiseSquaredDiff> op{ nullptr };
+ const ITensor *src_0{nullptr};
+ const ITensor *src_1{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuElementwiseSquaredDiff> op{nullptr};
};
-NEElementwiseSquaredDiff::NEElementwiseSquaredDiff()
- : _impl(std::make_unique<Impl>())
+NEElementwiseSquaredDiff::NEElementwiseSquaredDiff() : _impl(std::make_unique<Impl>())
{
}
-NEElementwiseSquaredDiff::NEElementwiseSquaredDiff(NEElementwiseSquaredDiff &&) = default;
+NEElementwiseSquaredDiff::NEElementwiseSquaredDiff(NEElementwiseSquaredDiff &&) = default;
NEElementwiseSquaredDiff &NEElementwiseSquaredDiff::operator=(NEElementwiseSquaredDiff &&) = default;
NEElementwiseSquaredDiff::~NEElementwiseSquaredDiff() = default;
-void NEElementwiseSquaredDiff::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
+void NEElementwiseSquaredDiff::configure(ITensor *input1,
+ ITensor *input2,
+ ITensor *output,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_UNUSED(act_info);
_impl->src_0 = input1;
@@ -139,7 +146,10 @@ void NEElementwiseSquaredDiff::configure(ITensor *input1, ITensor *input2, ITens
_impl->op->configure(input1->info(), input2->info(), output->info());
}
-Status NEElementwiseSquaredDiff::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status NEElementwiseSquaredDiff::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
return cpu::CpuElementwiseSquaredDiff::validate(input1, input2, output);
@@ -156,21 +166,23 @@ void NEElementwiseSquaredDiff::run()
struct NEElementwiseDivision::Impl
{
- const ITensor *src_0{ nullptr };
- const ITensor *src_1{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuElementwiseDivision> op{ nullptr };
+ const ITensor *src_0{nullptr};
+ const ITensor *src_1{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuElementwiseDivision> op{nullptr};
};
-NEElementwiseDivision::NEElementwiseDivision()
- : _impl(std::make_unique<Impl>())
+NEElementwiseDivision::NEElementwiseDivision() : _impl(std::make_unique<Impl>())
{
}
-NEElementwiseDivision::NEElementwiseDivision(NEElementwiseDivision &&) = default;
+NEElementwiseDivision::NEElementwiseDivision(NEElementwiseDivision &&) = default;
NEElementwiseDivision &NEElementwiseDivision::operator=(NEElementwiseDivision &&) = default;
NEElementwiseDivision::~NEElementwiseDivision() = default;
-void NEElementwiseDivision::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
+void NEElementwiseDivision::configure(ITensor *input1,
+ ITensor *input2,
+ ITensor *output,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_UNUSED(act_info);
_impl->src_0 = input1;
@@ -180,7 +192,10 @@ void NEElementwiseDivision::configure(ITensor *input1, ITensor *input2, ITensor
_impl->op->configure(input1->info(), input2->info(), output->info());
}
-Status NEElementwiseDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status NEElementwiseDivision::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
return cpu::CpuElementwiseDivision::validate(input1, input2, output);
@@ -197,21 +212,23 @@ void NEElementwiseDivision::run()
struct NEElementwisePower::Impl
{
- const ITensor *src_0{ nullptr };
- const ITensor *src_1{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuElementwisePower> op{ nullptr };
+ const ITensor *src_0{nullptr};
+ const ITensor *src_1{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuElementwisePower> op{nullptr};
};
-NEElementwisePower::NEElementwisePower()
- : _impl(std::make_unique<Impl>())
+NEElementwisePower::NEElementwisePower() : _impl(std::make_unique<Impl>())
{
}
-NEElementwisePower::NEElementwisePower(NEElementwisePower &&) = default;
+NEElementwisePower::NEElementwisePower(NEElementwisePower &&) = default;
NEElementwisePower &NEElementwisePower::operator=(NEElementwisePower &&) = default;
NEElementwisePower::~NEElementwisePower() = default;
-void NEElementwisePower::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
+void NEElementwisePower::configure(ITensor *input1,
+ ITensor *input2,
+ ITensor *output,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_UNUSED(act_info);
_impl->src_0 = input1;
@@ -221,7 +238,10 @@ void NEElementwisePower::configure(ITensor *input1, ITensor *input2, ITensor *ou
_impl->op->configure(input1->info(), input2->info(), output->info());
}
-Status NEElementwisePower::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status NEElementwisePower::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info)
{
ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
return cpu::CpuElementwisePower::validate(input1, input2, output);
@@ -239,22 +259,22 @@ void NEElementwisePower::run()
template <ComparisonOperation COP>
struct NEElementwiseComparisonStatic<COP>::Impl
{
- const ITensor *src_0{ nullptr };
- const ITensor *src_1{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuElementwiseComparisonStatic<COP>> op{ nullptr };
+ const ITensor *src_0{nullptr};
+ const ITensor *src_1{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuElementwiseComparisonStatic<COP>> op{nullptr};
};
template <ComparisonOperation COP>
-NEElementwiseComparisonStatic<COP>::NEElementwiseComparisonStatic()
- : _impl(std::make_unique<Impl>())
+NEElementwiseComparisonStatic<COP>::NEElementwiseComparisonStatic() : _impl(std::make_unique<Impl>())
{
}
template <ComparisonOperation COP>
NEElementwiseComparisonStatic<COP>::NEElementwiseComparisonStatic(NEElementwiseComparisonStatic &&) = default;
-template <ComparisonOperation COP>
-NEElementwiseComparisonStatic<COP> &NEElementwiseComparisonStatic<COP>::operator=(NEElementwiseComparisonStatic &&) = default;
-template <ComparisonOperation COP>
+template <ComparisonOperation COP>
+NEElementwiseComparisonStatic<COP> &
+NEElementwiseComparisonStatic<COP>::operator=(NEElementwiseComparisonStatic &&) = default;
+template <ComparisonOperation COP>
NEElementwiseComparisonStatic<COP>::~NEElementwiseComparisonStatic() = default;
template <ComparisonOperation COP>
@@ -268,13 +288,15 @@ void NEElementwiseComparisonStatic<COP>::configure(ITensor *input1, ITensor *inp
}
template <ComparisonOperation COP>
-Status NEElementwiseComparisonStatic<COP>::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+Status NEElementwiseComparisonStatic<COP>::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output)
{
return cpu::CpuElementwiseComparisonStatic<COP>::validate(input1, input2, output);
}
template <ComparisonOperation COP>
-void NEElementwiseComparisonStatic<COP>::run()
+void NEElementwiseComparisonStatic<COP>::run()
{
ITensorPack pack;
pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0);
@@ -285,17 +307,16 @@ void NEElementwiseComparisonStatic<COP>::run()
struct NEElementwiseComparison::Impl
{
- const ITensor *src_0{ nullptr };
- const ITensor *src_1{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuElementwiseComparison> op{ nullptr };
+ const ITensor *src_0{nullptr};
+ const ITensor *src_1{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuElementwiseComparison> op{nullptr};
};
-NEElementwiseComparison::NEElementwiseComparison()
- : _impl(std::make_unique<Impl>())
+NEElementwiseComparison::NEElementwiseComparison() : _impl(std::make_unique<Impl>())
{
}
-NEElementwiseComparison::NEElementwiseComparison(NEElementwiseComparison &&) = default;
+NEElementwiseComparison::NEElementwiseComparison(NEElementwiseComparison &&) = default;
NEElementwiseComparison &NEElementwiseComparison::operator=(NEElementwiseComparison &&) = default;
NEElementwiseComparison::~NEElementwiseComparison() = default;
@@ -308,7 +329,10 @@ void NEElementwiseComparison::configure(ITensor *input1, ITensor *input2, ITenso
_impl->op->configure(input1->info(), input2->info(), output->info(), op);
}
-Status NEElementwiseComparison::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation op)
+Status NEElementwiseComparison::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ ComparisonOperation op)
{
return cpu::CpuElementwiseComparison::validate(input1, input2, output, op);
}
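
All of the binary elementwise functions in this file share one pattern: configure() caches raw tensor pointers in the PImpl struct and builds a stateless cpu:: operator from the tensors' ITensorInfo, while run() re-packs those pointers into an ITensorPack on every invocation. A minimal sketch of the pattern, using a hypothetical MyBinaryFunction whose Impl matches the structs above:

void MyBinaryFunction::run()
{
    // Rebuild the pack on each call: the cpu:: operator holds no tensor state of its own.
    ITensorPack pack;
    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0); // first input, cached by configure()
    pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1); // second input
    pack.add_tensor(TensorType::ACL_DST, _impl->dst);     // destination
    _impl->op->run(pack);
}

Keeping the tensors out of the operator is what lets the same cpu:: operator be reused across workloads; only the pack changes between runs.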
diff --git a/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp b/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp
index 1a9e8839ca..23a092c407 100644
--- a/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp
+++ b/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp
@@ -22,7 +22,9 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayer.h"
-#include "src/runtime/cpu/operators/CpuElementwiseUnary.h"
+
+#include "src/cpu/operators/CpuElementwiseUnary.h"
+
#include <utility>
namespace arm_compute
@@ -32,21 +34,20 @@ using OperatorType = cpu::CpuElementwiseUnary;
template <ElementWiseUnary op>
struct NEElementwiseUnaryLayer<op>::Impl
{
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<OperatorType> cpu_op{ nullptr };
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<OperatorType> cpu_op{nullptr};
};
template <ElementWiseUnary op>
-NEElementwiseUnaryLayer<op>::NEElementwiseUnaryLayer()
- : _impl(std::make_unique<Impl>())
+NEElementwiseUnaryLayer<op>::NEElementwiseUnaryLayer() : _impl(std::make_unique<Impl>())
{
}
template <ElementWiseUnary op>
NEElementwiseUnaryLayer<op>::~NEElementwiseUnaryLayer() = default;
template <ElementWiseUnary op>
NEElementwiseUnaryLayer<op>::NEElementwiseUnaryLayer(NEElementwiseUnaryLayer &&) = default;
-template <ElementWiseUnary op>
+template <ElementWiseUnary op>
NEElementwiseUnaryLayer<op> &NEElementwiseUnaryLayer<op>::operator=(NEElementwiseUnaryLayer &&) = default;
template <ElementWiseUnary op>
@@ -65,7 +66,7 @@ Status NEElementwiseUnaryLayer<op>::validate(const ITensorInfo *input, const ITe
}
template <ElementWiseUnary op>
-void NEElementwiseUnaryLayer<op>::run()
+void NEElementwiseUnaryLayer<op>::run()
{
ITensorPack pack;
pack.add_tensor(TensorType::ACL_SRC, _impl->src);
diff --git a/src/runtime/NEON/functions/NEFFT1D.cpp b/src/runtime/NEON/functions/NEFFT1D.cpp
index e72488f0f6..fb75f9da29 100644
--- a/src/runtime/NEON/functions/NEFFT1D.cpp
+++ b/src/runtime/NEON/functions/NEFFT1D.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEFFTDigitReverseKernel.h"
#include "src/core/NEON/kernels/NEFFTRadixStageKernel.h"
#include "src/core/NEON/kernels/NEFFTScaleKernel.h"
@@ -36,7 +38,15 @@ namespace arm_compute
NEFFT1D::~NEFFT1D() = default;
NEFFT1D::NEFFT1D(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _digit_reverse_kernel(), _fft_kernels(), _scale_kernel(), _digit_reversed_input(), _digit_reverse_indices(), _num_ffts(0), _axis(0), _run_scale(false)
+ : _memory_group(std::move(memory_manager)),
+ _digit_reverse_kernel(),
+ _fft_kernels(),
+ _scale_kernel(),
+ _digit_reversed_input(),
+ _digit_reverse_indices(),
+ _num_ffts(0),
+ _axis(0),
+ _run_scale(false)
{
}
@@ -44,6 +54,7 @@ void NEFFT1D::configure(const ITensor *input, ITensor *output, const FFT1DInfo &
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_THROW_ON(NEFFT1D::validate(input->info(), output->info(), config));
+ ARM_COMPUTE_LOG_PARAMS(input, output, config);
// Decompose size to radix factors
const auto supported_radix = NEFFTRadixStageKernel::supported_radix();
@@ -72,7 +83,7 @@ void NEFFT1D::configure(const ITensor *input, ITensor *output, const FFT1DInfo &
_fft_kernels.resize(_num_ffts);
_axis = config.axis;
- for(unsigned int i = 0; i < _num_ffts; ++i)
+ for (unsigned int i = 0; i < _num_ffts; ++i)
{
const unsigned int radix_for_stage = decomposed_vector.at(i);
@@ -82,19 +93,21 @@ void NEFFT1D::configure(const ITensor *input, ITensor *output, const FFT1DInfo &
fft_kernel_info.Nx = Nx;
fft_kernel_info.is_first_stage = (i == 0);
_fft_kernels[i] = std::make_unique<NEFFTRadixStageKernel>();
- _fft_kernels[i]->configure(&_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info);
+ _fft_kernels[i]->configure(&_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr,
+ fft_kernel_info);
Nx *= radix_for_stage;
}
// Configure scale kernel
- if(_run_scale)
+ if (_run_scale)
{
FFTScaleKernelInfo scale_config;
scale_config.scale = static_cast<float>(N);
scale_config.conjugate = config.direction == FFTDirection::Inverse;
_scale_kernel = std::make_unique<NEFFTScaleKernel>();
- is_c2r ? _scale_kernel->configure(&_digit_reversed_input, output, scale_config) : _scale_kernel->configure(output, nullptr, scale_config);
+ is_c2r ? _scale_kernel->configure(&_digit_reversed_input, output, scale_config)
+ : _scale_kernel->configure(output, nullptr, scale_config);
}
// Allocate tensors
@@ -111,7 +124,7 @@ Status NEFFT1D::validate(const ITensorInfo *input, const ITensorInfo *output, co
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() > 2);
- ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0);
+ ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({0, 1}).count(config.axis) == 0);
// Check if FFT is decomposable
const auto supported_radix = NEFFTRadixStageKernel::supported_radix();
@@ -120,7 +133,7 @@ Status NEFFT1D::validate(const ITensorInfo *input, const ITensorInfo *output, co
ARM_COMPUTE_RETURN_ERROR_ON(decomposed_vector.empty());
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
        // All combinations are supported except real input with real output (i.e., both input and output restricted to a single channel)
ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() == 1 && input->num_channels() == 1);
@@ -138,13 +151,13 @@ void NEFFT1D::run()
NEScheduler::get().schedule(_digit_reverse_kernel.get(), (_axis == 0 ? Window::DimY : Window::DimZ));
- for(unsigned int i = 0; i < _num_ffts; ++i)
+ for (unsigned int i = 0; i < _num_ffts; ++i)
{
NEScheduler::get().schedule(_fft_kernels[i].get(), (_axis == 0 ? Window::DimY : Window::DimX));
}
// Run output scaling
- if(_run_scale)
+ if (_run_scale)
{
NEScheduler::get().schedule(_scale_kernel.get(), Window::DimY);
}
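
Both configure() and validate() above rely on helpers::fft::decompose_stages(), which factors the transform length into the radices the kernel supports and returns an empty vector when no factorization exists. A standalone sketch of one such decomposition (greedy, largest radix first; an illustration, not the library's helper):

#include <set>
#include <vector>

// Factor N into supported radices; an empty result means N is not decomposable.
std::vector<unsigned int> decompose_sketch(unsigned int N, const std::set<unsigned int> &supported_radix)
{
    std::vector<unsigned int> stages;
    for (auto it = supported_radix.rbegin(); it != supported_radix.rend(); ++it)
    {
        while (N % *it == 0)
        {
            stages.push_back(*it);
            N /= *it;
        }
    }
    return (N == 1) ? stages : std::vector<unsigned int>{};
}

For example, with radices {2, 3, 4, 5, 7, 8}, N = 96 factors as {8, 4, 3}; each factor then becomes one NEFFTRadixStageKernel, with Nx accumulating the product of the radices configured so far.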
diff --git a/src/runtime/NEON/functions/NEFFT2D.cpp b/src/runtime/NEON/functions/NEFFT2D.cpp
index 3b787cd523..066909221d 100644
--- a/src/runtime/NEON/functions/NEFFT2D.cpp
+++ b/src/runtime/NEON/functions/NEFFT2D.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,16 +26,18 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/Scheduler.h"
-#include "src/core/NEON/kernels/NEFFTDigitReverseKernel.h"
-#include "src/core/NEON/kernels/NEFFTRadixStageKernel.h"
-#include "src/core/NEON/kernels/NEFFTScaleKernel.h"
+
+#include "src/common/utils/Log.h"
namespace arm_compute
{
NEFFT2D::~NEFFT2D() = default;
NEFFT2D::NEFFT2D(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(memory_manager), _first_pass_func(memory_manager), _second_pass_func(memory_manager), _first_pass_tensor()
+ : _memory_group(memory_manager),
+ _first_pass_func(memory_manager),
+ _second_pass_func(memory_manager),
+ _first_pass_tensor()
{
}
@@ -43,6 +45,7 @@ void NEFFT2D::configure(const ITensor *input, ITensor *output, const FFT2DInfo &
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_THROW_ON(NEFFT2D::validate(input->info(), output->info(), config));
+ ARM_COMPUTE_LOG_PARAMS(input, output, config);
// Setup first pass
FFT1DInfo first_pass_config;
@@ -79,7 +82,7 @@ Status NEFFT2D::validate(const ITensorInfo *input, const ITensorInfo *output, co
ARM_COMPUTE_RETURN_ON_ERROR(NEFFT1D::validate(&first_pass_tensor, output, second_pass_config));
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
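
NEFFT2D is a composition of two NEFFT1D passes: the first transforms the input along one axis into _first_pass_tensor, and the second transforms that intermediate along the other axis into the output. A sketch of the setup inside configure(), assuming FFT2DInfo carries the two axes and the direction (shape and memory management elided):

FFT1DInfo first_pass_config;
first_pass_config.axis      = config.axis_0;
first_pass_config.direction = config.direction;

FFT1DInfo second_pass_config;
second_pass_config.axis      = config.axis_1;
second_pass_config.direction = config.direction;

_first_pass_func.configure(input, &_first_pass_tensor, first_pass_config);    // 1D FFT along axis_0
_second_pass_func.configure(&_first_pass_tensor, output, second_pass_config); // 1D FFT along axis_1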
diff --git a/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp b/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp
index 56fc2e4a2b..94f85e5ffa 100644
--- a/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp
@@ -25,14 +25,16 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/NEON/kernels/NEFFTDigitReverseKernel.h"
#include "src/core/NEON/kernels/NEFFTRadixStageKernel.h"
#include "src/core/NEON/kernels/NEFFTScaleKernel.h"
#include "src/core/NEON/kernels/NEPadLayerKernel.h"
#include "src/core/NEON/kernels/NEReductionOperationKernel.h"
-#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/utils/helpers/fft.h"
namespace arm_compute
@@ -45,11 +47,11 @@ int pad_decomposable(int N)
int pad = 0;
bool is_decomposed = false;
- while(!is_decomposed)
+ while (!is_decomposed)
{
const auto decomposed_vector = arm_compute::helpers::fft::decompose_stages(N++, supported_radix);
is_decomposed = !decomposed_vector.empty();
- if(!is_decomposed)
+ if (!is_decomposed)
{
++pad;
}
@@ -101,10 +103,16 @@ NEFFTConvolutionLayer::NEFFTConvolutionLayer(std::shared_ptr<IMemoryManager> mem
}
NEFFTConvolutionLayer::~NEFFTConvolutionLayer() = default;
-void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info, bool enable_fast_math)
+void NEFFTConvolutionLayer::configure(ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
{
ARM_COMPUTE_UNUSED(enable_fast_math);
+ ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, act_info, enable_fast_math);
_original_weights = weights;
_original_bias = biases;
@@ -113,21 +121,24 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co
_has_bias = biases != nullptr;
// Get indices for the width and height
- const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
- const size_t idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+ const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_height =
+ get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
// Input shape, kernel size and output tile
- const Size2D input_dims = Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]);
- const Size2D kernel_size = Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]);
- const Size2D pad_valid = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1),
- pad_decomposable(input_dims.y() + kernel_size.y() - 1));
+ const Size2D input_dims =
+ Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]);
+ const Size2D kernel_size =
+ Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]);
+ const Size2D pad_valid = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1),
+ pad_decomposable(input_dims.y() + kernel_size.y() - 1));
// Tensors to use
ITensor *input_to_use = input;
const ITensor *weights_to_use = weights;
ITensor *output_to_use = _has_bias ? &_bias_output : output;
// Permute bias
- if(biases != nullptr)
+ if (biases != nullptr)
{
_permute_bias_func.configure(biases, &_permuted_bias, PermutationVector(1U, 2U, 0U));
_permuted_bias.info()->set_data_layout(DataLayout::NCHW);
@@ -135,7 +146,7 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co
// Permute input if needed
_needs_permute = input->info()->data_layout() == DataLayout::NHWC;
- if(_needs_permute)
+ if (_needs_permute)
{
_memory_group.manage(&_permuted_input);
// Configure the function to transform the input tensor from NHWC -> NCHW
@@ -156,7 +167,7 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co
_flip_weights_func.configure(weights_to_use, &_flipped_weights, &_flip_axis);
// Pad weights
- const PaddingList padding_w = { { 0, input_dims.x() + pad_valid.x() - 1 }, { 0, input_dims.y() + pad_valid.y() - 1 } };
+ const PaddingList padding_w = {{0, input_dims.x() + pad_valid.x() - 1}, {0, input_dims.y() + pad_valid.y() - 1}};
_pad_weights_func.configure(&_flipped_weights, &_padded_weights, padding_w);
// Transform weights
@@ -164,10 +175,10 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co
_transform_weights_func->configure(&_padded_weights, &_transformed_weights, FFT2DInfo());
// Pad input
- const PaddingList padding_in = { { 0, kernel_size.x() + pad_valid.x() - 1 }, { 0, kernel_size.y() + pad_valid.y() - 1 } };
+ const PaddingList padding_in = {{0, kernel_size.x() + pad_valid.x() - 1}, {0, kernel_size.y() + pad_valid.y() - 1}};
_memory_group.manage(&_padded_input);
_pad_input_func.configure(input_to_use, &_padded_input, padding_in);
- if(_needs_permute)
+ if (_needs_permute)
{
_permuted_input.allocator()->allocate();
}
@@ -191,7 +202,8 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co
_memory_group.manage(&_itransformed_output);
    FFT2DInfo itransform_info;
    itransform_info.direction = FFTDirection::Inverse;
- _itransformed_output.allocator()->init(_output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding());
+ _itransformed_output.allocator()->init(
+ _output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding());
    _itransform_output_func.configure(&_output_reduced, &_itransformed_output, itransform_info);
_output_reduced.allocator()->allocate();
@@ -203,26 +215,29 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co
// Extract correct region
const int start_left = kernel_size.x() - conv_info.pad_left() - 1;
const int start_top = kernel_size.y() - conv_info.pad_top() - 1;
- const int end_right = _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x();
- const int end_botton = _reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y();
- if(_has_bias)
+ const int end_right =
+ _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x();
+    const int end_bottom =
+ _reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y();
+ if (_has_bias)
{
_memory_group.manage(&_bias_output);
}
- else if(_needs_permute)
+ else if (_needs_permute)
{
output_to_use = &_permuted_output;
_memory_group.manage(&_permuted_output);
}
- _extract_output_func.configure(&_reshaped_output, output_to_use, Coordinates(start_left, start_top), Coordinates(end_right, end_botton));
+ _extract_output_func.configure(&_reshaped_output, output_to_use, Coordinates(start_left, start_top),
+                                   Coordinates(end_right, end_bottom));
_reshaped_output.allocator()->allocate();
_itransformed_output.allocator()->allocate();
// Add bias
- if(biases != nullptr)
+ if (biases != nullptr)
{
output_to_use = output;
- if(_needs_permute)
+ if (_needs_permute)
{
output_to_use = &_permuted_output;
_memory_group.manage(&_permuted_output);
@@ -233,7 +248,7 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co
}
// Permute output
- if(_needs_permute)
+ if (_needs_permute)
{
        // Configure the function to transform the convolved output to ACL's native ordering format NCHW
_permuted_output.info()->set_data_layout(DataLayout::NCHW);
@@ -245,7 +260,7 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co
// Configure Activation Layer
_is_activationlayer_enabled = act_info.enabled();
- if(_is_activationlayer_enabled)
+ if (_is_activationlayer_enabled)
{
_activation_layer_func.configure(output, nullptr, act_info);
}
@@ -258,8 +273,13 @@ void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, co
axis_data[1] = 1;
}
-Status NEFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info, bool enable_fast_math)
+Status NEFFTConvolutionLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
{
ARM_COMPUTE_UNUSED(enable_fast_math);
@@ -277,11 +297,13 @@ Status NEFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorIn
const auto strides = conv_info.stride();
ARM_COMPUTE_RETURN_ERROR_ON(strides.first != strides.second && strides.first != 1);
ARM_COMPUTE_RETURN_ERROR_ON(kernel_size.x() != kernel_size.y());
- ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) || conv_info.pad_right() != (kernel_size.x() / 2));
- ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) || conv_info.pad_bottom() != (kernel_size.y() / 2));
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) ||
+ conv_info.pad_right() != (kernel_size.x() / 2));
+ ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) ||
+ conv_info.pad_bottom() != (kernel_size.y() / 2));
// Validate biases
- if(biases != nullptr)
+ if (biases != nullptr)
{
const size_t idx_channels = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
@@ -289,13 +311,14 @@ Status NEFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorIn
}
// Checks performed when output is configured
- if((output != nullptr) && (output->total_size() != 0))
+ if ((output != nullptr) && (output->total_size() != 0))
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) || (input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width]));
+ ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) ||
+ (input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width]));
// Validate Activation Layer
- if(act_info.enabled())
+ if (act_info.enabled())
{
ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info));
}
@@ -311,7 +334,7 @@ void NEFFTConvolutionLayer::run()
MemoryGroupResourceScope scope_mg(_memory_group);
// Transform input
- if(_needs_permute)
+ if (_needs_permute)
{
_permute_input_func.run();
}
@@ -329,17 +352,17 @@ void NEFFTConvolutionLayer::run()
_extract_output_func.run();
// Add bias
- if(_has_bias)
+ if (_has_bias)
{
_bias_add_func.run();
}
- if(_needs_permute)
+ if (_needs_permute)
{
_permute_output_func.run();
}
// Run activation layer
- if(_is_activationlayer_enabled)
+ if (_is_activationlayer_enabled)
{
_activation_layer_func.run();
}
@@ -347,10 +370,10 @@ void NEFFTConvolutionLayer::run()
void NEFFTConvolutionLayer::prepare()
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
// Permute bias to NCHW
- if(_original_bias != nullptr)
+ if (_original_bias != nullptr)
{
_permuted_bias.allocator()->allocate();
_permute_bias_func.run();
@@ -360,7 +383,7 @@ void NEFFTConvolutionLayer::prepare()
const ITensor *cur_weights = _original_weights;
// Permute weights
- if(_needs_permute)
+ if (_needs_permute)
{
ARM_COMPUTE_ERROR_ON(!cur_weights->is_used());
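
pad_decomposable() near the top of this file drives the FFT sizing: the spatial extent input + kernel - 1 required by linear convolution is grown until it factors into supported radices. An equivalent formulation of that loop, kept as a sketch under the same decompose_stages() contract (an empty vector means not decomposable):

// How many elements must be appended so that N becomes FFT-decomposable.
int pad_decomposable_sketch(int N)
{
    const auto supported_radix = NEFFTRadixStageKernel::supported_radix();
    int pad = 0;
    while (arm_compute::helpers::fft::decompose_stages(N + pad, supported_radix).empty())
    {
        ++pad;
    }
    return pad;
}

For instance, N = 61 is prime and 62 = 2 * 31 contains the unsupported factor 31, but 63 = 7 * 3 * 3 decomposes, so the function returns 2.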
diff --git a/src/runtime/NEON/functions/NEFill.cpp b/src/runtime/NEON/functions/NEFill.cpp
index ee539fdfc8..bc1d5b7f5c 100644
--- a/src/runtime/NEON/functions/NEFill.cpp
+++ b/src/runtime/NEON/functions/NEFill.cpp
@@ -24,7 +24,8 @@
#include "arm_compute/runtime/NEON/functions/NEFill.h"
#include "arm_compute/core/Validate.h"
-#include "src/runtime/cpu/operators/CpuFill.h"
+
+#include "src/cpu/operators/CpuFill.h"
#include <utility>
@@ -32,15 +33,14 @@ namespace arm_compute
{
struct NEFill::Impl
{
- ITensor *tensor{ nullptr };
- std::unique_ptr<cpu::CpuFill> op{ nullptr };
+ ITensor *tensor{nullptr};
+ std::unique_ptr<cpu::CpuFill> op{nullptr};
};
-NEFill::NEFill()
- : _impl(std::make_unique<Impl>())
+NEFill::NEFill() : _impl(std::make_unique<Impl>())
{
}
-NEFill::NEFill(NEFill &&) = default;
+NEFill::NEFill(NEFill &&) = default;
NEFill &NEFill::operator=(NEFill &&) = default;
NEFill::~NEFill() = default;
diff --git a/src/runtime/NEON/functions/NEFillBorder.cpp b/src/runtime/NEON/functions/NEFillBorder.cpp
index 256aad6d3f..a3ab9c3db4 100644
--- a/src/runtime/NEON/functions/NEFillBorder.cpp
+++ b/src/runtime/NEON/functions/NEFillBorder.cpp
@@ -25,17 +25,22 @@
#include "arm_compute/core/Window.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEFillBorderKernel.h"
namespace arm_compute
{
-NEFillBorder::NEFillBorder()
- : _border_handler(nullptr)
+NEFillBorder::NEFillBorder() : _border_handler(nullptr)
{
}
-void NEFillBorder::configure(ITensor *input, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value)
+void NEFillBorder::configure(ITensor *input,
+ unsigned int border_width,
+ BorderMode border_mode,
+ const PixelValue &constant_border_value)
{
+ ARM_COMPUTE_LOG_PARAMS(input, border_width, border_mode, constant_border_value);
_border_handler = std::make_unique<NEFillBorderKernel>();
_border_handler->configure(input, BorderSize(border_width), border_mode, constant_border_value);
}
@@ -44,4 +49,4 @@ void NEFillBorder::run()
{
NEScheduler::get().schedule(_border_handler.get(), Window::DimZ);
}
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEFlattenLayer.cpp b/src/runtime/NEON/functions/NEFlattenLayer.cpp
index 4d1054ad25..56db2be3fa 100644
--- a/src/runtime/NEON/functions/NEFlattenLayer.cpp
+++ b/src/runtime/NEON/functions/NEFlattenLayer.cpp
@@ -24,25 +24,25 @@
#include "arm_compute/runtime/NEON/functions/NEFlattenLayer.h"
#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
#include "src/core/helpers/AutoConfiguration.h"
-#include "src/runtime/cpu/operators/CpuFlatten.h"
+#include "src/cpu/operators/CpuFlatten.h"
namespace arm_compute
{
struct NEFlattenLayer::Impl
{
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuFlatten> op{ nullptr };
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuFlatten> op{nullptr};
};
-NEFlattenLayer::NEFlattenLayer()
- : _impl(std::make_unique<Impl>())
+NEFlattenLayer::NEFlattenLayer() : _impl(std::make_unique<Impl>())
{
}
-NEFlattenLayer::NEFlattenLayer(NEFlattenLayer &&) = default;
+NEFlattenLayer::NEFlattenLayer(NEFlattenLayer &&) = default;
NEFlattenLayer &NEFlattenLayer::operator=(NEFlattenLayer &&) = default;
NEFlattenLayer::~NEFlattenLayer() = default;
@@ -51,7 +51,8 @@ void NEFlattenLayer::configure(const ITensor *input, ITensor *output)
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
_impl->src = input;
_impl->dst = output;
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input->info())));
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(
+ misc::shape_calculator::compute_flatten_shape(input->info())));
_impl->op = std::make_unique<cpu::CpuFlatten>();
_impl->op->configure(_impl->src->info(), _impl->dst->info());
@@ -60,9 +61,10 @@ void NEFlattenLayer::configure(const ITensor *input, ITensor *output)
Status NEFlattenLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
{
// Checks performed when output is configured
- if(output->total_size() != 0)
+ if (output->total_size() != 0)
{
- const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input));
+ const TensorInfo tensor_info_output =
+ input->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
}
return cpu::CpuFlatten::validate(input, output);
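
The auto_init_if_empty() call above derives the output from compute_flatten_shape(), which collapses the leading width, height and channel dimensions into a single dimension while preserving the batch dimension. As a worked example with a hypothetical shape, a [W=7, H=7, C=256, N=4] input flattens to [7 * 7 * 256, 4] = [12544, 4]; validate() then rejects any preconfigured output whose shape disagrees with that computation.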
diff --git a/src/runtime/NEON/functions/NEFloor.cpp b/src/runtime/NEON/functions/NEFloor.cpp
index f8a3c13d6d..112c93c478 100644
--- a/src/runtime/NEON/functions/NEFloor.cpp
+++ b/src/runtime/NEON/functions/NEFloor.cpp
@@ -24,22 +24,22 @@
#include "arm_compute/runtime/NEON/functions/NEFloor.h"
#include "arm_compute/core/Validate.h"
-#include "src/runtime/cpu/operators/CpuFloor.h"
+
+#include "src/cpu/operators/CpuFloor.h"
namespace arm_compute
{
struct NEFloor::Impl
{
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuFloor> op{ nullptr };
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuFloor> op{nullptr};
};
-NEFloor::NEFloor()
- : _impl(std::make_unique<Impl>())
+NEFloor::NEFloor() : _impl(std::make_unique<Impl>())
{
}
-NEFloor::NEFloor(NEFloor &&) = default;
+NEFloor::NEFloor(NEFloor &&) = default;
NEFloor &NEFloor::operator=(NEFloor &&) = default;
NEFloor::~NEFloor() = default;
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
index f469a0bdab..2656d0fa0f 100644
--- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,469 +23,138 @@
*/
#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensorPack.h"
-#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
-#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
-#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
-#include "src/core/cpu/kernels/CpuTransposeKernel.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h"
-#include <cmath>
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/cpu/operators/CpuFullyConnected.h"
namespace arm_compute
{
-using namespace arm_compute::misc::shape_calculator;
+using namespace arm_compute::experimental;
-namespace
+struct NEFullyConnectedLayer::Impl
{
-// Get the min/max bounds of a quantized asymmetric output tensor, with the effect of fused activation
-std::pair<PixelValue, PixelValue> get_quantized_asymmetric_output_min_max(const QuantizationInfo &q_info, const ActivationLayerInfo &act_info, DataType data_type)
-{
- PixelValue type_min{};
- PixelValue type_max{};
- std::tie(type_min, type_max) = get_min_max(data_type);
- const UniformQuantizationInfo q_unif = q_info.uniform();
-
- if(act_info.enabled())
- {
- switch(act_info.activation())
- {
- case ActivationLayerInfo::ActivationFunction::RELU:
- type_min = PixelValue(q_unif.offset);
- break;
- case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
- type_min = PixelValue(q_unif.offset);
- type_max = PixelValue(act_info.a(), data_type, q_info);
- break;
- case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
- type_min = PixelValue(act_info.b(), data_type, q_info);
- type_max = PixelValue(act_info.a(), data_type, q_info);
- break;
- default:
- ARM_COMPUTE_ERROR("Activation function not supported.");
- break;
- }
- }
-
- return std::make_pair(type_min, type_max);
-}
+ MemoryGroup memory_group{};
+ IWeightsManager *weights_manager{nullptr};
-Status get_gemmlowp_output_stage_info(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const ActivationLayerInfo &act,
- GEMMLowpOutputStageInfo &gemmlowp_output_stage_info)
-{
- const auto data_type = input->data_type();
- const QuantizationInfo oq_info = output->quantization_info();
- const UniformQuantizationInfo iq_unif = input->quantization_info().uniform();
- const UniformQuantizationInfo wq_unif = weights->quantization_info().uniform();
- const UniformQuantizationInfo oq_unif = oq_info.uniform();
-
- float multiplier = (iq_unif.scale * wq_unif.scale) / oq_unif.scale;
- int32_t output_multiplier;
- int32_t output_shift;
-
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
-
- PixelValue type_min{};
- PixelValue type_max{};
- std::tie(type_min, type_max) = get_quantized_asymmetric_output_min_max(oq_info, act, data_type);
+ std::unique_ptr<cpu::CpuFullyConnected> op{nullptr};
- gemmlowp_output_stage_info.gemmlowp_multiplier = output_multiplier;
- gemmlowp_output_stage_info.gemmlowp_shift = output_shift;
- gemmlowp_output_stage_info.gemmlowp_offset = oq_unif.offset;
- gemmlowp_output_stage_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
- gemmlowp_output_stage_info.gemmlowp_min_bound = type_min.get<int32_t>();
- gemmlowp_output_stage_info.gemmlowp_max_bound = type_max.get<int32_t>();
+ const ITensor *original_weights{nullptr};
- return Status{};
-}
-
-Status validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ActivationLayerInfo &act)
-{
- if(is_data_type_quantized_asymmetric(input->data_type()))
- {
- // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
- // Extract and negate input and weights offset
- const QuantizationInfo input_quantization_info(input->quantization_info().uniform().scale, -input->quantization_info().uniform().offset);
- const QuantizationInfo weights_quantization_info(weights->quantization_info().uniform().scale, -weights->quantization_info().uniform().offset);
+ ITensorPack run_pack{};
+ WorkspaceData<Tensor> workspace{};
+ experimental::MemoryRequirements aux_mem_req{};
- GEMMLowpOutputStageInfo gemmlowp_output_stage_info;
- ARM_COMPUTE_RETURN_ON_ERROR(get_gemmlowp_output_stage_info(input, weights, output, act, gemmlowp_output_stage_info));
-
- GEMMInfo gemm_info;
- gemm_info.set_gemmlowp_output_stage(gemmlowp_output_stage_info);
-
- // Validate gemmlowp function
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate(&input->clone()->set_quantization_info(input_quantization_info),
- &weights->clone()->set_quantization_info(weights_quantization_info),
- biases,
- output,
- gemm_info));
- }
- else
- {
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate(input, weights, biases, output, 1.f, 1.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run */)));
- }
-
- return Status{};
-}
-} // namespace
+ bool is_prepared{false};
+ bool dynamic_weights{false};
+};
NEFullyConnectedLayer::~NEFullyConnectedLayer() = default;
-NEFullyConnectedLayer::NEFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
- : _memory_group(std::move(memory_manager)), _weights_manager(weights_manager), _flatten(), _convert_weights(), _convert_weights_managed(), _reshape_weights_function(),
- _reshape_weights_managed_function(), _mm_gemm(nullptr, weights_manager), _mm_gemmlowp(nullptr, weights_manager), _flatten_output(), _converted_weights_output(), _reshape_weights_output(),
- _original_weights(nullptr), _are_weights_converted(true), _are_weights_reshaped(false), _is_fc_after_conv(false), _is_quantized_asymmetric(false), _is_prepared(false)
-{
-}
-
-void NEFullyConnectedLayer::configure_mm(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const ActivationLayerInfo &act)
-{
- if(_is_quantized_asymmetric)
- {
- // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
- // Extract and negate input and weights offset
- const QuantizationInfo input_quantization_info = input->info()->quantization_info();
- const QuantizationInfo weights_quantization_info = weights->info()->quantization_info();
-
- input->info()->set_quantization_info(QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset));
- weights->info()->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
-
- // Configure gemmlowp function and output stage for asymmetric quantized types
- GEMMLowpOutputStageInfo gemmlowp_output_stage_info;
- const Status status = get_gemmlowp_output_stage_info(input->info(), weights->info(), output->info(), act, gemmlowp_output_stage_info);
- ARM_COMPUTE_ERROR_ON(status.error_code() != ErrorCode::OK);
-
- GEMMInfo gemm_info;
- gemm_info.set_gemmlowp_output_stage(gemmlowp_output_stage_info);
- gemm_info.set_activation_info(act);
- _mm_gemmlowp.configure(input, weights, biases, output, gemm_info);
-
-    // Restore the QuantizationInfo, as input and weights could be reused by other fully connected layers
- input->info()->set_quantization_info(input_quantization_info);
- weights->info()->set_quantization_info(weights_quantization_info);
- }
- else
- {
- // Configure matrix multiply kernel
- GEMMInfo gemm_info(false, false, true /* Reshape weights only for the first run */);
- gemm_info.set_activation_info(act);
- _mm_gemm.configure(input, weights, biases, output, 1.f, 1.0f, gemm_info);
- }
-}
-
-void NEFullyConnectedLayer::configure_conv_fc(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const ActivationLayerInfo &act)
-{
- ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
-
- // If the fully connected layer is called after a convolution layer, the input tensor must be linearized
-
- // Initialize output tensor for flatten
- TensorShape shape_flatten = compute_flatten_shape(input->info());
- _flatten_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_flatten));
-
- // Configure flatten kernel
- _memory_group.manage(&_flatten_output);
-
- _flatten.configure(input, &_flatten_output);
-
- // Configure matrix multiply kernel
- configure_mm(&_flatten_output, weights, biases, output, act);
-
- // Allocate the output tensor for flatten once all the configure methods have been called
- _flatten_output.allocator()->allocate();
-}
-
-void NEFullyConnectedLayer::configure_fc_fc(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const ActivationLayerInfo &act)
+NEFullyConnectedLayer::NEFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager,
+ IWeightsManager *weights_manager)
+ : _impl(std::make_unique<Impl>())
{
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));
-
- // Configure matrix multiply kernel
- configure_mm(input, weights, biases, output, act);
+ _impl->memory_group = MemoryGroup(std::move(memory_manager));
+ _impl->weights_manager = weights_manager;
}
-void NEFullyConnectedLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output,
- FullyConnectedLayerInfo fc_info)
+void NEFullyConnectedLayer::configure(const ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ FullyConnectedLayerInfo fc_info,
+ const WeightsInfo &weights_info)
{
// Perform validate step
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedLayer::validate(input->info(),
- weights->info(),
+ ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedLayer::validate(input->info(), weights->info(),
biases != nullptr ? biases->info() : nullptr,
- output->info(),
- fc_info));
+ output->info(), fc_info, weights_info));
+ ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, fc_info);
- _are_weights_converted = true;
- _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
- _is_fc_after_conv = true;
- _is_quantized_asymmetric = is_data_type_quantized_asymmetric(input->info()->data_type());
- _original_weights = weights;
+ _impl->op = std::make_unique<cpu::CpuFullyConnected>();
+ _impl->original_weights = weights;
+ _impl->is_prepared = false;
- if(_weights_manager)
- {
- _weights_manager->manage(weights);
- }
-
- // With the Fully Connected layer we can have 4 different cases:
- // 1) Convolution layer -> Fully Connected layer without batches
- // 2) Fully Connected layer -> Fully Connected layer without batches
- // 3) Convolution layer -> Fully Connected layer with batches
- // 4) Fully Connected layer -> Fully Connected layer with batches
-
- const ITensor *weights_to_use = weights;
-
- // Check if we have a fully connected layer with batches
- const bool is_batched_fc_layer = output->info()->dimension(1) > 1;
- if(is_batched_fc_layer)
- {
- _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->info()->tensor_shape().cbegin() + 3,
- input->info()->tensor_shape().cend(),
- output->info()->tensor_shape().cbegin() + 1));
- }
- else
- {
- _is_fc_after_conv = input->info()->num_dimensions() > 1;
- }
-
- // Reshape weights if needed
- if(!_are_weights_reshaped)
- {
- if(_weights_manager && _weights_manager->are_weights_managed(weights))
- {
- _reshape_weights_managed_function.configure(weights);
- weights_to_use = _weights_manager->acquire(weights, &_reshape_weights_managed_function);
- }
- else
- {
- // Reshape the weights
- _reshape_weights_function.configure(weights, &_reshape_weights_output);
- weights_to_use = &_reshape_weights_output;
- }
- }
+ _impl->op->configure(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(),
+ fc_info, weights_info);
- // Convert weights if needed
- if(_is_fc_after_conv && (input->info()->data_layout() != fc_info.weights_trained_layout))
+ if (_impl->weights_manager != nullptr)
{
- if(_weights_manager && _weights_manager->are_weights_managed(weights_to_use))
- {
- _convert_weights_managed.configure(weights_to_use,
- input->info()->tensor_shape(),
- fc_info.weights_trained_layout);
- weights_to_use = _weights_manager->acquire(weights, &_convert_weights_managed);
- }
- else
- {
- // Convert weights
- _convert_weights.configure(weights_to_use,
- &_converted_weights_output,
- input->info()->tensor_shape(),
- fc_info.weights_trained_layout);
-
- weights_to_use = &_converted_weights_output;
- }
- _are_weights_converted = false;
+ _impl->weights_manager->manage(_impl->original_weights);
}
- if(_is_fc_after_conv)
- {
- // Fully Connected layer after a Convolution Layer without batches
- configure_conv_fc(input, weights_to_use, biases, output, fc_info.activation_info);
- }
- else
- {
- // Fully Connected layer after a Fully Connected Layer without batches
- configure_fc_fc(input, weights_to_use, biases, output, fc_info.activation_info);
- }
+ _impl->aux_mem_req = _impl->op->workspace();
+ _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}};
+ _impl->workspace =
+ manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack);
- _are_weights_reshaped = _are_weights_reshaped || fc_info.retain_internal_weights;
+ _impl->dynamic_weights = !weights->info()->are_values_constant() && fc_info.transpose_weights &&
+ !fc_info.are_weights_reshaped && !fc_info.retain_internal_weights;
}
-Status NEFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
- FullyConnectedLayerInfo fc_info)
+Status NEFullyConnectedLayer::has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+ const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const FullyConnectedLayerInfo &fc_info,
+ const WeightsInfo &weights_info)
{
- ARM_COMPUTE_UNUSED(fc_info.retain_internal_weights);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
- ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
- ARM_COMPUTE_RETURN_ERROR_ON(biases != nullptr && biases->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ERROR_ON(fc_info.activation_info.enabled() && is_data_type_quantized(input->data_type()) && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU
- && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
-
- bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
- bool is_fc_after_conv = true;
-
- const ITensorInfo &flatten_input = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(input)));
- const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights)));
- const ITensorInfo &converted_weights = weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) : TensorInfo(*reshaped_weights.clone());
-
- // With the Fully Connected layer we can have 4 different cases:
- // 1) Convolution layer -> Fully Connected layer without batches
- // 2) Fully Connected layer -> Fully Connected layer without batches
- // 3) Convolution layer -> Fully Connected layer with batches
- // 4) Fully Connected layer -> Fully Connected layer with batches
-
- const ITensorInfo *input_to_use = input;
- const ITensorInfo *weights_to_use = weights;
-
- // Check if we have a fully connected layer with batches
- const bool is_batched_fc_layer = output->dimension(1) > 1;
-
- if(is_batched_fc_layer)
- {
- is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->tensor_shape().cbegin() + 3,
- input->tensor_shape().cend(),
- output->tensor_shape().cbegin() + 1));
- }
- else
- {
- is_fc_after_conv = input->num_dimensions() > 1;
- }
-
- if(!weights_reshaped)
- {
- // Validate reshape weights kernel
- ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(weights, &reshaped_weights));
- weights_to_use = &reshaped_weights;
- }
-
- if(is_fc_after_conv && (input->data_layout() != fc_info.weights_trained_layout))
- {
- // Validate convert weights kernel
- ARM_COMPUTE_RETURN_ON_ERROR(NEConvertFullyConnectedWeights::validate(weights_to_use,
- &converted_weights,
- input->tensor_shape(),
- fc_info.weights_trained_layout));
- weights_to_use = &converted_weights;
- }
-
- if(is_fc_after_conv)
- {
- // Fully Connected layer after a Convolution Layer without batches
- ARM_COMPUTE_RETURN_ERROR_ON((weights_to_use->dimension(1) != (input->dimension(0) * input->dimension(1) * input->dimension(2))));
-
- // Validate flatten kernel
- ARM_COMPUTE_RETURN_ON_ERROR(NEFlattenLayer::validate(input, &flatten_input));
- input_to_use = &flatten_input;
- }
- else
- {
- // Fully Connected layer after a Fully Connected Layer without batches
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1));
- }
- // Validate matrix multiply kernel
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(input_to_use, weights_to_use, biases, output, fc_info.activation_info));
+ return cpu::CpuFullyConnected::has_opt_impl(expected_weight_format, input, weights, biases, output, fc_info,
+ weights_info);
+}
- return Status{};
+Status NEFullyConnectedLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ FullyConnectedLayerInfo fc_info,
+ const WeightsInfo &weights_info)
+{
+ return cpu::CpuFullyConnected::validate(input, weights, biases, output, fc_info, weights_info);
}
void NEFullyConnectedLayer::run()
{
- prepare();
-
- MemoryGroupResourceScope scope_mg(_memory_group);
-
- // Linearize input if it comes from a convolutional layer
- if(_is_fc_after_conv)
+ if (!_impl->dynamic_weights)
{
- _flatten.run();
+ prepare();
}
- // Run matrix multiply
- if(_is_quantized_asymmetric)
- {
- _mm_gemmlowp.run();
- }
- else
- {
- _mm_gemm.run();
- }
+ MemoryGroupResourceScope scope_mg(_impl->memory_group);
+ _impl->op->run(_impl->run_pack);
}
void NEFullyConnectedLayer::prepare()
{
- if(!_is_prepared)
+ if (!_impl->is_prepared)
{
- if(!_weights_manager)
- {
- ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
- }
-
- auto release_unused = [](Tensor * w)
- {
- if(!w->is_used())
- {
- w->allocator()->free();
- }
- };
+ _impl->op->prepare(_impl->run_pack);
- // Pointer to current weights
- const ITensor *cur_weights = _original_weights;
+ // Release temporary tensors that are only used in prepare stage
+ release_temporaries<Tensor>(_impl->aux_mem_req, _impl->workspace);
+ _impl->is_prepared = true;
- // Reshape of the weights (happens only once)
- if(!_are_weights_reshaped)
+ // Handle weights managed infrastructure
+ if (_impl->weights_manager != nullptr && _impl->weights_manager->are_weights_managed(_impl->original_weights))
{
- if(_weights_manager && _weights_manager->are_weights_managed(_original_weights))
+            // Ensure that the weights (b) are marked as unused, and their memory released, only after the
+            // last function sharing them has finished its own prepare stage. When a function marks the
+            // original weights as unused, we pre-mark them in the weights manager and then mark them back
+            // to used so they are not released before their final reference.
+ const ITensor *original_b = _impl->original_weights;
+ if (!original_b->is_used())
{
- cur_weights = _weights_manager->run(cur_weights, &_reshape_weights_managed_function);
+ _impl->weights_manager->pre_mark_as_unused(original_b);
}
- else
- {
- // Reshape of the weights (happens only once)
- if(!_are_weights_reshaped)
- {
- // Run reshape weights kernel and mark weights as unused
- _reshape_weights_output.allocator()->allocate();
- _reshape_weights_function.run();
- }
- cur_weights->mark_as_unused();
- cur_weights = &_reshape_weights_output;
- }
- _are_weights_reshaped = true;
+ _impl->original_weights->mark_as_used();
+ _impl->weights_manager->release(_impl->original_weights);
}
-
- // Convert weights if needed (happens only once)
- if(!_are_weights_converted)
- {
- if(_weights_manager && _weights_manager->are_weights_managed(cur_weights))
- {
- _weights_manager->run(cur_weights, &_convert_weights_managed);
- }
- else
- {
- _converted_weights_output.allocator()->allocate();
- _convert_weights.run();
- cur_weights->mark_as_unused();
- }
-
- _are_weights_converted = true;
- }
-
- // Release reshaped weights if unused
- release_unused(&_reshape_weights_output);
-
- // Prepare GEMM prepare and release unused weights
- if(!_is_quantized_asymmetric)
- {
- _mm_gemm.prepare();
- }
-
- // Release converted weights if unused
- release_unused(&_reshape_weights_output);
- release_unused(&_converted_weights_output);
-
- _is_prepared = true;
}
}
} // namespace arm_compute
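
The quantized output-stage logic deleted above (get_quantized_asymmetric_output_min_max() and get_gemmlowp_output_stage_info()) now lives inside cpu::CpuFullyConnected, but the arithmetic is unchanged: the real rescale factor (input_scale * weights_scale) / output_scale is split into a Q0.31 fixed-point multiplier and a right shift, as the library does via quantization::calculate_quantized_multiplier(). A hedged scalar sketch of that decomposition, with made-up scales:

#include <cmath>
#include <cstdint>

// Decompose real ~= (quantized / 2^31) * 2^(-shift), gemmlowp-style.
void decompose_multiplier(float real, int32_t &quantized, int &shift)
{
    int         exponent = 0;
    const float mantissa = std::frexp(real, &exponent); // real = mantissa * 2^exponent, mantissa in [0.5, 1)
    shift     = -exponent;
    int64_t q = std::llround(static_cast<double>(mantissa) * (1ll << 31));
    if (q == (1ll << 31)) // mantissa rounded up to exactly 1.0: renormalize
    {
        q /= 2;
        --shift;
    }
    quantized = static_cast<int32_t>(q);
}

// Example: input scale 0.02, weights scale 0.005, output scale 0.1 gives
// real = 0.02 * 0.005 / 0.1 = 0.001 -> quantized ~= 1099511628, shift = 9.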
diff --git a/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp b/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp
index a8ce6b2bfc..f5b8b57dac 100644
--- a/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp
+++ b/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,32 +28,50 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h"
namespace arm_compute
{
NEFuseBatchNormalization::~NEFuseBatchNormalization() = default;
-NEFuseBatchNormalization::NEFuseBatchNormalization()
- : _fuse_bn_kernel()
+NEFuseBatchNormalization::NEFuseBatchNormalization() : _fuse_bn_kernel()
{
}
-void NEFuseBatchNormalization::configure(const ITensor *input_weights, const ITensor *bn_mean, const ITensor *bn_var,
- ITensor *fused_weights, ITensor *fused_bias,
- const ITensor *input_bias, const ITensor *bn_beta, const ITensor *bn_gamma,
- float epsilon, FuseBatchNormalizationType fbn_type)
+void NEFuseBatchNormalization::configure(const ITensor *input_weights,
+ const ITensor *bn_mean,
+ const ITensor *bn_var,
+ ITensor *fused_weights,
+ ITensor *fused_bias,
+ const ITensor *input_bias,
+ const ITensor *bn_beta,
+ const ITensor *bn_gamma,
+ float epsilon,
+ FuseBatchNormalizationType fbn_type)
{
+ ARM_COMPUTE_LOG_PARAMS(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma,
+ epsilon, fbn_type);
+
_fuse_bn_kernel = std::make_unique<NEFuseBatchNormalizationKernel>();
- _fuse_bn_kernel->configure(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
+ _fuse_bn_kernel->configure(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma,
+ epsilon, fbn_type);
}
-Status NEFuseBatchNormalization::validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
- const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
- const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma,
- float epsilon, FuseBatchNormalizationType fbn_type)
+Status NEFuseBatchNormalization::validate(const ITensorInfo *input_weights,
+ const ITensorInfo *bn_mean,
+ const ITensorInfo *bn_var,
+ const ITensorInfo *fused_weights,
+ const ITensorInfo *fused_bias,
+ const ITensorInfo *input_bias,
+ const ITensorInfo *bn_beta,
+ const ITensorInfo *bn_gamma,
+ float epsilon,
+ FuseBatchNormalizationType fbn_type)
{
- return NEFuseBatchNormalizationKernel::validate(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
+ return NEFuseBatchNormalizationKernel::validate(input_weights, bn_mean, bn_var, fused_weights, fused_bias,
+ input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
}
void NEFuseBatchNormalization::run()
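
The kernel configured here folds the batch-normalization statistics directly into the weights and bias of the preceding convolution or depthwise convolution (selected by fbn_type), so no separate normalization pass runs at inference time. Per output channel the standard fold is fused_w = w * gamma / sqrt(var + eps) and fused_b = (b - mean) * gamma / sqrt(var + eps) + beta; a scalar sketch of that arithmetic:

#include <cmath>

struct FusedParams
{
    float w; // fused weight
    float b; // fused bias
};

// Scalar batch-norm fold; the kernel applies the same formula across whole tensors.
FusedParams fuse_bn_sketch(float w, float b, float mean, float var, float beta, float gamma, float epsilon)
{
    const float scale = gamma / std::sqrt(var + epsilon);
    return {w * scale, (b - mean) * scale + beta};
}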
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index 7318c3e492..934a8250cc 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,382 +23,140 @@
*/
#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
-#include "src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
-#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/Tensor.h"
-#include <cmath>
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/cpu/operators/CpuGemm.h"
-using namespace arm_compute::misc::shape_calculator;
+using namespace arm_compute::experimental;
namespace arm_compute
{
-namespace
+struct NEGEMM::Impl
{
-cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info)
-{
- cpu::AsmGemmInfo asm_info;
- asm_info.method = cpu::AsmConvMethod::Im2Col;
- asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d();
- asm_info.depth_output_gemm3d = info.depth_output_gemm3d();
- asm_info.activation_info = info.activation_info();
+ MemoryGroup memory_group{};
+ IWeightsManager *weights_manager{nullptr};
- return asm_info;
-}
-} // namespace
+ std::unique_ptr<cpu::CpuGemm> op{nullptr};
+
+ const ITensor *original_b{nullptr};
+ bool is_prepared{false};
+
+ ITensorPack run_pack{};
+ ITensorPack prep_pack{};
+ WorkspaceData<Tensor> workspace{};
+ experimental::MemoryRequirements aux_mem_req{};
+};
NEGEMM::NEGEMM(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
- : _memory_group(memory_manager), _weights_manager(weights_manager), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _asm_glue(std::make_unique<cpu::CpuGemmAssemblyDispatch>()), _ma_kernel(),
- _alpha_scale_func(nullptr), _add_bias(), _activation_func(), _tmp_a(), _tmp_b(), _tmp_d(), _original_b(nullptr), _run_vector_matrix_multiplication(false), _run_alpha_scale(false),
- _run_addition(false), _run_bias_addition(false), _run_activation(false), _reshape_b_only_on_first_run(false), _is_prepared(false)
+ : _impl(std::make_unique<Impl>())
{
+ _impl->memory_group = MemoryGroup(std::move(memory_manager));
+ _impl->weights_manager = weights_manager;
}
NEGEMM::~NEGEMM() = default;
-void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta, const GEMMInfo &gemm_info)
+void NEGEMM::configure(const ITensor *a,
+ const ITensor *b,
+ const ITensor *c,
+ ITensor *d,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info)
{
- ARM_COMPUTE_ERROR_THROW_ON(NEGEMM::validate(a->info(), b->info(), (c != nullptr) ? c->info() : nullptr, d->info(), alpha, beta, gemm_info));
-
- const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
- const bool is_c_bias = gemm_info.reshape_b_only_on_first_run();
- bool run_optimised = bool(cpu::CpuGemmAssemblyDispatch::validate(a->info(), b->info(), (is_c_bias && c != nullptr) ? c->info() : nullptr, d->info(), asm_info));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d);
+ ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuGemm::validate(a->info(), b->info(), (c != nullptr) ? c->info() : nullptr,
+ d->info(), alpha, beta, gemm_info));
// Check if we need to reshape the matrix B only on the first run
- _is_prepared = false;
- _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
- _run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
- _original_b = b;
- _run_alpha_scale = alpha != 1.f;
- _run_bias_addition = c != nullptr && gemm_info.reshape_b_only_on_first_run();
- _run_addition = beta != 0 && c != nullptr && !gemm_info.reshape_b_only_on_first_run();
- _run_activation = gemm_info.activation_info().enabled() && (!run_optimised || (run_optimised && !cpu::CpuGemmAssemblyDispatch::is_activation_supported(gemm_info.activation_info())));
+ _impl->is_prepared = false;
+ _impl->original_b = b;
+ _impl->op = std::make_unique<cpu::CpuGemm>();
- if(run_optimised)
+    // Mark the values of matrix B as dynamic (non-constant).
+ auto b_info_to_use = b->info()->clone();
+ if (!gemm_info.reshape_b_only_on_first_run())
{
- const ITensor *c_to_use = is_c_bias ? c : nullptr;
- const ITensorInfo *c_info_to_use = c_to_use != nullptr ? c_to_use->info() : nullptr;
- _asm_glue->configure(a->info(), b->info(), c_info_to_use, d->info(), asm_info);
- ARM_COMPUTE_ERROR_ON(!_asm_glue->is_configured());
-
- _asm_glue_tensors =
- {
- { ACL_SRC_0, a },
- { ACL_SRC_1, b },
- { ACL_SRC_2, c_to_use },
- { ACL_DST, d },
- };
-
- // Scale product by alpha
- if(_run_alpha_scale)
- {
- _alpha_scale_func.configure(d, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, alpha, 0.f));
- }
+ b_info_to_use->set_are_values_constant(false);
}
- else
- {
- // Pick output tensor in case bias addition should be performed
- ITensor *gemm_output_to_use = d;
- if(_run_bias_addition)
- {
- gemm_output_to_use = &_tmp_d;
- _memory_group.manage(&_tmp_d);
- }
-
- _mm_kernel = std::make_unique<NEGEMMMatrixMultiplyKernel>();
-
- // Select between GEMV and GEMM
- if(_run_vector_matrix_multiplication)
- {
- // Configure the matrix multiply kernel
- _mm_kernel->configure(a, b, gemm_output_to_use, alpha, false);
- }
- else
- {
- TensorShape shape_tmp_a = a->info()->tensor_shape();
- TensorShape shape_tmp_b = b->info()->tensor_shape();
-
- shape_tmp_a.set(0, a->info()->dimension(0) * 4);
- shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.0f));
-
- const unsigned int transpose_w = 16 / data_size_from_type(b->info()->data_type());
- shape_tmp_b.set(0, b->info()->dimension(1) * transpose_w);
- shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / static_cast<float>(transpose_w)));
-
- TensorInfo info_a = a->info()->clone()->set_tensor_shape(shape_tmp_a).set_is_resizable(true);
- TensorInfo info_b = b->info()->clone()->set_tensor_shape(shape_tmp_b).set_is_resizable(true);
-
- _tmp_a.allocator()->init(info_a);
- _tmp_b.allocator()->init(info_b);
-
- // Manage intermediate buffers
- _memory_group.manage(&_tmp_a);
- if(!_reshape_b_only_on_first_run)
- {
- _memory_group.manage(&_tmp_b);
- }
- int m = a->info()->dimension(1);
- int n = b->info()->dimension(0);
- int k = a->info()->dimension(0);
+ _impl->op->configure(a->info(), b_info_to_use.get(), (c != nullptr) ? c->info() : nullptr, d->info(), alpha, beta,
+ gemm_info);
- // Configure interleave kernel
- _interleave_kernel = std::make_unique<NEGEMMInterleave4x4Kernel>();
- _interleave_kernel->configure(a, &_tmp_a);
-
- // Configure transpose kernel
- _transpose_kernel = std::make_unique<NEGEMMTranspose1xWKernel>();
- _transpose_kernel->configure(b, &_tmp_b);
-
- // Configure matrix multiplication kernel
- _mm_kernel->configure(&_tmp_a, &_tmp_b, gemm_output_to_use, alpha, true, GEMMReshapeInfo(m, n, k));
-
-            // Allocate once all the configure methods have been called
- _tmp_a.allocator()->allocate();
- if(!_reshape_b_only_on_first_run)
- {
- _tmp_b.allocator()->allocate();
- }
- }
-
- if(_run_bias_addition)
- {
- _add_bias.configure(gemm_output_to_use, c, d, ConvertPolicy::SATURATE);
- _tmp_d.allocator()->allocate();
- }
- }
-
- // Configure matrix addition kernel
- if(_run_addition)
- {
- _ma_kernel = std::make_unique<NEGEMMMatrixAdditionKernel>();
- _ma_kernel->configure(c, d, beta);
- }
-
- // Configure activation
- const ActivationLayerInfo &activation = gemm_info.activation_info();
- if(_run_activation)
- {
- _activation_func.configure(d, nullptr, activation);
- }
+ _impl->aux_mem_req = _impl->op->workspace();
+ _impl->run_pack = {{ACL_SRC_0, a}, {ACL_SRC_1, b}, {ACL_SRC_2, c}, {ACL_DST, d}};
+ _impl->prep_pack = {{ACL_SRC_1, b}, {ACL_SRC_2, c}};
+ _impl->workspace =
+ manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
}
-Status NEGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+Status NEGEMM::validate(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info)
{
- ARM_COMPUTE_UNUSED(alpha);
- const bool is_c_bias = gemm_info.reshape_b_only_on_first_run();
-
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(a);
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(a);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::BFLOAT16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(0) != b->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
- if(a->data_type() != DataType::BFLOAT16)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, output);
- }
-
- if(c != nullptr && !is_c_bias)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.depth_output_gemm3d() != 0);
- ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.reinterpret_input_as_3d());
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(c, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->dimension(1), "The C matrix must have the same number of rows as the matrix A");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != c->dimension(0), "The C matrix must have the same number of columns as the matrix B");
- }
-
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
- if(gemm_info.depth_output_gemm3d() != 0)
- {
- if(gemm_info.reinterpret_input_as_3d())
- {
- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2));
- }
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
- }
- }
-
- // Check if we need to run the optimized assembly kernel
- cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
- const bool run_optimised = bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, is_c_bias ? c : nullptr, output, asm_info));
-
- if(!run_optimised)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.reinterpret_input_as_3d(), "NEGEMM cannot reinterpret the input tensor as 3D");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 0, "NEGEMM cannot reinterpret the output tensor as 3D");
-
- // Check if the first input tensor is a vector.
- const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
- // Check if we need to reshape the matrix A and matrix B
- const bool run_interleave_transpose = !run_vector_matrix_multiplication && !(gemm_info.reshape_b_only_on_first_run());
-
-        // Arguments used by GEMMReshapeInfo
-        // If we pass matrix A and matrix B reshaped to NEGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to GEMMReshapeInfo
-        // so that it knows how the matrices have been reshaped
- const int m = a->dimension(1);
- const int n = b->dimension(0);
- const int k = a->dimension(0);
- int mult_transpose1xW_width = 1;
- int mult_interleave4x4_height = 1;
-
- const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, gemm_info.depth_output_gemm3d());
-
- const ITensorInfo *matrix_a_info = a;
- const ITensorInfo *matrix_b_info = b;
-
- TensorInfo tmp_a_info{};
- TensorInfo tmp_b_info{};
- TensorInfo tmp_output_info = *output->clone();
-
- if(run_interleave_transpose)
- {
- matrix_a_info = &tmp_a_info;
- matrix_b_info = &tmp_b_info;
-
- // Validate interleave kernel
- auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_interleaved_shape(*a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d())));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &tmp_a_info));
-
- // Validate transpose kernel
- auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*b, mult_transpose1xW_width)));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &tmp_b_info));
- }
-
- // Validate matrix multiply
- auto_init_if_empty(tmp_output_info, matrix_a_info->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, run_interleave_transpose, reshape_info)));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &tmp_output_info, alpha, run_interleave_transpose, reshape_info));
-
- if(c != nullptr && gemm_info.reshape_b_only_on_first_run())
- {
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&tmp_output_info, c, output, ConvertPolicy::SATURATE));
- }
- }
-
- // Validate matrix addition kernel
- if(beta != 0 && c != nullptr && !is_c_bias)
+    // Mark the values of matrix B as dynamic (non-constant).
+ auto b_to_use = b->clone();
+ if (!gemm_info.reshape_b_only_on_first_run())
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixAdditionKernel::validate(c, output, beta));
+ b_to_use->set_are_values_constant(false);
}
- // Validate activation
- const ActivationLayerInfo &activation = gemm_info.activation_info();
- if(activation.enabled())
- {
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, activation));
- }
+ return cpu::CpuGemm::validate(a, b_to_use.get(), c, output, alpha, beta, gemm_info);
+}
- return Status{};
+Status NEGEMM::has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+ const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info)
+{
+ ARM_COMPUTE_UNUSED(alpha, beta);
+ return cpu::CpuGemm::has_opt_impl(expected_weight_format, a, b, c, output, gemm_info);
}
void NEGEMM::run()
{
prepare();
- MemoryGroupResourceScope scope_mg(_memory_group);
-
- if(_asm_glue->is_configured())
- {
- _asm_glue->run(_asm_glue_tensors);
- if(_run_alpha_scale)
- {
- _alpha_scale_func.run();
- }
- }
- else
- {
- if(!_run_vector_matrix_multiplication)
- {
- // Run interleave kernel
- NEScheduler::get().schedule(_interleave_kernel.get(), Window::DimY);
-
- if(!_reshape_b_only_on_first_run)
- {
- // Run transpose kernel
- NEScheduler::get().schedule(_transpose_kernel.get(), Window::DimY);
- }
- }
-
- NEScheduler::get().schedule(_mm_kernel.get(), _run_vector_matrix_multiplication ? Window::DimX : Window::DimY);
-
- // Run bias addition kernel
- if(_run_bias_addition)
- {
- _add_bias.run();
- }
- }
-
- // Run matrix addition kernel
- if(_run_addition)
- {
- NEScheduler::get().schedule(_ma_kernel.get(), Window::DimY);
- }
-
- // Run activation function
- if(_run_activation)
- {
- _activation_func.run();
- }
+ MemoryGroupResourceScope scope_mg(_impl->memory_group);
+ _impl->op->run(_impl->run_pack);
}
void NEGEMM::prepare()
{
- if(!_is_prepared)
+ if (!_impl->is_prepared)
{
- const bool original_b_managed_by_weights_manager = _weights_manager && _weights_manager->are_weights_managed(_original_b);
- if(_asm_glue->is_configured())
- {
- if(!original_b_managed_by_weights_manager)
- {
- ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
- }
+ _impl->op->prepare(_impl->prep_pack);
+
+ auto has_reshape =
+ std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(),
+ [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; });
- _asm_glue->prepare(_asm_glue_tensors);
- if(!original_b_managed_by_weights_manager)
- {
- _original_b->mark_as_unused();
- }
+ if (has_reshape != std::end(_impl->aux_mem_req))
+ {
+ _impl->original_b->mark_as_unused();
}
- else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue->is_configured())
+ else
{
- if(!original_b_managed_by_weights_manager)
- {
- ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
- }
-
- _tmp_b.allocator()->allocate();
- NEScheduler::get().schedule(_transpose_kernel.get(), Window::DimY);
- if(!original_b_managed_by_weights_manager)
- {
- _original_b->mark_as_unused();
- }
+ _impl->run_pack.add_const_tensor(ACL_SRC_1, _impl->original_b);
}
- _is_prepared = true;
+ // Release temporary tensors that are only used in prepare stage
+ release_temporaries<Tensor>(_impl->aux_mem_req, _impl->workspace);
+ _impl->is_prepared = true;
}
}
} // namespace arm_compute
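
Note on the refactored NEGEMM above: the public interface is unchanged; the function now only forwards to cpu::CpuGemm through ITensorPacks. A minimal usage sketch, assuming F32 tensors and no bias (the shapes and names below are illustrative, not part of the patch):

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/NEON/functions/NEGEMM.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void sgemm_example()
    {
        // D = alpha * A * B + beta * C; TensorShape is width-first, so A below is 32x64
        Tensor a, b, d;
        a.allocator()->init(TensorInfo(TensorShape(64U, 32U), 1, DataType::F32));
        b.allocator()->init(TensorInfo(TensorShape(16U, 64U), 1, DataType::F32));
        d.allocator()->init(TensorInfo(TensorShape(16U, 32U), 1, DataType::F32));

        NEGEMM gemm;
        gemm.configure(&a, &b, nullptr, &d, 1.f, 0.f); // throws if cpu::CpuGemm::validate() fails

        a.allocator()->allocate();
        b.allocator()->allocate();
        d.allocator()->allocate();
        // ... fill a and b ...

        gemm.run(); // first call runs prepare(), then CpuGemm::run() on the run pack
    }
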
diff --git a/src/runtime/NEON/functions/NEGEMMConv2d.cpp b/src/runtime/NEON/functions/NEGEMMConv2d.cpp
index 564ce2f514..6cca02eea9 100644
--- a/src/runtime/NEON/functions/NEGEMMConv2d.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConv2d.cpp
@@ -24,50 +24,93 @@
#include "arm_compute/runtime/NEON/functions/NEGEMMConv2d.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/runtime/cpu/operators/CpuGemmDirectConv2d.h"
+#include "arm_compute/runtime/Tensor.h"
-#include <set>
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/cpu/operators/CpuGemmDirectConv2d.h"
namespace arm_compute
{
using OperatorType = cpu::CpuGemmDirectConv2d;
+using namespace arm_compute::experimental;
struct NEGEMMConv2d::Impl
{
- ITensorPack tensors{};
- std::unique_ptr<OperatorType> op{ nullptr };
+ const ITensor *weights{nullptr};
+ std::unique_ptr<OperatorType> op{nullptr};
+ ITensorPack run_pack{};
+ ITensorPack prep_pack{};
+ WorkspaceData<Tensor> workspace{};
+ MemoryGroup memory_group{};
+ bool is_prepared{false};
+ experimental::MemoryRequirements aux_mem_req{};
};
-NEGEMMConv2d::NEGEMMConv2d(const std::shared_ptr<IMemoryManager> &memory_manager)
- : _impl(std::make_unique<Impl>())
+NEGEMMConv2d::NEGEMMConv2d(const std::shared_ptr<IMemoryManager> &memory_manager) : _impl(std::make_unique<Impl>())
{
- _impl->op = std::make_unique<OperatorType>(memory_manager);
+ _impl->memory_group = MemoryGroup(memory_manager);
}
NEGEMMConv2d::~NEGEMMConv2d() = default;
-void NEGEMMConv2d::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv2dInfo &info)
+void NEGEMMConv2d::configure(
+ ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv2dInfo &info)
{
- _impl->tensors.add_const_tensor(TensorType::ACL_SRC_0, input);
- _impl->tensors.add_const_tensor(TensorType::ACL_SRC_1, weights);
- _impl->tensors.add_const_tensor(TensorType::ACL_SRC_2, biases);
- _impl->tensors.add_tensor(TensorType::ACL_DST, output);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- _impl->op->configure(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), info);
+ _impl->weights = weights;
+ _impl->is_prepared = false;
+ _impl->op = std::make_unique<OperatorType>();
+
+ _impl->op->configure(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(),
+ info);
+
+ _impl->aux_mem_req = _impl->op->workspace();
+ _impl->run_pack = {{TensorType::ACL_SRC_0, input}, {TensorType::ACL_SRC_2, biases}, {TensorType::ACL_DST, output}};
+ _impl->prep_pack = {{TensorType::ACL_SRC_1, weights}, {TensorType::ACL_SRC_2, biases}};
+ _impl->workspace =
+ manage_workspace<Tensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack);
}
-Status NEGEMMConv2d::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const Conv2dInfo &info)
+Status NEGEMMConv2d::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const Conv2dInfo &info)
{
return OperatorType::validate(input, weights, biases, output, info);
}
+
void NEGEMMConv2d::run()
{
- _impl->op->run(_impl->tensors);
+ prepare();
+
+ MemoryGroupResourceScope scope_mg(_impl->memory_group);
+ _impl->op->run(_impl->run_pack);
}
+
void NEGEMMConv2d::prepare()
{
- _impl->op->prepare(_impl->tensors);
+ if (!_impl->is_prepared)
+ {
+ _impl->op->prepare(_impl->prep_pack);
+
+ auto has_reshape =
+ std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(),
+ [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; });
+
+ if (has_reshape != std::end(_impl->aux_mem_req))
+ {
+ _impl->weights->mark_as_unused();
+ }
+ else
+ {
+ _impl->run_pack.add_const_tensor(ACL_SRC_1, _impl->weights);
+ }
+
+ // Release temporary tensors that are only used in prepare stage
+ release_temporaries<Tensor>(_impl->aux_mem_req, _impl->workspace);
+ _impl->is_prepared = true;
+ }
}
} // namespace arm_compute
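
The prepare() bodies in NEGEMM and NEGEMMConv2d above follow one pattern; a condensed sketch for reference (keeps_reshaped_copy is an illustrative name, and the std::any_of form is assumed equivalent to the std::find_if used in the patch):

    #include <algorithm>

    // One-off preparation: let the operator reshape its constants, then decide
    // whether the caller's weights tensor is still needed at run time.
    op->prepare(prep_pack);
    const bool keeps_reshaped_copy =
        std::any_of(aux_mem_req.begin(), aux_mem_req.end(),
                    [](const MemoryInfo &m) { return m.lifetime == MemoryLifetime::Persistent; });
    if (keeps_reshaped_copy)
    {
        weights->mark_as_unused(); // operator holds a persistent reshaped copy
    }
    else
    {
        run_pack.add_const_tensor(ACL_SRC_1, weights); // weights consumed on every run
    }
    release_temporaries<Tensor>(aux_mem_req, workspace); // frees prepare-only buffers
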
diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
index 2876c254fa..c8f65d2fd9 100644
--- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,617 +26,109 @@
#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/Tensor.h"
-#include "src/core/NEON/kernels/NECol2ImKernel.h"
-#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
-#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
-#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
-#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
-#include "src/core/NEON/kernels/NEIm2ColKernel.h"
-#include "src/core/NEON/kernels/NEWeightsReshapeKernel.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/cpu/operators/CpuGemmConv2d.h"
-#include <set>
-#include <tuple>
+using namespace arm_compute::experimental;
namespace arm_compute
{
-using namespace arm_compute::misc::shape_calculator;
-
-NEConvolutionLayerReshapeWeights::~NEConvolutionLayerReshapeWeights() = default;
-NEConvolutionLayerReshapeWeights::NEConvolutionLayerReshapeWeights() noexcept
- : _weights_reshape_kernel()
-{
-}
-
-void NEConvolutionLayerReshapeWeights::configure(const ITensor *weights, const ITensor *biases, ITensor *output)
-{
- // Perform validation step
- ARM_COMPUTE_ERROR_ON_NULLPTR(weights, output);
- ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayerReshapeWeights::validate(weights->info(),
- (biases != nullptr) ? biases->info() : nullptr,
- output->info()));
- const bool append_biases = (biases != nullptr) && !is_data_type_quantized_asymmetric(weights->info()->data_type());
- const ITensor *biases_to_use = (append_biases) ? biases : nullptr;
-
- _weights_reshape_kernel = std::make_unique<NEWeightsReshapeKernel>();
- _weights_reshape_kernel->configure(weights, biases_to_use, output);
-
- output->info()->set_quantization_info(weights->info()->quantization_info());
-}
-
-Status NEConvolutionLayerReshapeWeights::validate(const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output)
+struct NEGEMMConvolutionLayer::Impl
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(weights);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1,
- DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL,
- DataType::BFLOAT16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
-
- if(biases != nullptr)
- {
- const int idx_kernels = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES);
- ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(weights->data_type()));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
- ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(idx_kernels));
- ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
- }
-
- if((output != nullptr) && (output->total_size() != 0))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, output);
-
- NEWeightsReshapeKernel::validate(weights, biases, output);
- }
-
- return Status{};
-}
-
-void NEConvolutionLayerReshapeWeights::run()
+ const ITensor *weights{nullptr};
+ std::unique_ptr<cpu::CpuGemmConv2d> op{nullptr};
+ ITensorPack run_pack{};
+ MemoryGroup memory_group{};
+ IWeightsManager *weights_manager{nullptr};
+ MemoryRequirements aux_mem_req{};
+ WorkspaceData<Tensor> workspace_tensors{};
+ bool is_prepared{false};
+};
+
+NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager,
+ IWeightsManager *weights_manager)
+ : _impl(std::make_unique<Impl>())
{
- NEScheduler::get().schedule(_weights_reshape_kernel.get(), 3);
+ _impl->weights_manager = weights_manager;
+ _impl->memory_group = MemoryGroup(memory_manager);
}
-
NEGEMMConvolutionLayer::~NEGEMMConvolutionLayer() = default;
-NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager, IWeightsManager *weights_manager)
- : _memory_group(memory_manager), _weights_manager(weights_manager), _reshape_weights(), _reshape_weights_managed(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager),
- _col2im_kernel(), _reshape_layer(), _original_weights(nullptr), _original_output(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _gemm_output_3d(), _tmp_output(),
- _data_layout(DataLayout::NCHW), _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _is_prepared(false)
+void NEGEMMConvolutionLayer::configure(const ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math,
+ unsigned int num_groups)
{
-}
-
-void NEGEMMConvolutionLayer::configure_mm(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const ActivationLayerInfo &act_info, int gemm_3d_depth)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
- ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output == nullptr ? nullptr : output->info(),
- act_info, gemm_3d_depth, _skip_im2col));
-
- // Create GEMMInfo structure
- const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
- gemm_3d_depth, _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
- false, GEMMLowpOutputStageInfo(), false, false, act_info);
-
- // Supported activations in GEMM
- const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
- ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
- ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
- };
-
- if(_is_quantized)
- {
-        // Since we need negative offsets for computing convolution, we change the QuantizationInfo()
-        // Extract and negate the input and weights offsets
- const QuantizationInfo iqinfo = input->info()->quantization_info();
- const QuantizationInfo wqinfo = weights->info()->quantization_info();
- const QuantizationInfo oqinfo = (output->info()->total_size() == 0) ? iqinfo : output->info()->quantization_info();
- const UniformQuantizationInfo uiqinfo = iqinfo.uniform();
- const UniformQuantizationInfo uoqinfo = oqinfo.uniform();
- const DataType data_type = input->info()->data_type();
-
- input->info()->set_quantization_info(QuantizationInfo(uiqinfo.scale, -uiqinfo.offset));
- if(!is_data_type_quantized_per_channel(weights->info()->data_type()))
- {
- const UniformQuantizationInfo uwqinfo = wqinfo.uniform();
- weights->info()->set_quantization_info(QuantizationInfo(uwqinfo.scale, -uwqinfo.offset));
- }
-
- // Merge activation with output stage
- PixelValue type_min{};
- PixelValue type_max{};
- std::tie(type_min, type_max) = get_min_max(data_type);
- int32_t min_activation = type_min.get<int32_t>();
- int32_t max_activation = type_max.get<int32_t>();
-
- if(supported_acts.count(act_info.activation()) != 0)
- {
- std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, uoqinfo);
- }
-
- GEMMLowpOutputStageInfo output_info;
- output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
- output_info.gemmlowp_offset = uoqinfo.offset;
- output_info.gemmlowp_min_bound = min_activation;
- output_info.gemmlowp_max_bound = max_activation;
- output_info.is_quantized_per_channel = (weights->info()->data_type() == DataType::QSYMM8_PER_CHANNEL);
- quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, output_info);
-
- _mm_gemmlowp.configure(input, weights, biases, output, GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info, false, false, act_info));
-
-        // Restore the QuantizationInfo of input and weights as they could be used in other convolution layers
- input->info()->set_quantization_info(iqinfo);
- weights->info()->set_quantization_info(wqinfo);
- }
- else
- {
- // Configure matrix multiply function
- _mm_gemm.configure(input, weights, biases, output, 1.0f, 0.0f, gemm_info);
- }
-}
-
-Status NEGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
- const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col)
-{
- const DataType data_type = input->data_type();
- const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
- const bool is_activation_enabled = act_info.enabled();
-
- // Create GEMMInfo structure
- const GEMMInfo gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
- gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
- false, GEMMLowpOutputStageInfo(), false, false, act_info);
-
- if(is_quantized)
- {
-        // Since we need negative offsets for computing convolution, we change the QuantizationInfo()
-        // Extract and negate the input and weights offsets
- const QuantizationInfo &iqinfo = input->quantization_info();
- const QuantizationInfo &wqinfo = weights->quantization_info();
- const QuantizationInfo &oqinfo = (output->total_size() == 0) ? iqinfo : output->quantization_info();
- const UniformQuantizationInfo uoqinfo = oqinfo.uniform();
-
- // Merge activation with output stage
- PixelValue type_min{};
- PixelValue type_max{};
- std::tie(type_min, type_max) = get_min_max(data_type);
- int32_t min_activation = type_min.get<int32_t>();
- int32_t max_activation = type_max.get<int32_t>();
-
- const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
- ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
- ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
- };
- if(is_activation_enabled && supported_acts.count(act_info.activation()) != 0)
- {
- std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, uoqinfo);
- }
-
- GEMMLowpOutputStageInfo output_info;
- output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
- output_info.gemmlowp_offset = uoqinfo.offset;
- output_info.gemmlowp_min_bound = min_activation;
- output_info.gemmlowp_max_bound = max_activation;
- output_info.is_quantized_per_channel = (weights->data_type() == DataType::QSYMM8_PER_CHANNEL);
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, output_info));
-
- // Perform validation step on GEMMLowp
- std::unique_ptr<ITensorInfo> input_qa = input->clone();
- std::unique_ptr<ITensorInfo> weights_qa = weights->clone();
- input_qa->set_quantization_info(QuantizationInfo(iqinfo.uniform().scale, -iqinfo.uniform().offset));
- weights_qa->set_quantization_info(QuantizationInfo(wqinfo.uniform().scale, -wqinfo.uniform().offset));
- return NEGEMMLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, output, GEMMInfo(false, false, true, gemm_3d_depth, skip_im2col, false, output_info, false, false, act_info));
- }
- else
- {
- // Perform validation step on Matrix multiply function
- return NEGEMM::validate(input, weights, nullptr, output, 1.0f, 0.0f, gemm_info);
- }
-}
-
-Status NEGEMMConvolutionLayer::validate_gemm3d(const ITensorInfo *input_info, const ITensorInfo *weights_info, const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col)
-{
- const DataType data_type = input_info->data_type();
- const unsigned int mult_y = skip_im2col ? 1U : gemm_3d_depth;
- const unsigned int mult_z = skip_im2col ? gemm_3d_depth : 1U;
-
- // Set dummy tensor shapes for the validation
- const TensorInfo dummy_input_info(TensorShape(4U, 4U * mult_y, 1U * mult_z), 1, data_type, input_info->quantization_info());
- const TensorInfo dummy_weights_info(TensorShape(4U, 4U), 1, data_type, weights_info->quantization_info());
- const TensorInfo dummy_output_info(TensorShape(4U, 4U, gemm_3d_depth), 1, data_type, input_info->quantization_info());
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- return validate_mm(&dummy_input_info, &dummy_weights_info, nullptr, &dummy_output_info, act_info, gemm_3d_depth, skip_im2col);
+ _impl->weights = weights;
+ _impl->op = std::make_unique<cpu::CpuGemmConv2d>();
+ _impl->op->configure(input->info(), weights->info(), (biases != nullptr ? biases->info() : nullptr), output->info(),
+ conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups);
+
+ _impl->run_pack = {{TensorType::ACL_SRC_0, input},
+ {TensorType::ACL_SRC_1, weights},
+ {TensorType::ACL_SRC_2, biases},
+ {TensorType::ACL_DST, output}};
+ _impl->aux_mem_req = _impl->op->workspace();
+ _impl->workspace_tensors =
+ manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack);
}
-void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
- const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
+Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math,
+ unsigned int num_groups)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_UNUSED(num_groups, weights_info);
- ARM_COMPUTE_ERROR_THROW_ON(NEGEMMConvolutionLayer::validate(input->info(),
- weights->info(),
- biases != nullptr ? biases->info() : nullptr,
- output->info(),
- conv_info,
- weights_info,
- dilation,
- act_info,
- num_groups));
-
- const DataType data_type = input->info()->data_type();
- const DataLayout data_layout = input->info()->data_layout();
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const int idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
-
- const unsigned int kernel_width = weights->info()->dimension(idx_width);
- const unsigned int kernel_height = weights->info()->dimension(idx_height);
-
- _is_prepared = weights_info.retain_internal_weights();
- _original_weights = weights;
- _original_output = output;
- _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
- _data_layout = data_layout;
- _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
-
- const ITensor *gemm_input_to_use = input;
- ITensor *gemm_output_to_use = output;
-
- // Get convolved dimensions
- unsigned int conv_w = 0;
- unsigned int conv_h = 0;
- std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(idx_width),
- input->info()->dimension(idx_height),
- kernel_width,
- kernel_height,
- conv_info,
- dilation);
-
- // Check if GEMM3D is supported
- if(data_layout == DataLayout::NHWC)
- {
- _skip_col2im = bool(validate_gemm3d(input->info(), weights->info(), act_info, conv_h, true));
- // If not supported, we need to perform im2col and col2im (or reshape layer)
- if(!_skip_col2im)
- {
- _skip_im2col = false;
- }
- }
- else
- {
- _skip_col2im = false;
- }
-
- // Get parameters from conv_info
- unsigned int stride_x = 0;
- unsigned int stride_y = 0;
- std::tie(stride_x, stride_y) = conv_info.stride();
-
- unsigned int mat_weights_cols = weights->info()->dimension(idx_kernels);
-
- // _weights_reshaped will be auto configured in the kernel.
- // Just append biases and do not transpose 1xW as it will be reshaped in NEGEMM
- const ITensor *weights_to_use = weights;
-
- if(_weights_manager && _weights_manager->are_weights_managed(weights))
- {
- _reshape_weights_managed.configure(weights, nullptr);
- weights_to_use = _weights_manager->acquire(weights, &_reshape_weights_managed);
- }
- else
- {
- _reshape_weights.configure(weights, nullptr, &_weights_reshaped);
- weights_to_use = &_weights_reshaped;
- }
-
- // Create tensor to store im2col reshaped inputs
- if(!_skip_im2col)
- {
- _memory_group.manage(&_im2col_output);
-
- // Configure
- _im2col_kernel = std::make_unique<NEIm2ColKernel>();
- _im2col_kernel->configure(input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, false, dilation);
-
- // Update GEMM input
- gemm_input_to_use = &_im2col_output;
- }
-
- // Create temporary GEMM output tensor in case we cannot skip col2im
- const DataType output_data_type = data_type == DataType::BFLOAT16 ? DataType::F32 : data_type;
- if(!_skip_col2im)
- {
- TensorShape shape_gemm;
-
- // Calculate GEMM output shape
- shape_gemm = _im2col_output.info()->tensor_shape();
- shape_gemm.set(0, mat_weights_cols);
- shape_gemm.set(1, conv_w * conv_h);
-
- // FIXME: input->clone() doesn't work with subtensors for grouped convolutions.
- TensorInfo info_gemm(shape_gemm, 1, output_data_type);
- info_gemm.set_quantization_info(output->info()->quantization_info()).set_data_layout(input->info()->data_layout());
- _gemm_output.allocator()->init(info_gemm);
- _gemm_output_3d.allocator()->init(info_gemm);
- _memory_group.manage(&_gemm_output);
-
- // Update GEMM output
- gemm_output_to_use = &_gemm_output;
- }
- else
- {
- TensorInfo out_info{ *output->info() };
- out_info.set_data_type(output_data_type).set_data_layout(input->info()->data_layout()).set_is_resizable(true);
- _gemm_output.allocator()->init(out_info);
- _gemm_output_3d.allocator()->init(out_info);
- _memory_group.manage(&_gemm_output);
-
- // Update GEMM output
- gemm_output_to_use = &_gemm_output_3d;
- }
-
- // Configure GEMM
-    // In case we need to skip col2im, GEMM3D (gemm_3d_depth != 0) must be used in order to avoid reshaping the output matrix
- const unsigned int gemm_3d_depth = _skip_col2im ? conv_h : 0;
- configure_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, act_info, gemm_3d_depth);
-
- if(!_skip_im2col)
- {
- _im2col_output.allocator()->allocate();
- }
-
- if(!_skip_col2im)
- {
- if(_data_layout == DataLayout::NCHW)
- {
- // Configure col2im
- _col2im_kernel = std::make_unique<NECol2ImKernel>();
- _col2im_kernel->configure(gemm_output_to_use, output, Size2D(conv_w, conv_h));
- }
- else
- {
- // Configure reshape layer
- _reshape_layer.configure(gemm_output_to_use, output);
- }
- }
- else
- {
- // Configure reshape layer
- _reshape_layer.configure(gemm_output_to_use, output);
- }
-
- if(_is_quantized && !_skip_col2im)
- {
- _tmp_output.allocator()->allocate();
- }
-
- _gemm_output.allocator()->allocate();
-
- ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(idx_width) != conv_w) || (output->info()->dimension(idx_height) != conv_h),
- "Output shape does not match the expected one");
+ return cpu::CpuGemmConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info,
+ enable_fast_math, num_groups);
}
-Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
+Status NEGEMMConvolutionLayer::has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+ const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ const bool enable_fast_math)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!");
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups > 1, "Grouping (num_groups != 1) is not supported");
-
- const DataLayout data_layout = input->data_layout();
- const DataType data_type = input->data_type();
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
- const int idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
-
- const unsigned int kernel_width = weights->dimension(idx_width);
- const unsigned int kernel_height = weights->dimension(idx_height);
-
- TensorInfo im2col_reshaped_info{};
- TensorInfo info_gemm{};
- TensorInfo tmp_info{};
- TensorInfo weights_reshaped_info{};
- const ITensorInfo *gemm_input_to_use = input;
- const ITensorInfo *gemm_output_to_use = output;
- const ITensorInfo *weights_to_use = weights;
-
- const bool append_bias = false;
- const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
- const bool is_bf16 = data_type == DataType::BFLOAT16;
- bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
-
- // Get convolved dimensions
- unsigned int conv_w = 0;
- unsigned int conv_h = 0;
-
- std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(idx_width),
- input->dimension(idx_height),
- kernel_width,
- kernel_height,
- conv_info,
- dilation);
-
- // Check if GEMM3D is supported
- bool skip_col2im = false;
- if(data_layout == DataLayout::NHWC)
- {
- skip_col2im = bool(validate_gemm3d(input, weights, act_info, conv_h, true));
- // If not supported, we need to perform im2col and col2im (or reshape layer)
- if(!skip_col2im)
- {
- skip_im2col = false;
- }
- }
-
- if(skip_col2im)
- {
- // If not supported, we need to perform im2col and col2im (or reshape layer)
- if(!bool(validate_gemm3d(input, weights, act_info, conv_h, skip_im2col)))
- {
- skip_im2col = false;
- skip_col2im = false;
- }
- }
-
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_channel) != input->dimension(idx_channel));
- ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
-
- // Validate biases
- if(biases != nullptr)
- {
- if(is_quantized)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
- }
- else if(is_bf16)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
- }
- ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(idx_kernels));
- ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
- }
-
- unsigned int mat_weights_cols = weights->dimension(idx_kernels);
- unsigned int mat_weights_rows = weights->dimension(idx_width) * weights->dimension(idx_height) * weights->dimension(idx_channel);
-
-    // Output tensor auto-initialization if not yet initialized
- ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayerReshapeWeights::validate(weights, nullptr, nullptr));
- weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, append_bias), 1, data_type);
- weights_reshaped_info.set_quantization_info(weights->quantization_info());
- weights_to_use = &weights_reshaped_info;
-
- if(!skip_im2col)
- {
- // Create tensor info for im2col reshaped inputs
- // For CPU, the batch size is on the fourth dimension
- TensorShape shape_im2col = input->tensor_shape();
- shape_im2col.set(0, mat_weights_rows);
- shape_im2col.set(1, conv_w * conv_h);
- shape_im2col.set(2, 1);
-
- im2col_reshaped_info = TensorInfo(shape_im2col, 1, data_type);
- im2col_reshaped_info.set_quantization_info(input->quantization_info());
- ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation));
- gemm_input_to_use = &im2col_reshaped_info;
- }
-
- // Create temporary GEMM output tensor in case we cannot skip col2im
- const DataType output_data_type = data_type == DataType::BFLOAT16 ? DataType::F32 : data_type;
- if(!skip_col2im)
- {
- TensorShape shape_gemm = gemm_input_to_use->tensor_shape();
- shape_gemm.set(0, mat_weights_cols);
- shape_gemm.set(1, conv_w * conv_h);
- info_gemm = TensorInfo(shape_gemm, 1, output_data_type);
- }
- else
- {
- info_gemm = TensorInfo(output->tensor_shape(), 1, output_data_type);
- }
- info_gemm.set_quantization_info(output->quantization_info()).set_data_layout(input->data_layout());
- gemm_output_to_use = &info_gemm;
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, act_info, skip_col2im ? conv_h : 0, skip_im2col));
-
- // Validate Col2Im/ReshapeLayer
- if(!skip_col2im && (data_layout == DataLayout::NCHW))
- {
- ARM_COMPUTE_RETURN_ON_ERROR(NECol2ImKernel::validate(gemm_output_to_use, output, Size2D(conv_w, conv_h)));
- }
-
- return Status{};
+ return cpu::CpuGemmConv2d::has_opt_impl(expected_weight_format, src, weights, biases, dst, conv_info, weights_info,
+ dilation, act_info, enable_fast_math);
}
void NEGEMMConvolutionLayer::run()
{
prepare();
-
- MemoryGroupResourceScope scope_mg(_memory_group);
-
- bool out_has_padding = _skip_col2im && (_original_output->info()->padding().bottom != 0 || _original_output->info()->padding().top != 0);
-
- if(!_skip_im2col)
- {
- // Run input reshaping
- unsigned int y_dim = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
- NEScheduler::get().schedule(_im2col_kernel.get(), y_dim);
- }
-
- // Handle the case where output has top/bottom padding
- const ITensor *out_to_use = out_has_padding ? &_gemm_output : _original_output;
- _gemm_output_3d.info()->extend_padding(out_to_use->info()->padding());
- _gemm_output_3d.allocator()->import_memory(out_to_use->buffer());
-
- // Runs NEGEMM or NEGEMMLowpMatrixMultiplyCore functions
- if(_is_quantized)
- {
- // Run gemmlowp
- _mm_gemmlowp.run();
- }
- else
- {
- // Run gemm
- _mm_gemm.run();
- }
-
- // Reshape output matrix
- if(!_skip_col2im)
- {
- if(_data_layout == DataLayout::NCHW)
- {
- NEScheduler::get().schedule(_col2im_kernel.get(), Window::DimY);
- }
- else
- {
- _reshape_layer.run();
- }
- }
- else if(out_has_padding)
- {
- _reshape_layer.run();
- }
-
- _gemm_output_3d.allocator()->free();
+ MemoryGroupResourceScope scope_mg(_impl->memory_group);
+ _impl->op->run(_impl->run_pack);
}
void NEGEMMConvolutionLayer::prepare()
{
- if(!_is_prepared)
+ if (!_impl->is_prepared)
{
- if(_weights_manager && _weights_manager->are_weights_managed(_original_weights))
- {
- _weights_manager->run(_original_weights, &_reshape_weights_managed);
- }
- else
- {
- // Run weights reshaping and mark original weights tensor as unused
- _weights_reshaped.allocator()->allocate();
- _reshape_weights.run();
- _original_weights->mark_as_unused();
- }
-
- // Prepare GEMM
- _is_quantized ? _mm_gemmlowp.prepare() : _mm_gemm.prepare();
- if(!_weights_reshaped.is_used())
- {
- _weights_reshaped.allocator()->free();
- }
+ _impl->op->prepare(_impl->run_pack);
- _is_prepared = true;
+ // Release temporary tensors that are only used in prepare stage
+ release_temporaries<Tensor>(_impl->aux_mem_req, _impl->workspace_tensors);
+ _impl->is_prepared = true;
}
}
} // namespace arm_compute
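
NEGEMM and NEGEMMConvolutionLayer both gain a has_opt_impl() query in this series. A hedged usage sketch (the ANY wildcard and the re-ordering step are assumptions about how a caller would use it, not code from the patch):

    // Probe for an optimized fixed-format kernel before committing to a weight layout;
    // on success, expected_wf reports the format the selected implementation expects.
    arm_compute::WeightFormat expected_wf = arm_compute::WeightFormat::ANY;
    const Status status = NEGEMMConvolutionLayer::has_opt_impl(expected_wf, src, weights, biases, dst,
                                                               conv_info, weights_info, dilation,
                                                               act_info, /*enable_fast_math=*/false);
    if (bool(status))
    {
        // Re-order the weights into expected_wf, then call configure() as usual.
    }
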
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index cc0f20e695..44bfc6a51e 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,604 +23,109 @@
*/
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/KernelDescriptors.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/IWeightsManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "src/core/helpers/AutoConfiguration.h"
+#include "arm_compute/runtime/Tensor.h"
-#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
-#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
-#include "src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h"
+
+using namespace arm_compute::experimental;
namespace arm_compute
{
-namespace
+struct NEGEMMLowpMatrixMultiplyCore::Impl
{
-cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info)
+ const ITensor *b{nullptr};
+ std::unique_ptr<cpu::CpuGemmLowpMatrixMultiplyCore> op{nullptr};
+ ITensorPack run_pack{};
+ ITensorPack prep_pack{};
+ MemoryGroup memory_group{};
+ IWeightsManager *weights_manager{nullptr};
+ MemoryRequirements aux_mem_req{};
+ WorkspaceData<Tensor> workspace_tensors{};
+ bool is_prepared{false};
+};
+
+NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager,
+ IWeightsManager *weights_manager)
+ : _impl(std::make_unique<Impl>())
{
- cpu::AsmGemmInfo asm_info;
- asm_info.method = cpu::AsmConvMethod::Im2Col;
- asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d();
- asm_info.depth_output_gemm3d = info.depth_output_gemm3d();
- asm_info.activation_info = info.activation_info();
- asm_info.output_stage = info.gemmlowp_output_stage();
-
- return asm_info;
+ _impl->weights_manager = weights_manager;
+ _impl->memory_group = MemoryGroup(memory_manager);
}
-} // namespace
-
-using namespace arm_compute::misc::shape_calculator;
-
NEGEMMLowpMatrixMultiplyCore::~NEGEMMLowpMatrixMultiplyCore() = default;
-NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
- : _memory_group(memory_manager), _weights_manager(weights_manager), _asm_glue(std::make_unique<cpu::CpuGemmAssemblyDispatch>(memory_manager, weights_manager)), _mm_kernel(), _mtx_a_reshape_kernel(),
- _mtx_b_reshape_kernel(), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(), _offset_contribution_kernel(), _offset_contribution_output_stage_kernel(), _activation_func(),
- _convert_to_signed_asymm(), _convert_from_signed_asymm(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _mm_result_s32(), _signed_a(), _signed_output(), _original_b(nullptr), _a_offset(0),
- _b_offset(0), _run_vector_matrix_multiplication(false), _assembly_path(false), _fused_assembly_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false), _fuse_output_stage(false),
- _run_activation(false), _flip_signedness(false)
-{
-}
-
-void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output, const GEMMInfo &gemm_info)
+void NEGEMMLowpMatrixMultiplyCore::configure(
+ const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output, const GEMMInfo &gemm_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
- ARM_COMPUTE_UNUSED(c);
- ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info));
-
- const ITensor *matrix_a = a;
- const ITensor *matrix_b = b;
- GEMMInfo info = gemm_info;
-
- // Set internal variables
- _a_offset = a->info()->quantization_info().uniform().offset;
- _b_offset = b->info()->quantization_info().uniform().offset;
- _run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
- _reshape_b_only_on_first_run = info.reshape_b_only_on_first_run();
- _is_prepared = false;
- _fused_assembly_path = false;
- _flip_signedness = is_data_type_quantized_per_channel(b->info()->data_type()) && (a->info()->data_type() == DataType::QASYMM8) && _reshape_b_only_on_first_run;
- _original_b = b;
-
- const ITensor *a_to_use = a;
-
-    // Convert QASYMM8 -> QASYMM8_SIGNED and back
- if(_flip_signedness)
- {
- const int32_t offset_correction = 128;
- const DataType dt = DataType::QASYMM8_SIGNED;
- const UniformQuantizationInfo iqinfo = a_to_use->info()->quantization_info().uniform();
-
- _signed_a.allocator()->init(a_to_use->info()->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction)));
- _memory_group.manage(&_signed_a);
- _convert_to_signed_asymm = std::make_unique<NEConvertQuantizedSignednessKernel>();
- _convert_to_signed_asymm->configure(a_to_use, &_signed_a);
- a_to_use = &_signed_a;
- _a_offset = _signed_a.info()->quantization_info().uniform().offset;
-
- const UniformQuantizationInfo oqinfo = output->info()->quantization_info().uniform();
- _memory_group.manage(&_signed_output);
- _signed_output.allocator()->init(output->info()->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction)));
-
- // Output stage correction
- GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();
- output_stage_corr.gemmlowp_offset = _signed_output.info()->quantization_info().uniform().offset;
- output_stage_corr.gemmlowp_min_bound -= offset_correction;
- output_stage_corr.gemmlowp_max_bound -= offset_correction;
- info.set_gemmlowp_output_stage(output_stage_corr);
-
- // Update matrix a
- matrix_a = &_signed_a;
- }
-
- // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
- if(info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
- {
- _fuse_output_stage = true;
- _memory_group.manage(&_mm_result_s32);
- TensorInfo info_mm_result_s32(output->info()->tensor_shape(), 1, DataType::S32);
- _mm_result_s32.allocator()->init(info_mm_result_s32);
- }
-
- // Initialize assembly kernel meta-data
- const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
-#ifdef __aarch64__
- switch(a->info()->data_type())
- {
- case DataType::QASYMM8:
- case DataType::QASYMM8_SIGNED:
- case DataType::U8:
- case DataType::S8:
- {
- if(is_data_type_quantized_asymmetric(a_to_use->info()->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
- {
- auto c_info_to_use = c == nullptr ? nullptr : c->info();
- _asm_glue->configure(a_to_use->info(), b->info(), c_info_to_use, output->info(), asm_info);
- _fused_assembly_path = _asm_glue->is_configured();
- _asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_2, c);
- _asm_glue_tensors.add_tensor(TensorType::ACL_DST, output);
- }
- else
- {
- auto output_to_use = (_fuse_output_stage ? &_mm_result_s32 : output);
- _asm_glue->configure(a_to_use->info(), b->info(), nullptr, output_to_use->info(), asm_info);
- _asm_glue_tensors.add_tensor(TensorType::ACL_DST, output_to_use);
- }
- _assembly_path = _asm_glue->is_configured();
- _asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_0, a_to_use);
- _asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_1, b);
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Datatype not supported");
- break;
- }
- }
-#endif /* __aarch64__ */
- if(!(_assembly_path || _run_vector_matrix_multiplication))
- {
- matrix_a = &_tmp_a;
- matrix_b = &_tmp_b;
-
- // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
- TensorInfo a_info(compute_interleaved_shape(*a_to_use->info()), 1, a_to_use->info()->data_type(), a_to_use->info()->quantization_info());
- // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
- TensorInfo b_info(compute_transpose1xW_shape(*b->info()), 1, b->info()->data_type(), b->info()->quantization_info());
- _tmp_a.allocator()->init(a_info);
- _tmp_b.allocator()->init(b_info);
- _memory_group.manage(&_tmp_a);
- if(!_reshape_b_only_on_first_run)
- {
- _memory_group.manage(&_tmp_b);
- }
-
- // Configure interleave kernel
- _mtx_a_reshape_kernel = std::make_unique<NEGEMMInterleave4x4Kernel>();
- _mtx_a_reshape_kernel->configure(a_to_use, &_tmp_a);
-
- // Configure transpose kernel
- _mtx_b_reshape_kernel = std::make_unique<NEGEMMTranspose1xWKernel>();
- _mtx_b_reshape_kernel->configure(b, &_tmp_b);
- }
-
- if(!_fused_assembly_path)
- {
- // Build reduction info
- const GEMMLowpReductionKernelInfo reduction_info(a_to_use->info()->dimension(0), false, 0, false);
-
- // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
- if(_a_offset != 0)
- {
- TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32);
-
- _vector_sum_col.allocator()->init(info_vector_sum_col);
- if(!_reshape_b_only_on_first_run)
- {
- _memory_group.manage(&_vector_sum_col);
- }
-
- // Configure Matrix B reduction kernel
- _mtx_b_reduction_kernel = std::make_unique<NEGEMMLowpMatrixBReductionKernel>();
- _mtx_b_reduction_kernel->configure(b, &_vector_sum_col, reduction_info);
- }
-
- // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
- if(_b_offset != 0)
- {
- TensorInfo info_vector_sum_row(compute_reductionB_shape(*a_to_use->info()), 1, DataType::S32);
-
- _vector_sum_row.allocator()->init(info_vector_sum_row);
- _memory_group.manage(&_vector_sum_row);
-
- // Configure matrix A reduction kernel
- _mtx_a_reduction_kernel = std::make_unique<NEGEMMLowpMatrixAReductionKernel>();
- _mtx_a_reduction_kernel->configure(a_to_use, &_vector_sum_row, reduction_info);
- }
-
- if(_fuse_output_stage)
- {
- // Configure matrix multiply kernel
- if(!_assembly_path)
- {
- _mm_kernel = std::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
- _mm_kernel->configure(matrix_a, matrix_b, &_mm_result_s32);
- }
-
- _offset_contribution_output_stage_kernel = std::make_unique<NEGEMMLowpOffsetContributionOutputStageKernel>();
- _offset_contribution_output_stage_kernel->configure(&_mm_result_s32,
- _a_offset == 0 ? nullptr : &_vector_sum_col,
- _b_offset == 0 ? nullptr : &_vector_sum_row, c,
- _flip_signedness ? &_signed_output : output,
- a->info()->dimension(0),
- _a_offset, _b_offset, info.gemmlowp_output_stage());
-
- if(_flip_signedness)
- {
- _convert_from_signed_asymm = std::make_unique<NEConvertQuantizedSignednessKernel>();
- _convert_from_signed_asymm->configure(&_signed_output, output);
- }
- }
- else
- {
- // Configure matrix multiply kernel
- if(!_assembly_path)
- {
- _mm_kernel = std::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
- _mm_kernel->configure(matrix_a, matrix_b, output);
- }
- // Configure offset contribution kernel
- _offset_contribution_kernel = std::make_unique<NEGEMMLowpOffsetContributionKernel>();
- _offset_contribution_kernel->configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a_to_use->info()->dimension(0), _a_offset, _b_offset);
- }
- }
- // Configure activation
- const ActivationLayerInfo &activation = gemm_info.activation_info();
- _run_activation = activation.enabled() && (!_assembly_path || !cpu::CpuGemmAssemblyDispatch::is_activation_supported(activation));
- if(_run_activation)
- {
- _activation_func.configure(output, nullptr, activation);
- }
-
- // Allocate tensors
- if(!_assembly_path && !_run_vector_matrix_multiplication)
- {
- _tmp_a.allocator()->allocate();
- if(!_reshape_b_only_on_first_run)
- {
- _tmp_b.allocator()->allocate();
- }
- }
-
- if(!_fused_assembly_path)
- {
- if(_a_offset != 0 && !_reshape_b_only_on_first_run)
- {
- _vector_sum_col.allocator()->allocate();
- }
-
- if(_b_offset != 0)
- {
- _vector_sum_row.allocator()->allocate();
- }
- }
- if(_fuse_output_stage)
- {
- _mm_result_s32.allocator()->allocate();
- }
-
- if(_flip_signedness)
- {
- _signed_a.allocator()->allocate();
- _signed_output.allocator()->allocate();
- }
+    // Mark the B matrix values as non-constant when B is reshaped on every run (i.e. may change between executions).
+ auto b_info_to_use = b->info()->clone();
+ if (!gemm_info.reshape_b_only_on_first_run())
+ {
+ b_info_to_use->set_are_values_constant(false);
+ }
+
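+    // State lives in the pimpl struct; all real work is delegated to the cpu::CpuGemmLowpMatrixMultiplyCore operator.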
+ _impl->b = b;
+ _impl->op = std::make_unique<cpu::CpuGemmLowpMatrixMultiplyCore>();
+ _impl->op->configure(a->info(), b_info_to_use.get(), (c != nullptr ? c->info() : nullptr), output->info(),
+ gemm_info);
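+    // run_pack binds every tensor needed at run time; prep_pack holds only the constant inputs (B and bias) consumed once by prepare().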
+ _impl->run_pack = {{TensorType::ACL_SRC_0, a},
+ {TensorType::ACL_SRC_1, b},
+ {TensorType::ACL_SRC_2, c},
+ {TensorType::ACL_DST, output}};
+ _impl->prep_pack = {{TensorType::ACL_SRC_1, b}, {TensorType::ACL_SRC_2, c}};
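+    // Pre-allocate the operator's auxiliary workspace according to its declared memory requirements.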
+ _impl->aux_mem_req = _impl->op->workspace();
+ _impl->workspace_tensors =
+ manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
}
-Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
+Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ const GEMMInfo &gemm_info)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE, "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
- "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
-
- GEMMInfo info = gemm_info;
- const ITensorInfo *matrix_a_info = a;
- const ITensorInfo *matrix_b_info = b;
-
- const ITensorInfo *a_to_use = a;
-
- TensorInfo tmp_a_info{};
- TensorInfo tmp_b_info{};
- TensorInfo mm_result_s32_info{};
-
- int32_t a_offset = a->quantization_info().uniform().offset;
- int32_t b_offset = b->quantization_info().uniform().offset;
-
- bool fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE;
- if(fuse_output_stage)
+    // Mark the B matrix values as non-constant when B is reshaped on every run (i.e. may change between executions).
+ auto b_info_to_use = b->clone();
+ if (!gemm_info.reshape_b_only_on_first_run())
{
- auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32));
+ b_info_to_use->set_are_values_constant(false);
}
- // Convert QASYMM8->QASYMM8_SIGNED
- TensorInfo signed_a{};
- TensorInfo signed_output{};
- bool flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && info.reshape_b_only_on_first_run();
- if(flip_signedness)
- {
- const int32_t offset_correction = 128;
- const DataType dt = DataType::QASYMM8_SIGNED;
- const UniformQuantizationInfo iqinfo = a_to_use->quantization_info().uniform();
-
- signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction));
- ARM_COMPUTE_RETURN_ON_ERROR(NEConvertQuantizedSignednessKernel::validate(a_to_use, &signed_a));
- a_to_use = &signed_a;
- a_offset = signed_a.quantization_info().uniform().offset;
-
- const UniformQuantizationInfo oqinfo = output->quantization_info().uniform();
- signed_output = output->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction));
-
- // Output stage correction
- GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();
- output_stage_corr.gemmlowp_offset = signed_output.quantization_info().uniform().offset;
- output_stage_corr.gemmlowp_min_bound -= offset_correction;
- output_stage_corr.gemmlowp_max_bound -= offset_correction;
- info.set_gemmlowp_output_stage(output_stage_corr);
-
- // Update matrix a
- matrix_a_info = &signed_a;
- }
-
- // Initialize assembly kernel meta-data
- const cpu::AsmGemmInfo asm_info = init_assembly_metadata(info);
-
- // Check if we need to run the optimized assembly kernel
- bool run_optimised = false;
- bool run_optimised_requantized = false;
- if(is_data_type_quantized_asymmetric(a_to_use->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
- {
- run_optimised = bool(cpu::CpuGemmAssemblyDispatch::validate(a_to_use, b, c, output, asm_info));
- run_optimised_requantized = run_optimised;
- }
- else
- {
- run_optimised = bool(cpu::CpuGemmAssemblyDispatch::validate(a_to_use, b, nullptr, fuse_output_stage ? &mm_result_s32_info : output, asm_info));
- }
-
- if(run_optimised)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
- if(info.depth_output_gemm3d() != 0)
- {
- if(info.reinterpret_input_as_3d())
- {
- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2));
- }
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
- }
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "NEGEMM cannot reinterpret the input tensor as 3D");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "NEGEMM cannot reinterpret the output tensor as 3D");
-
- const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
- if(!run_vector_matrix_multiplication)
- {
- matrix_a_info = &tmp_a_info;
- matrix_b_info = &tmp_b_info;
-
- // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
- TensorShape shape_tmp_a = a->tensor_shape();
- shape_tmp_a.set(0, a->dimension(0) * 4);
- shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));
-
- // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
- TensorShape shape_tmp_b = b->tensor_shape();
- shape_tmp_b.set(0, b->dimension(1) * 16);
- shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
-
- // Validate interleave kernel
- auto_init_if_empty(tmp_a_info, a_to_use->clone()->set_tensor_shape(shape_tmp_a));
- auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(shape_tmp_b));
-
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a_to_use, &tmp_a_info));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &tmp_b_info));
- }
- }
-
- if(!run_optimised_requantized)
- {
- TensorInfo info_vector_sum_col{};
- TensorInfo info_vector_sum_row{};
-
- const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false);
-
- // Validate matrix B reduction kernel only if _a_offset is not equal to 0
- if(a_offset != 0)
- {
- info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);
-
- // Configure Matrix B reduction kernel
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, reduction_info));
- }
-
- // Validate Matrix A reduction kernel only if _b_offset is not equal to 0
- if(b_offset != 0)
- {
- info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);
-
- // Configure matrix A reduction kernel
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(a_to_use, &info_vector_sum_row, reduction_info));
- }
-
- if(fuse_output_stage)
- {
- if(!run_optimised)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "NEGEMMLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "NEGEMMLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D");
-
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info));
- }
-
- // Validate offset contribution kernel
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
- a_offset == 0 ? nullptr : &info_vector_sum_col,
- b_offset == 0 ? nullptr : &info_vector_sum_row,
- c,
- flip_signedness ? &signed_output : output,
- a_offset, b_offset,
- info.gemmlowp_output_stage()));
- }
- else
- {
- if(!run_optimised)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "NEGEMMLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "NEGEMMLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D");
-
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output));
- }
- // Validate offset contribution kernel
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionKernel::validate(output,
- a_offset == 0 ? nullptr : &info_vector_sum_col,
- b_offset == 0 ? nullptr : &info_vector_sum_row,
- a_offset, b_offset));
- }
- }
-
- // Validate activation
- const ActivationLayerInfo &activation = gemm_info.activation_info();
- if(activation.enabled())
- {
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, activation));
- }
-
- return Status{};
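+    // All remaining checks are delegated to the cpu operator's validate, using the adjusted B tensor info.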
+ return cpu::CpuGemmLowpMatrixMultiplyCore::validate(a, b_info_to_use.get(), c, output, gemm_info);
}
void NEGEMMLowpMatrixMultiplyCore::run()
{
prepare();
-
- MemoryGroupResourceScope scope_mg(_memory_group);
-
- // Convert QASYMM8->QASYMM8_SIGNED
- if(_flip_signedness)
- {
- NEScheduler::get().schedule(_convert_to_signed_asymm.get(), Window::DimY);
- }
-
- // Run GEMM
- if(_asm_glue->is_configured())
- {
- _asm_glue->run(_asm_glue_tensors);
- }
- else
- {
- if(!_run_vector_matrix_multiplication)
- {
- // Run interleave kernel
- NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY);
-
- if(!_reshape_b_only_on_first_run)
- {
- // Run transpose kernel
- NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
- }
- }
- NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
- }
-
- if(!_fused_assembly_path)
- {
- // Run matrix A reduction kernel only if _b_offset is not equal to 0
- if(_b_offset != 0)
- {
- NEScheduler::get().schedule(_mtx_a_reduction_kernel.get(), Window::DimX);
- }
-
- // Run matrix B reduction kernel only if _a_offset is not equal to 0
- if(_a_offset != 0 && !_reshape_b_only_on_first_run)
- {
- NEScheduler::get().schedule(_mtx_b_reduction_kernel.get(), Window::DimX);
- }
-
- if(_fuse_output_stage)
- {
- // Run offset contribution kernel
- NEScheduler::get().schedule(_offset_contribution_output_stage_kernel.get(), Window::DimY);
- }
- else
- {
- // Run offset contribution kernel
- NEScheduler::get().schedule(_offset_contribution_kernel.get(), Window::DimY);
- }
- }
-
- // Convert QASYMM8_SIGNED->QASYMM8
- if(!_fused_assembly_path && _fuse_output_stage && _flip_signedness)
- {
- NEScheduler::get().schedule(_convert_from_signed_asymm.get(), Window::DimY);
- }
-
- // Run fused activation unless already run in the fused assembly
- if(_run_activation)
- {
- _activation_func.run();
- }
+ MemoryGroupResourceScope scope_mg(_impl->memory_group);
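+    // Kernel selection and scheduling happen inside the cpu operator; only the tensor pack is forwarded here.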
+ _impl->op->run(_impl->run_pack);
}
void NEGEMMLowpMatrixMultiplyCore::prepare()
{
- if(!_is_prepared)
+ if (!_impl->is_prepared)
{
- const bool original_b_managed_by_weights_manager = _weights_manager && _weights_manager->are_weights_managed(_original_b);
- // Run assembly reshape
- if(_asm_glue->is_configured())
- {
- if(!original_b_managed_by_weights_manager)
- {
- ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
- }
-            _asm_glue->prepare(_asm_glue_tensors);
- if(!original_b_managed_by_weights_manager)
- {
- _original_b->mark_as_unused();
- }
- }
- // Run non-assembly reshape
- else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue->is_configured())
- {
- if(!original_b_managed_by_weights_manager)
- {
- ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
- }
-
- // Run reshape kernel and mark original weights tensor as unused
- _tmp_b.allocator()->allocate();
- NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
- if(!original_b_managed_by_weights_manager)
- {
- _original_b->mark_as_unused();
- }
- }
-        // Run matrix B reduction kernel only if _a_offset is not equal to 0
-        if(!_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run)
+        _impl->op->prepare(_impl->prep_pack);
+
+        // A persistent workspace entry means the operator keeps its own reshaped copy of the weights,
+        // so the original B tensor is no longer needed after prepare().
+        auto has_reshape =
+            std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(),
+                         [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; });
+        if (has_reshape != std::end(_impl->aux_mem_req))
{
- _vector_sum_col.allocator()->allocate();
- NEScheduler::get().schedule(_mtx_b_reduction_kernel.get(), Window::DimX);
+ _impl->b->mark_as_unused();
}
- _is_prepared = true;
+        // Release temporary tensors that are only used in the prepare stage.
+ release_temporaries<Tensor>(_impl->aux_mem_req, _impl->workspace_tensors);
+ _impl->is_prepared = true;
}
}
} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp
index 807785a534..8178003b5e 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,162 +25,54 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Validate.h"
-#include "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h"
-namespace arm_compute
-{
-NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::~NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint() = default;
-
-void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift,
- int result_offset_after_shift, int min, int max)
-{
- auto k = std::make_unique<NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel>();
- k->configure(input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max);
- _kernel = std::move(k);
-}
-
-Status NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
-{
- return NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(input, bias, output, min, max);
-}
-
-NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::~NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint() = default;
-
-void NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift,
- int result_offset_after_shift, int min, int max)
-{
- auto k = std::make_unique<NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel>();
- k->configure(input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max);
- _kernel = std::move(k);
-}
-Status NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
+#include "src/cpu/operators/CpuGemmLowpOutputStage.h"
+
+namespace arm_compute
{
- return NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(input, bias, output, min, max);
-}
-
-NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::~NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint() = default;
-
-void NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int min, int max)
+struct NEGEMMLowpOutputStage::Impl
{
- auto k = std::make_unique<NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel>();
- k->configure(input, bias, output, result_fixedpoint_multiplier, result_shift, min, max);
- _kernel = std::move(k);
-}
-
-Status NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
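+    // Borrowed tensor pointers plus the owned cpu operator and the pack handed to it on each run.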
+ const ITensor *src{nullptr};
+ const ITensor *bias{nullptr};
+ ITensor *dst{nullptr};
+ ITensorPack run_pack{};
+ std::unique_ptr<cpu::CpuGemmLowpOutputStage> op{nullptr};
+};
+
+NEGEMMLowpOutputStage::NEGEMMLowpOutputStage() : _impl(std::make_unique<Impl>())
{
- return NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(input, bias, output, min, max);
}
-
NEGEMMLowpOutputStage::~NEGEMMLowpOutputStage() = default;
-void NEGEMMLowpOutputStage::configure(const ITensor *input, const ITensor *bias, ITensor *output, const GEMMLowpOutputStageInfo &info)
+void NEGEMMLowpOutputStage::configure(const ITensor *input,
+ const ITensor *bias,
+ ITensor *output,
+ const GEMMLowpOutputStageInfo &info)
{
// Perform validate step
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpOutputStage::validate(input->info(), bias != nullptr ? bias->info() : nullptr, output->info(), info));
-
- switch(info.type)
- {
- case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT:
- {
- switch(info.output_data_type)
- {
- case DataType::QASYMM8:
- {
- auto k = std::make_unique<NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel>();
- k->configure(input, bias, output, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
- _kernel = std::move(k);
- break;
- }
- case DataType::QASYMM8_SIGNED:
- {
- auto k = std::make_unique<NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel>();
- k->configure(input, bias, output, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
- _kernel = std::move(k);
- break;
- }
- case DataType::QSYMM16:
- {
- auto k = std::make_unique<NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel>();
- k->configure(input, bias, output, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
- _kernel = std::move(k);
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Unsupported output data type.");
- break;
- }
- }
- break;
- }
- case GEMMLowpOutputStageType::QUANTIZE_DOWN:
- {
- switch(info.output_data_type)
- {
- case DataType::QASYMM8:
- case DataType::QASYMM8_SIGNED:
- {
- auto k = std::make_unique<NEGEMMLowpQuantizeDownInt32ScaleKernel>();
- k->configure(input, bias, output, &info);
- _kernel = std::move(k);
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Unsupported output data type.");
- break;
- }
- }
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Unsupported GEMMLowpOutputStage type.");
- }
+ ARM_COMPUTE_ERROR_THROW_ON(
+ NEGEMMLowpOutputStage::validate(input->info(), bias != nullptr ? bias->info() : nullptr, output->info(), info));
+ _impl->src = input;
+ _impl->bias = bias;
+ _impl->dst = output;
+ _impl->op = std::make_unique<cpu::CpuGemmLowpOutputStage>();
+ _impl->op->configure(input->info(), (bias == nullptr) ? nullptr : bias->info(), output->info(), info);
+
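+    // The tensor pack is built once at configure time; run() simply forwards it to the operator.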
+ _impl->run_pack = {
+ {TensorType::ACL_SRC, _impl->src}, {TensorType::ACL_BIAS, _impl->bias}, {TensorType::ACL_DST, _impl->dst}};
}
-Status NEGEMMLowpOutputStage::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo &info)
+Status NEGEMMLowpOutputStage::validate(const ITensorInfo *input,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ const GEMMLowpOutputStageInfo &info)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::UNKNOWN, "NEGEMMLowpQuantizeDownScaleByFixedPoint cannot be used with UNKNOWN output data type.");
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16);
-
- ARM_COMPUTE_RETURN_ERROR_ON((info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN) && (info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT));
-    switch(info.type)
- {
- case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT:
- {
- switch(output->data_type())
- {
- case DataType::QASYMM8:
- return NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(input, bias, output, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
- case DataType::QASYMM8_SIGNED:
- return NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(input, bias, output, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
- case DataType::QSYMM16:
- return NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(input, bias, output, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
- default:
- return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported output data type.");
- }
- }
- case GEMMLowpOutputStageType::QUANTIZE_DOWN:
- {
- switch(output->data_type())
- {
- case DataType::QASYMM8:
- case DataType::QASYMM8_SIGNED:
- return NEGEMMLowpQuantizeDownInt32ScaleKernel::validate(input, bias, output, &info);
- default:
- return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported output data type.");
- }
- }
- default:
- return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported GEMMLowpOutputStage type.");
- }
+    return cpu::CpuGemmLowpOutputStage::validate(input, bias, output, info);
+}
+
+void NEGEMMLowpOutputStage::run()
+{
+ _impl->op->run(_impl->run_pack);
}
} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEGather.cpp b/src/runtime/NEON/functions/NEGather.cpp
index 86cbfd187a..62b8cfa48b 100644
--- a/src/runtime/NEON/functions/NEGather.cpp
+++ b/src/runtime/NEON/functions/NEGather.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include "arm_compute/runtime/NEON/functions/NEGather.h"
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEGatherKernel.h"
#include <utility>
@@ -31,6 +32,7 @@ namespace arm_compute
{
void NEGather::configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis)
{
+ ARM_COMPUTE_LOG_PARAMS(input, indices, output, axis);
auto k = std::make_unique<NEGatherKernel>();
k->configure(input, indices, output, axis);
_kernel = std::move(k);
diff --git a/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp b/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp
index 931fdb22f7..1022b4153e 100644
--- a/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp
+++ b/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp
@@ -25,10 +25,12 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/NEON/kernels/NEFillBorderKernel.h"
#include "src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h"
#include "src/core/NEON/kernels/NEPadLayerKernel.h"
-#include "src/core/helpers/AutoConfiguration.h"
namespace arm_compute
{
@@ -67,41 +69,55 @@ NEGenerateProposalsLayer::NEGenerateProposalsLayer(std::shared_ptr<IMemoryManage
NEGenerateProposalsLayer::~NEGenerateProposalsLayer() = default;
-void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *deltas, const ITensor *anchors, ITensor *proposals, ITensor *scores_out, ITensor *num_valid_proposals,
+void NEGenerateProposalsLayer::configure(const ITensor *scores,
+ const ITensor *deltas,
+ const ITensor *anchors,
+ ITensor *proposals,
+ ITensor *scores_out,
+ ITensor *num_valid_proposals,
const GenerateProposalsInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals);
- ARM_COMPUTE_ERROR_THROW_ON(NEGenerateProposalsLayer::validate(scores->info(), deltas->info(), anchors->info(), proposals->info(), scores_out->info(), num_valid_proposals->info(), info));
+ ARM_COMPUTE_ERROR_THROW_ON(NEGenerateProposalsLayer::validate(scores->info(), deltas->info(), anchors->info(),
+ proposals->info(), scores_out->info(),
+ num_valid_proposals->info(), info));
+ ARM_COMPUTE_LOG_PARAMS(scores, deltas, anchors, proposals, scores_out, num_valid_proposals, info);
_is_nhwc = scores->info()->data_layout() == DataLayout::NHWC;
const DataType scores_data_type = scores->info()->data_type();
_is_qasymm8 = scores_data_type == DataType::QASYMM8;
- const int num_anchors = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL));
- const int feat_width = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH));
- const int feat_height = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT));
- const int total_num_anchors = num_anchors * feat_width * feat_height;
- const int pre_nms_topN = info.pre_nms_topN();
- const int post_nms_topN = info.post_nms_topN();
- const size_t values_per_roi = info.values_per_roi();
+ const int num_anchors = scores->info()->dimension(
+ get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL));
+ const int feat_width = scores->info()->dimension(
+ get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH));
+ const int feat_height = scores->info()->dimension(
+ get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT));
+ const int total_num_anchors = num_anchors * feat_width * feat_height;
+ const int pre_nms_topN = info.pre_nms_topN();
+ const int post_nms_topN = info.post_nms_topN();
+ const size_t values_per_roi = info.values_per_roi();
const QuantizationInfo scores_qinfo = scores->info()->quantization_info();
const DataType rois_data_type = (_is_qasymm8) ? DataType::QASYMM16 : scores_data_type;
- const QuantizationInfo rois_qinfo = (_is_qasymm8) ? QuantizationInfo(0.125f, 0) : scores->info()->quantization_info();
+ const QuantizationInfo rois_qinfo =
+ (_is_qasymm8) ? QuantizationInfo(0.125f, 0) : scores->info()->quantization_info();
// Compute all the anchors
_memory_group.manage(&_all_anchors);
_compute_anchors = std::make_unique<NEComputeAllAnchorsKernel>();
- _compute_anchors->configure(anchors, &_all_anchors, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()));
+ _compute_anchors->configure(anchors, &_all_anchors,
+ ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()));
const TensorShape flatten_shape_deltas(values_per_roi, total_num_anchors);
- _deltas_flattened.allocator()->init(TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info()));
+ _deltas_flattened.allocator()->init(
+ TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info()));
// Permute and reshape deltas
_memory_group.manage(&_deltas_flattened);
- if(!_is_nhwc)
+ if (!_is_nhwc)
{
_memory_group.manage(&_deltas_permuted);
- _permute_deltas.configure(deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 });
+ _permute_deltas.configure(deltas, &_deltas_permuted, PermutationVector{2, 0, 1});
_flatten_deltas.configure(&_deltas_permuted, &_deltas_flattened);
_deltas_permuted.allocator()->allocate();
}
@@ -115,10 +131,10 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d
// Permute and reshape scores
_memory_group.manage(&_scores_flattened);
- if(!_is_nhwc)
+ if (!_is_nhwc)
{
_memory_group.manage(&_scores_permuted);
- _permute_scores.configure(scores, &_scores_permuted, PermutationVector{ 2, 0, 1 });
+ _permute_scores.configure(scores, &_scores_permuted, PermutationVector{2, 0, 1});
_flatten_scores.configure(&_scores_permuted, &_scores_flattened);
_scores_permuted.allocator()->allocate();
}
@@ -129,7 +145,7 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d
Tensor *anchors_to_use = &_all_anchors;
Tensor *deltas_to_use = &_deltas_flattened;
- if(_is_qasymm8)
+ if (_is_qasymm8)
{
_all_anchors_f32.allocator()->init(TensorInfo(_all_anchors.info()->tensor_shape(), 1, DataType::F32));
_deltas_flattened_f32.allocator()->init(TensorInfo(_deltas_flattened.info()->tensor_shape(), 1, DataType::F32));
@@ -152,11 +168,12 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d
anchors_to_use->allocator()->allocate();
_all_proposals_to_use = &_all_proposals;
- if(_is_qasymm8)
+ if (_is_qasymm8)
{
_memory_group.manage(&_all_proposals_quantized);
// Requantize all_proposals to QASYMM16 with 0.125 scale and 0 offset
- _all_proposals_quantized.allocator()->init(TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0)));
+ _all_proposals_quantized.allocator()->init(
+ TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0)));
_quantize_all_proposals.configure(&_all_proposals, &_all_proposals_quantized);
_all_proposals.allocator()->allocate();
_all_proposals_to_use = &_all_proposals_quantized;
@@ -172,7 +189,8 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d
// Note that NMS needs outputs preinitialized.
auto_init_if_empty(*scores_out->info(), TensorShape(scores_nms_size), 1, scores_data_type, scores_qinfo);
- auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, rois_data_type, rois_qinfo);
+ auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, rois_data_type,
+ rois_qinfo);
auto_init_if_empty(*num_valid_proposals->info(), TensorShape(1), 1, DataType::U32);
// Initialize temporaries (unused) outputs
@@ -185,17 +203,12 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d
_memory_group.manage(&_proposals_4_roi_values);
- const BoxNMSLimitInfo box_nms_info(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f, true, min_size_scaled, info.im_width(), info.im_height());
- _cpp_nms.configure(&_scores_flattened /*scores_in*/,
- _all_proposals_to_use /*boxes_in,*/,
- nullptr /* batch_splits_in*/,
- scores_out /* scores_out*/,
- &_proposals_4_roi_values /*boxes_out*/,
- &_classes_nms_unused /*classes*/,
- nullptr /*batch_splits_out*/,
- &_keeps_nms_unused /*keeps*/,
- num_valid_proposals /* keeps_size*/,
- box_nms_info);
+ const BoxNMSLimitInfo box_nms_info(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f,
+ true, min_size_scaled, info.im_width(), info.im_height());
+    _cpp_nms.configure(&_scores_flattened /*scores_in*/, _all_proposals_to_use /*boxes_in*/,
+ nullptr /* batch_splits_in*/, scores_out /* scores_out*/, &_proposals_4_roi_values /*boxes_out*/,
+ &_classes_nms_unused /*classes*/, nullptr /*batch_splits_out*/, &_keeps_nms_unused /*keeps*/,
+ num_valid_proposals /* keeps_size*/, box_nms_info);
_keeps_nms_unused.allocator()->allocate();
_classes_nms_unused.allocator()->allocate();
@@ -203,12 +216,17 @@ void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *d
_scores_flattened.allocator()->allocate();
// Add the first column that represents the batch id. This will be all zeros, as we don't support multiple images
- _pad.configure(&_proposals_4_roi_values, proposals, PaddingList{ { 1, 0 } });
+ _pad.configure(&_proposals_4_roi_values, proposals, PaddingList{{1, 0}});
_proposals_4_roi_values.allocator()->allocate();
}
-Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITensorInfo *deltas, const ITensorInfo *anchors, const ITensorInfo *proposals, const ITensorInfo *scores_out,
- const ITensorInfo *num_valid_proposals, const GenerateProposalsInfo &info)
+Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores,
+ const ITensorInfo *deltas,
+ const ITensorInfo *anchors,
+ const ITensorInfo *proposals,
+ const ITensorInfo *scores_out,
+ const ITensorInfo *num_valid_proposals,
+ const GenerateProposalsInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
@@ -216,9 +234,12 @@ Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(scores, deltas);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(scores, deltas);
- const int num_anchors = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL));
- const int feat_width = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH));
- const int feat_height = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT));
+ const int num_anchors =
+ scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL));
+ const int feat_width =
+ scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH));
+ const int feat_height =
+ scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT));
const int num_images = scores->dimension(3);
const int total_num_anchors = num_anchors * feat_width * feat_height;
const int values_per_roi = info.values_per_roi();
@@ -227,76 +248,100 @@ Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens
ARM_COMPUTE_RETURN_ERROR_ON(num_images > 1);
- if(is_qasymm8)
+ if (is_qasymm8)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(anchors, 1, DataType::QSYMM16);
const UniformQuantizationInfo anchors_qinfo = anchors->quantization_info().uniform();
ARM_COMPUTE_RETURN_ERROR_ON(anchors_qinfo.scale != 0.125f);
}
- TensorInfo all_anchors_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
- ARM_COMPUTE_RETURN_ON_ERROR(NEComputeAllAnchorsKernel::validate(anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())));
-
- TensorInfo deltas_permuted_info = deltas->clone()->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height)).set_is_resizable(true);
- TensorInfo scores_permuted_info = scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true);
- if(scores->data_layout() == DataLayout::NHWC)
+ TensorInfo all_anchors_info(
+ anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEComputeAllAnchorsKernel::validate(
+ anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())));
+
+ TensorInfo deltas_permuted_info =
+ deltas->clone()
+ ->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height))
+ .set_is_resizable(true);
+ TensorInfo scores_permuted_info =
+ scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true);
+ if (scores->data_layout() == DataLayout::NHWC)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(deltas, &deltas_permuted_info);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(scores, &scores_permuted_info);
}
else
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(deltas, &deltas_permuted_info, PermutationVector{ 2, 0, 1 }));
- ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(scores, &scores_permuted_info, PermutationVector{ 2, 0, 1 }));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(deltas, &deltas_permuted_info, PermutationVector{2, 0, 1}));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(scores, &scores_permuted_info, PermutationVector{2, 0, 1}));
}
- TensorInfo deltas_flattened_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
+ TensorInfo deltas_flattened_info(
+ deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayer::validate(&deltas_permuted_info, &deltas_flattened_info));
- TensorInfo scores_flattened_info(scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true));
- TensorInfo proposals_4_roi_values(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
+ TensorInfo scores_flattened_info(
+ scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true));
+ TensorInfo proposals_4_roi_values(
+ deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayer::validate(&scores_permuted_info, &scores_flattened_info));
TensorInfo *proposals_4_roi_values_to_use = &proposals_4_roi_values;
- TensorInfo proposals_4_roi_values_quantized(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
- proposals_4_roi_values_quantized.set_data_type(DataType::QASYMM16).set_quantization_info(QuantizationInfo(0.125f, 0));
- if(is_qasymm8)
+ TensorInfo proposals_4_roi_values_quantized(
+ deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
+ proposals_4_roi_values_quantized.set_data_type(DataType::QASYMM16)
+ .set_quantization_info(QuantizationInfo(0.125f, 0));
+ if (is_qasymm8)
{
- TensorInfo all_anchors_f32_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32));
+ TensorInfo all_anchors_f32_info(anchors->clone()
+ ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors))
+ .set_is_resizable(true)
+ .set_data_type(DataType::F32));
ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayer::validate(&all_anchors_info, &all_anchors_f32_info));
- TensorInfo deltas_flattened_f32_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32));
- ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayer::validate(&deltas_flattened_info, &deltas_flattened_f32_info));
-
- TensorInfo proposals_4_roi_values_f32(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32));
- ARM_COMPUTE_RETURN_ON_ERROR(NEBoundingBoxTransform::validate(&all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info,
- BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f)));
-
- ARM_COMPUTE_RETURN_ON_ERROR(NEQuantizationLayer::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized));
+ TensorInfo deltas_flattened_f32_info(deltas->clone()
+ ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors))
+ .set_is_resizable(true)
+ .set_data_type(DataType::F32));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEDequantizationLayer::validate(&deltas_flattened_info, &deltas_flattened_f32_info));
+
+ TensorInfo proposals_4_roi_values_f32(deltas->clone()
+ ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors))
+ .set_is_resizable(true)
+ .set_data_type(DataType::F32));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEBoundingBoxTransform::validate(
+ &all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info,
+ BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f)));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEQuantizationLayer::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized));
proposals_4_roi_values_to_use = &proposals_4_roi_values_quantized;
}
else
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEBoundingBoxTransform::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info,
- BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEBoundingBoxTransform::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info,
+ BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f)));
}
- ARM_COMPUTE_RETURN_ON_ERROR(NEPadLayer::validate(proposals_4_roi_values_to_use, proposals, PaddingList{ { 1, 0 } }));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPadLayer::validate(proposals_4_roi_values_to_use, proposals, PaddingList{{1, 0}}));
- if(num_valid_proposals->total_size() > 0)
+ if (num_valid_proposals->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->dimension(0) > 1);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(num_valid_proposals, 1, DataType::U32);
}
- if(proposals->total_size() > 0)
+ if (proposals->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON(proposals->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(0) != size_t(values_per_roi) + 1);
ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(1) != size_t(total_num_anchors));
- if(is_qasymm8)
+ if (is_qasymm8)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(proposals, 1, DataType::QASYMM16);
const UniformQuantizationInfo proposals_qinfo = proposals->quantization_info().uniform();
@@ -309,7 +354,7 @@ Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITens
}
}
- if(scores_out->total_size() > 0)
+ if (scores_out->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON(scores_out->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(scores_out->dimension(0) != size_t(total_num_anchors));
@@ -328,7 +373,7 @@ void NEGenerateProposalsLayer::run()
NEScheduler::get().schedule(_compute_anchors.get(), Window::DimY);
// Transpose and reshape the inputs
- if(!_is_nhwc)
+ if (!_is_nhwc)
{
_permute_deltas.run();
_permute_scores.run();
@@ -337,7 +382,7 @@ void NEGenerateProposalsLayer::run()
_flatten_deltas.run();
_flatten_scores.run();
- if(_is_qasymm8)
+ if (_is_qasymm8)
{
_dequantize_anchors.run();
_dequantize_deltas.run();
@@ -346,7 +391,7 @@ void NEGenerateProposalsLayer::run()
// Build the boxes
_bounding_box.run();
- if(_is_qasymm8)
+ if (_is_qasymm8)
{
_quantize_all_proposals.run();
}
diff --git a/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp b/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp
index 5965b9722f..78218cbdee 100644
--- a/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h"
namespace arm_compute
@@ -33,21 +35,29 @@ namespace arm_compute
NEInstanceNormalizationLayer::~NEInstanceNormalizationLayer() = default;
NEInstanceNormalizationLayer::NEInstanceNormalizationLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _normalization_kernel(), _is_nchw(false), _permute_input(), _permute_output(), _permuted_input(), _permuted_output()
+ : _memory_group(std::move(memory_manager)),
+ _normalization_kernel(),
+ _is_nchw(false),
+ _permute_input(),
+ _permute_output(),
+ _permuted_input(),
+ _permuted_output()
{
}
void NEInstanceNormalizationLayer::configure(ITensor *input, ITensor *output, float gamma, float beta, float epsilon)
{
+ ARM_COMPUTE_LOG_PARAMS(input, output, gamma, beta, epsilon);
+
const DataLayout data_layout = input->info()->data_layout();
- const auto kernel_descriptor = InstanceNormalizationLayerKernelInfo{ gamma, beta, epsilon, true };
+ const auto kernel_descriptor = InstanceNormalizationLayerKernelInfo{gamma, beta, epsilon, true};
// Configure Kernels
_is_nchw = data_layout == DataLayout::NCHW;
_normalization_kernel = std::make_unique<NEInstanceNormalizationLayerKernel>();
- if(!_is_nchw)
+ if (!_is_nchw)
{
_memory_group.manage(&_permuted_input);
_memory_group.manage(&_permuted_output);
@@ -69,11 +79,12 @@ void NEInstanceNormalizationLayer::configure(ITensor *input, ITensor *output, fl
}
}
-Status NEInstanceNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float gamma, float beta, float epsilon)
+Status NEInstanceNormalizationLayer::validate(
+ const ITensorInfo *input, const ITensorInfo *output, float gamma, float beta, float epsilon)
{
- return NEInstanceNormalizationLayerKernel::validate(&input->clone()->set_data_layout(DataLayout::NCHW),
- &output->clone()->set_data_layout(DataLayout::NCHW),
- InstanceNormalizationLayerKernelInfo{ gamma, beta, epsilon, true });
+ return NEInstanceNormalizationLayerKernel::validate(
+ &input->clone()->set_data_layout(DataLayout::NCHW), &output->clone()->set_data_layout(DataLayout::NCHW),
+ InstanceNormalizationLayerKernelInfo{gamma, beta, epsilon, true});
}
void NEInstanceNormalizationLayer::run()
@@ -81,7 +92,7 @@ void NEInstanceNormalizationLayer::run()
MemoryGroupResourceScope scope_mg(_memory_group);
// Permute input
- if(!_is_nchw)
+ if (!_is_nchw)
{
_permute_input.run();
}
@@ -89,7 +100,7 @@ void NEInstanceNormalizationLayer::run()
NEScheduler::get().schedule(_normalization_kernel.get(), Window::DimZ);
// Permute output
- if(!_is_nchw)
+ if (!_is_nchw)
{
_permute_output.run();
}
diff --git a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
index 505ee0a962..b7f6203efd 100644
--- a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
+++ b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,6 +25,8 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEL2NormalizeLayerKernel.h"
#include "src/core/NEON/kernels/NEReductionOperationKernel.h"
@@ -43,6 +45,8 @@ NEL2NormalizeLayer::NEL2NormalizeLayer(std::shared_ptr<IMemoryManager> memory_ma
void NEL2NormalizeLayer::configure(ITensor *input, ITensor *output, int axis, float epsilon)
{
+ ARM_COMPUTE_LOG_PARAMS(input, output, axis, epsilon);
+
// Manage intermediate buffers
_memory_group.manage(&_sumsq);
@@ -66,7 +70,8 @@ Status NEL2NormalizeLayer::validate(const ITensorInfo *input, const ITensorInfo
sum_sq.set_tensor_shape(shape);
const uint32_t actual_axis = wrap_around(axis, max_input_tensor_dim);
- ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperation::validate(input, &sum_sq, actual_axis, ReductionOperation::SUM_SQUARE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEReductionOperation::validate(input, &sum_sq, actual_axis, ReductionOperation::SUM_SQUARE));
// Reduce shape on axis
shape.set(actual_axis, 1);
diff --git a/src/runtime/NEON/functions/NELSTMLayer.cpp b/src/runtime/NEON/functions/NELSTMLayer.cpp
index d338e4fd2d..1a08cdeb06 100644
--- a/src/runtime/NEON/functions/NELSTMLayer.cpp
+++ b/src/runtime/NEON/functions/NELSTMLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,20 +24,13 @@
#include "arm_compute/runtime/NEON/functions/NELSTMLayer.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/InfoHelpers.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/common/LSTMParams.h"
-#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
-#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
-#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+
+#include "src/common/utils/Log.h"
namespace arm_compute
{
@@ -47,35 +40,122 @@ using namespace arm_compute::utils::info_helpers;
NELSTMLayer::~NELSTMLayer() = default;
NELSTMLayer::NELSTMLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _fully_connected_input_gate(), _accum_input_gate1(), _subtract_input_gate(), _pixelwise_mul_input_gate(), _activation_input_gate(),
- _fully_connected_forget_gate(), _accum_forget_gate1(), _pixelwise_mul_forget_gate(), _activation_forget_gate(), _fully_connected_cell_state(), _gemm_cell_state1(), _transpose_cell_state(),
- _accum_cell_state1(), _accum_cell_state2(), _pixelwise_mul_cell_state1(), _activation_cell_state(), _cell_clip(), _pixelwise_mul_cell_state2(), _fully_connected_output(),
- _pixelwise_mul_output_state1(), _accum_output1(), _activation_output(), _activation_output_state(), _pixelwise_mul_output_state2(), _fully_connected_output_state(), _projection_clip(),
- _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _concat_inputs_forget_gate(), _concat_weights_forget_gate(), _concat_weights_input_gate(), _concat_weights_output(),
- _mean_std_norm_input_gate(), _pixelwise_mul_input_gate_coeff(), _accum_input_gate_bias(), _mean_std_norm_forget_gate(), _pixelwise_mul_forget_gate_coeff(), _accum_forget_gate_bias(),
- _mean_std_norm_cell_gate(), _pixelwise_mul_cell_gate_coeff(), _accum_cell_gate_bias(), _mean_std_norm_output_gate(), _pixelwise_mul_output_gate_coeff(), _accum_output_gate_bias(), _input_gate_out1(),
- _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(), _forget_gate_out5(), _forget_gate_out6(),
- _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(), _cell_state_out5(), _output1(), _output2(), _output3(), _output4(), _cell_state_activation(), _output_state1(), _ones(),
- _input_layer_norm_out1(), _input_layer_norm_out2(), _forget_layer_norm_out1(), _forget_layer_norm_out2(), _cell_layer_norm_out1(), _cell_layer_norm_out2(), _output_layer_norm_out1(),
- _output_layer_norm_out2(), _run_peephole_opt(false), _run_cifg_opt(false), _perform_cell_clipping(false), _has_projection_weights(false), _perform_projection_clipping(false), _is_prepared(false),
+ : _memory_group(std::move(memory_manager)),
+ _fully_connected_input_gate(),
+ _accum_input_gate1(),
+ _subtract_input_gate(),
+ _pixelwise_mul_input_gate(),
+ _activation_input_gate(),
+ _fully_connected_forget_gate(),
+ _accum_forget_gate1(),
+ _pixelwise_mul_forget_gate(),
+ _activation_forget_gate(),
+ _fully_connected_cell_state(),
+ _gemm_cell_state1(),
+ _transpose_cell_state(),
+ _accum_cell_state1(),
+ _accum_cell_state2(),
+ _pixelwise_mul_cell_state1(),
+ _activation_cell_state(),
+ _cell_clip(),
+ _pixelwise_mul_cell_state2(),
+ _fully_connected_output(),
+ _pixelwise_mul_output_state1(),
+ _accum_output1(),
+ _activation_output(),
+ _activation_output_state(),
+ _pixelwise_mul_output_state2(),
+ _fully_connected_output_state(),
+ _projection_clip(),
+ _copy_cell_state(),
+ _copy_output(),
+ _concat_scratch_buffer(),
+ _concat_inputs_forget_gate(),
+ _concat_weights_forget_gate(),
+ _concat_weights_input_gate(),
+ _concat_weights_output(),
+ _mean_std_norm_input_gate(),
+ _pixelwise_mul_input_gate_coeff(),
+ _accum_input_gate_bias(),
+ _mean_std_norm_forget_gate(),
+ _pixelwise_mul_forget_gate_coeff(),
+ _accum_forget_gate_bias(),
+ _mean_std_norm_cell_gate(),
+ _pixelwise_mul_cell_gate_coeff(),
+ _accum_cell_gate_bias(),
+ _mean_std_norm_output_gate(),
+ _pixelwise_mul_output_gate_coeff(),
+ _accum_output_gate_bias(),
+ _input_gate_out1(),
+ _input_gate_out2(),
+ _input_gate_out3(),
+ _input_gate_out4(),
+ _forget_gate_out1(),
+ _forget_gate_out2(),
+ _forget_gate_out3(),
+ _forget_gate_out4(),
+ _forget_gate_out5(),
+ _forget_gate_out6(),
+ _cell_state_out1(),
+ _cell_state_out2(),
+ _cell_state_out3(),
+ _cell_state_out4(),
+ _cell_state_out5(),
+ _output1(),
+ _output2(),
+ _output3(),
+ _output4(),
+ _cell_state_activation(),
+ _output_state1(),
+ _ones(),
+ _input_layer_norm_out1(),
+ _input_layer_norm_out2(),
+ _forget_layer_norm_out1(),
+ _forget_layer_norm_out2(),
+ _cell_layer_norm_out1(),
+ _cell_layer_norm_out2(),
+ _output_layer_norm_out1(),
+ _output_layer_norm_out2(),
+ _run_peephole_opt(false),
+ _run_cifg_opt(false),
+ _perform_cell_clipping(false),
+ _has_projection_weights(false),
+ _perform_projection_clipping(false),
+ _is_prepared(false),
_is_layer_norm_lstm(false)
{
}
-void NELSTMLayer::configure(const ITensor *input,
- const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights,
- const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights,
- const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias,
- const ITensor *output_state_in, const ITensor *cell_state_in,
- ITensor *scratch_buffer, ITensor *output_state_out, ITensor *cell_state_out, ITensor *output,
- const LSTMParams<ITensor> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold)
+void NELSTMLayer::configure(const ITensor *input,
+ const ITensor *input_to_forget_weights,
+ const ITensor *input_to_cell_weights,
+ const ITensor *input_to_output_weights,
+ const ITensor *recurrent_to_forget_weights,
+ const ITensor *recurrent_to_cell_weights,
+ const ITensor *recurrent_to_output_weights,
+ const ITensor *forget_gate_bias,
+ const ITensor *cell_bias,
+ const ITensor *output_gate_bias,
+ const ITensor *output_state_in,
+ const ITensor *cell_state_in,
+ ITensor *scratch_buffer,
+ ITensor *output_state_out,
+ ITensor *cell_state_out,
+ ITensor *output,
+ const LSTMParams<ITensor> &lstm_params,
+ const ActivationLayerInfo &activation_info,
+ float cell_threshold,
+ float projection_threshold)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input,
- input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
- forget_gate_bias, cell_bias, output_gate_bias,
- output_state_in, cell_state_in,
+ forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in,
scratch_buffer, output_state_out, cell_state_out, output);
+ ARM_COMPUTE_LOG_PARAMS(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+ recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
+ forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in,
+ scratch_buffer, output_state_out, cell_state_out, output, lstm_params, activation_info,
+ cell_threshold, projection_threshold);
_is_layer_norm_lstm = lstm_params.use_layer_norm();
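
For readers tracing the gate blocks below: when use_layer_norm() is set, each gate's fully connected layer is configured with a nullptr bias, and the pre-activation is instead normalized (NEMeanStdDevNormalizationLayer), scaled by that gate's layer-norm weights (NEPixelWiseMultiplication), and only then biased (NEArithmeticAddition). A minimal scalar sketch, assuming the usual mean/variance normalization; names are illustrative, not ACL API:

#include <cmath>
#include <cstddef>
#include <vector>

// Per-gate layer-norm path: normalize the pre-activation, scale by the
// gate's norm weights, then add the gate bias (added separately here, as in
// the patch, because the FC layer was given a nullptr bias).
void layer_norm_gate(std::vector<float> &gate, const std::vector<float> &norm_w,
                     const std::vector<float> &bias, float eps = 1e-8f)
{
    float mean = 0.f;
    for (float v : gate)
        mean += v;
    mean /= static_cast<float>(gate.size());
    float var = 0.f;
    for (float v : gate)
        var += (v - mean) * (v - mean);
    var /= static_cast<float>(gate.size());
    const float inv_std = 1.f / std::sqrt(var + eps);
    for (std::size_t k = 0; k < gate.size(); ++k)
        gate[k] = (gate[k] - mean) * inv_std * norm_w[k] + bias[k];
}
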
@@ -84,13 +164,12 @@ void NELSTMLayer::configure(const ITensor *input,
build_lstm_params_tensor_info(lstm_params, &lstm_params_info);
// Validate
- ARM_COMPUTE_ERROR_THROW_ON(NELSTMLayer::validate(input->info(), input_to_forget_weights->info(),
- input_to_cell_weights->info(), input_to_output_weights->info(),
- recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
- forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(),
- output_state_in->info(), cell_state_in->info(),
- scratch_buffer->info(), output_state_out->info(), cell_state_out->info(), output->info(),
- lstm_params_info, activation_info, cell_threshold, projection_threshold));
+ ARM_COMPUTE_ERROR_THROW_ON(NELSTMLayer::validate(
+ input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(),
+ recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
+ forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), output_state_in->info(),
+ cell_state_in->info(), scratch_buffer->info(), output_state_out->info(), cell_state_out->info(), output->info(),
+ lstm_params_info, activation_info, cell_threshold, projection_threshold));
const TensorShape cell_state_shape = cell_state_in->info()->tensor_shape();
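
The following hunk configures the forget gate. Concatenating (input, output_state_in) along X and the two weight tensors lets a single NEFullyConnectedLayer evaluate both matrix products, and the peephole path adds an element-wise cell_state_in * cell_to_forget_weights term. A scalar sketch of the computation; names are illustrative, not ACL API:

#include <cmath>
#include <cstddef>
#include <vector>

// forget_gate = LOGISTIC(W_f * [x, h_prev] + b_f  (+ c_prev .* w_cf with peephole))
std::vector<float> forget_gate(const std::vector<std::vector<float>> &W_f, // num_cells x (in+out)
                               const std::vector<float> &xh,               // concatenated [x, h_prev]
                               const std::vector<float> &b_f, const std::vector<float> &c_prev,
                               const std::vector<float> &w_cf, bool peephole)
{
    std::vector<float> f(b_f.size());
    for (std::size_t i = 0; i < f.size(); ++i)
    {
        float acc = b_f[i];
        for (std::size_t j = 0; j < xh.size(); ++j)
            acc += W_f[i][j] * xh[j]; // fused FC over the concatenated input
        if (peephole)
            acc += c_prev[i] * w_cf[i]; // PixelWiseMul(cell_state, cell_to_forget_weights)
        f[i] = 1.f / (1.f + std::exp(-acc)); // LOGISTIC activation
    }
    return f;
}
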
@@ -117,20 +196,23 @@ void NELSTMLayer::configure(const ITensor *input,
_concat_weights_forget_gate.configure(weights_vector, &_forget_gate_out6, Window::DimX);
_memory_group.manage(&_forget_gate_out5);
- _fully_connected_forget_gate.configure(&_forget_gate_out2, &_forget_gate_out6, (_is_layer_norm_lstm) ? nullptr : forget_gate_bias, &_forget_gate_out5);
+ _fully_connected_forget_gate.configure(&_forget_gate_out2, &_forget_gate_out6,
+ (_is_layer_norm_lstm) ? nullptr : forget_gate_bias, &_forget_gate_out5);
_memory_group.manage(&_forget_gate_out1);
_memory_group.manage(&_forget_gate_out3);
_forget_gate_out6.allocator()->allocate();
Tensor *forget_gate_out = &_forget_gate_out5;
- if(lstm_params.has_peephole_opt())
+ if (lstm_params.has_peephole_opt())
{
_forget_gate_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_run_peephole_opt = true;
_memory_group.manage(&_forget_gate_out4);
- _pixelwise_mul_forget_gate.configure(cell_state_in, lstm_params.cell_to_forget_weights(), &_forget_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
- _accum_forget_gate1.configure(&_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3, ConvertPolicy::SATURATE);
+ _pixelwise_mul_forget_gate.configure(cell_state_in, lstm_params.cell_to_forget_weights(), &_forget_gate_out4, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _accum_forget_gate1.configure(&_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3,
+ ConvertPolicy::SATURATE);
_forget_gate_out4.allocator()->allocate();
_forget_gate_out5.allocator()->allocate();
forget_gate_out = &_forget_gate_out3;
@@ -139,21 +221,25 @@ void NELSTMLayer::configure(const ITensor *input,
{
_forget_gate_out3.allocator()->allocate();
}
- if(_is_layer_norm_lstm)
+ if (_is_layer_norm_lstm)
{
_forget_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_forget_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_memory_group.manage(&_forget_layer_norm_out1);
_memory_group.manage(&_forget_layer_norm_out2);
_mean_std_norm_forget_gate.configure(forget_gate_out);
- _pixelwise_mul_forget_gate_coeff.configure(forget_gate_out, lstm_params.forget_layer_norm_weights(), &_forget_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _pixelwise_mul_forget_gate_coeff.configure(forget_gate_out, lstm_params.forget_layer_norm_weights(),
+ &_forget_layer_norm_out1, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
// forget_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before
forget_gate_out->allocator()->allocate();
- _accum_forget_gate_bias.configure(&_forget_layer_norm_out1, forget_gate_bias, &_forget_layer_norm_out2, ConvertPolicy::SATURATE);
+ _accum_forget_gate_bias.configure(&_forget_layer_norm_out1, forget_gate_bias, &_forget_layer_norm_out2,
+ ConvertPolicy::SATURATE);
_forget_layer_norm_out1.allocator()->allocate();
forget_gate_out = &_forget_layer_norm_out2;
}
- _activation_forget_gate.configure(forget_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _activation_forget_gate.configure(forget_gate_out, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
// Configure block that calculates the input gate
// input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG
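
As the comments above note, CIFG couples the gates: the hunk below fills the _ones tensor and uses _subtract_input_gate so that input_gate = 1 - forget_gate, skipping the input gate's own weights entirely. Scalar equivalent (illustrative):

#include <cstddef>
#include <vector>

// With CIFG: i_t = 1 - f_t (ones tensor minus the forget gate; ACL does this
// with a saturating NEArithmeticSubtraction).
std::vector<float> cifg_input_gate(const std::vector<float> &forget)
{
    std::vector<float> input(forget.size());
    for (std::size_t k = 0; k < forget.size(); ++k)
        input[k] = 1.f - forget[k];
    return input;
}
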
@@ -162,7 +248,7 @@ void NELSTMLayer::configure(const ITensor *input,
// input_gate = Activation((input,output_state) * (input_to_input_weights,recurrent_to_input_weights) + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG
_input_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
Tensor *input_gate_out = &_input_gate_out1;
- if(lstm_params.has_cifg_opt())
+ if (lstm_params.has_cifg_opt())
{
_memory_group.manage(&_input_gate_out1);
_ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
@@ -184,15 +270,19 @@ void NELSTMLayer::configure(const ITensor *input,
_memory_group.manage(&_input_gate_out1);
_memory_group.manage(&_input_gate_out4);
- _fully_connected_input_gate.configure(&_forget_gate_out2, &_input_gate_out2, (_is_layer_norm_lstm) ? nullptr : lstm_params.input_gate_bias(), &_input_gate_out3);
+ _fully_connected_input_gate.configure(&_forget_gate_out2, &_input_gate_out2,
+ (_is_layer_norm_lstm) ? nullptr : lstm_params.input_gate_bias(),
+ &_input_gate_out3);
_input_gate_out2.allocator()->allocate();
input_gate_out = &_input_gate_out3;
- if(_run_peephole_opt)
+ if (_run_peephole_opt)
{
_memory_group.manage(&_input_gate_out4);
- _pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
- _accum_input_gate1.configure(&_input_gate_out3, &_input_gate_out4, &_input_gate_out1, ConvertPolicy::SATURATE);
+ _pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out4,
+ 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _accum_input_gate1.configure(&_input_gate_out3, &_input_gate_out4, &_input_gate_out1,
+ ConvertPolicy::SATURATE);
_input_gate_out3.allocator()->allocate();
_input_gate_out4.allocator()->allocate();
input_gate_out = &_input_gate_out1;
@@ -202,21 +292,25 @@ void NELSTMLayer::configure(const ITensor *input,
_input_gate_out1.allocator()->allocate();
}
- if(_is_layer_norm_lstm)
+ if (_is_layer_norm_lstm)
{
_input_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_input_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_memory_group.manage(&_input_layer_norm_out1);
_memory_group.manage(&_input_layer_norm_out2);
_mean_std_norm_input_gate.configure(input_gate_out);
- _pixelwise_mul_input_gate_coeff.configure(input_gate_out, lstm_params.input_layer_norm_weights(), &_input_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _pixelwise_mul_input_gate_coeff.configure(input_gate_out, lstm_params.input_layer_norm_weights(),
+ &_input_layer_norm_out1, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
// input_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before
input_gate_out->allocator()->allocate();
- _accum_input_gate_bias.configure(&_input_layer_norm_out1, lstm_params.input_gate_bias(), &_input_layer_norm_out2, ConvertPolicy::SATURATE);
+ _accum_input_gate_bias.configure(&_input_layer_norm_out1, lstm_params.input_gate_bias(),
+ &_input_layer_norm_out2, ConvertPolicy::SATURATE);
_input_layer_norm_out1.allocator()->allocate();
input_gate_out = &_input_layer_norm_out2;
}
- _activation_input_gate.configure(input_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _activation_input_gate.configure(input_gate_out, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
}
// Configure block that calculates the cell state
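
The next hunk wires this block as cell_state = forget_gate .* cell_state_in + input_gate .* g, where g is the activated fully connected candidate, followed by optional clipping. LU_BOUNDED_RELU computes min(a, max(b, x)), so its first parameter is the upper bound; that is why the clip below is configured with (cell_threshold, -cell_threshold). Scalar sketch (illustrative):

#include <algorithm>
#include <cstddef>
#include <vector>

// c_t = f .* c_prev + i .* g, with g the activated candidate; optional clip
// via LU_BOUNDED_RELU(a = cell_threshold, b = -cell_threshold) = min(a, max(b, x)).
void cell_state_update(std::vector<float> &c, const std::vector<float> &f,
                       const std::vector<float> &i, const std::vector<float> &g,
                       float cell_threshold)
{
    for (std::size_t k = 0; k < c.size(); ++k)
    {
        float v = f[k] * c[k] + i[k] * g[k];
        if (cell_threshold != 0.f)
            v = std::min(cell_threshold, std::max(-cell_threshold, v));
        c[k] = v;
    }
}
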
@@ -229,7 +323,8 @@ void NELSTMLayer::configure(const ITensor *input,
_cell_state_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_memory_group.manage(&_cell_state_out1);
- _fully_connected_cell_state.configure(input, input_to_cell_weights, (_is_layer_norm_lstm) ? nullptr : cell_bias, &_cell_state_out1);
+ _fully_connected_cell_state.configure(input, input_to_cell_weights, (_is_layer_norm_lstm) ? nullptr : cell_bias,
+ &_cell_state_out1);
_memory_group.manage(&_cell_state_out2);
_transpose_cell_state.configure(recurrent_to_cell_weights, &_cell_state_out2);
_memory_group.manage(&_cell_state_out3);
@@ -238,33 +333,40 @@ void NELSTMLayer::configure(const ITensor *input,
_memory_group.manage(&_cell_state_out4);
_accum_cell_state1.configure(&_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE);
Tensor *cell_state_out_ptr = &_cell_state_out4;
- if(_is_layer_norm_lstm)
+ if (_is_layer_norm_lstm)
{
_cell_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_cell_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_memory_group.manage(&_cell_layer_norm_out1);
_memory_group.manage(&_cell_layer_norm_out2);
_mean_std_norm_cell_gate.configure(cell_state_out_ptr);
- _pixelwise_mul_cell_gate_coeff.configure(cell_state_out_ptr, lstm_params.cell_layer_norm_weights(), &_cell_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _pixelwise_mul_cell_gate_coeff.configure(cell_state_out_ptr, lstm_params.cell_layer_norm_weights(),
+ &_cell_layer_norm_out1, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
// cell_state_out_ptr is going to be reassigned, so allocate the tensor that it was assigned to before
cell_state_out_ptr->allocator()->allocate();
- _accum_cell_gate_bias.configure(&_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2, ConvertPolicy::SATURATE);
+ _accum_cell_gate_bias.configure(&_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2,
+ ConvertPolicy::SATURATE);
_cell_layer_norm_out1.allocator()->allocate();
cell_state_out_ptr = &_cell_layer_norm_out2;
}
_activation_cell_state.configure(cell_state_out_ptr, nullptr, activation_info);
_memory_group.manage(&_cell_state_out5);
- _pixelwise_mul_cell_state1.configure(cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _pixelwise_mul_cell_state1.configure(cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
cell_state_out_ptr->allocator()->allocate();
- _pixelwise_mul_cell_state2.configure(forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _pixelwise_mul_cell_state2.configure(forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
_accum_cell_state2.configure(&_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE);
_cell_state_out3.allocator()->allocate();
_cell_state_out5.allocator()->allocate();
// Perform clipping
- if(cell_threshold != 0.f)
+ if (cell_threshold != 0.f)
{
_perform_cell_clipping = true;
- _cell_clip.configure(&_cell_state_out1, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -cell_threshold, cell_threshold));
+ _cell_clip.configure(&_cell_state_out1, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ cell_threshold, -cell_threshold));
}
// Configure block that calculates the output
@@ -282,18 +384,20 @@ void NELSTMLayer::configure(const ITensor *input,
_memory_group.manage(&_output1);
_memory_group.manage(&_output4);
- _fully_connected_output.configure(&_forget_gate_out2, &_output2, (_is_layer_norm_lstm) ? nullptr : output_gate_bias, &_output4);
+ _fully_connected_output.configure(&_forget_gate_out2, &_output2, (_is_layer_norm_lstm) ? nullptr : output_gate_bias,
+ &_output4);
_output2.allocator()->allocate();
_forget_gate_out2.allocator()->allocate();
Tensor *output_gate_out = &_output4;
- if(lstm_params.has_peephole_opt())
+ if (lstm_params.has_peephole_opt())
{
_output3.allocator()->init(TensorInfo(_cell_state_out1.info()->tensor_shape(), 1, input->info()->data_type()));
_memory_group.manage(&_output3);
- _pixelwise_mul_output_state1.configure(&_cell_state_out1, lstm_params.cell_to_output_weights(), &_output3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _pixelwise_mul_output_state1.configure(&_cell_state_out1, lstm_params.cell_to_output_weights(), &_output3, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
_accum_output1.configure(&_output4, &_output3, &_output1, ConvertPolicy::SATURATE);
_output4.allocator()->allocate();
output_gate_out = &_output1;
@@ -305,21 +409,25 @@ void NELSTMLayer::configure(const ITensor *input,
{
_output1.allocator()->allocate();
}
- if(_is_layer_norm_lstm)
+ if (_is_layer_norm_lstm)
{
_output_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_output_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
_memory_group.manage(&_output_layer_norm_out1);
_memory_group.manage(&_output_layer_norm_out2);
_mean_std_norm_output_gate.configure(output_gate_out);
- _pixelwise_mul_output_gate_coeff.configure(output_gate_out, lstm_params.output_layer_norm_weights(), &_output_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _pixelwise_mul_output_gate_coeff.configure(output_gate_out, lstm_params.output_layer_norm_weights(),
+ &_output_layer_norm_out1, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
// output_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before
output_gate_out->allocator()->allocate();
- _accum_output_gate_bias.configure(&_output_layer_norm_out1, output_gate_bias, &_output_layer_norm_out2, ConvertPolicy::SATURATE);
+ _accum_output_gate_bias.configure(&_output_layer_norm_out1, output_gate_bias, &_output_layer_norm_out2,
+ ConvertPolicy::SATURATE);
_output_layer_norm_out1.allocator()->allocate();
output_gate_out = &_output_layer_norm_out2;
}
- _activation_output.configure(output_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _activation_output.configure(output_gate_out, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
// Configure block that calculates the output state
/** lstm_res = PixelwiseMul(output, Activation(cell_state))
@@ -336,20 +444,24 @@ void NELSTMLayer::configure(const ITensor *input,
_memory_group.manage(&_cell_state_activation);
_activation_output_state.configure(&_cell_state_out1, &_cell_state_activation, activation_info);
- _pixelwise_mul_output_state2.configure(&_cell_state_activation, output_gate_out, output_state_out_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _pixelwise_mul_output_state2.configure(&_cell_state_activation, output_gate_out, output_state_out_tmp, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
_cell_state_activation.allocator()->allocate();
output_gate_out->allocator()->allocate();
- if(lstm_params.has_projection())
+ if (lstm_params.has_projection())
{
_has_projection_weights = true;
- _fully_connected_output_state.configure(output_state_out_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out);
+ _fully_connected_output_state.configure(output_state_out_tmp, lstm_params.projection_weights(),
+ lstm_params.projection_bias(), output_state_out);
_output_state1.allocator()->allocate();
// Perform clipping
- if(projection_threshold != 0.f)
+ if (projection_threshold != 0.f)
{
_perform_projection_clipping = true;
- _projection_clip.configure(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold));
+ _projection_clip.configure(output_state_out, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ -projection_threshold, projection_threshold));
}
}
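
Per the lstm_res = PixelwiseMul(output, Activation(cell_state)) comment above, the output state is the output gate modulating the activated cell state, optionally followed by the projection FC and a second clip. Scalar sketch, assuming the common TANH activation_info (illustrative):

#include <cmath>
#include <cstddef>
#include <vector>

// h_t = o_t .* act(c_t); with projection, h_t then becomes W_proj * h_t + b_proj,
// clipped to [-projection_threshold, projection_threshold] when requested.
std::vector<float> output_state(const std::vector<float> &o, const std::vector<float> &c)
{
    std::vector<float> h(o.size());
    for (std::size_t k = 0; k < h.size(); ++k)
        h[k] = o[k] * std::tanh(c[k]); // assumes activation_info is TANH
    return h;
}
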
@@ -359,7 +471,7 @@ void NELSTMLayer::configure(const ITensor *input,
// Vector for holding the tensors to store in scratch buffer
std::vector<const ITensor *> scratch_inputs;
- if(!lstm_params.has_cifg_opt())
+ if (!lstm_params.has_cifg_opt())
{
scratch_inputs.emplace_back(input_gate_out);
}
@@ -373,29 +485,38 @@ void NELSTMLayer::configure(const ITensor *input,
output_gate_out->allocator()->allocate();
}
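
The scratch buffer concatenated at the end of configure() packs the gate tensors along X, with the input gate omitted under CIFG; this is what the validate() check below on cell_bias->dimension(0) * 4 versus * 3 enforces. Sketch of the expected width (illustrative):

// Expected scratch_buffer width, matching the validate() check:
// 4 * num_cells without CIFG, 3 * num_cells with it (input gate omitted).
inline unsigned int scratch_width(unsigned int num_cells, bool has_cifg)
{
    return (has_cifg ? 3u : 4u) * num_cells;
}
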
-Status NELSTMLayer::validate(const ITensorInfo *input,
- const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
- const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
- const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
- const ITensorInfo *output_state_in, const ITensorInfo *cell_state_in,
- const ITensorInfo *scratch_buffer, const ITensorInfo *output_state_out, const ITensorInfo *cell_state_out, const ITensorInfo *output,
- const LSTMParams<ITensorInfo> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold)
+Status NELSTMLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *input_to_forget_weights,
+ const ITensorInfo *input_to_cell_weights,
+ const ITensorInfo *input_to_output_weights,
+ const ITensorInfo *recurrent_to_forget_weights,
+ const ITensorInfo *recurrent_to_cell_weights,
+ const ITensorInfo *recurrent_to_output_weights,
+ const ITensorInfo *forget_gate_bias,
+ const ITensorInfo *cell_bias,
+ const ITensorInfo *output_gate_bias,
+ const ITensorInfo *output_state_in,
+ const ITensorInfo *cell_state_in,
+ const ITensorInfo *scratch_buffer,
+ const ITensorInfo *output_state_out,
+ const ITensorInfo *cell_state_out,
+ const ITensorInfo *output,
+ const LSTMParams<ITensorInfo> &lstm_params,
+ const ActivationLayerInfo &activation_info,
+ float cell_threshold,
+ float projection_threshold)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input,
- input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
- recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
- forget_gate_bias, cell_bias, output_gate_bias,
- output_state_in, cell_state_in,
- scratch_buffer, output_state_out, cell_state_out, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(
+ input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights,
+ recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias,
+ output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output);
// Check data types
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input,
- input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
- recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
- forget_gate_bias, cell_bias, output_gate_bias,
- output_state_in, cell_state_in,
- scratch_buffer, output_state_out, cell_state_out, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(
+ input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights,
+ recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias,
+ output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output);
// Check dimensions
ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2);
@@ -414,16 +535,16 @@ Status NELSTMLayer::validate(const ITensorInfo *input,
ARM_COMPUTE_RETURN_ERROR_ON(output_state_out->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON(cell_state_out->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 2);
- ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0)
- && cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0));
+ ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0) &&
+ cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0));
const unsigned int num_batches = input->dimension(1);
const unsigned int num_cells = input_to_output_weights->dimension(1);
- if(lstm_params.use_layer_norm())
+ if (lstm_params.use_layer_norm())
{
// If CIFG is used, input layer normalization weights tensor is omitted
- if(lstm_params.has_cifg_opt())
+ if (lstm_params.has_cifg_opt())
{
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_layer_norm_weights() != nullptr);
}
@@ -435,8 +556,12 @@ Status NELSTMLayer::validate(const ITensorInfo *input,
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.input_layer_norm_weights());
}
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(), lstm_params.cell_layer_norm_weights(), lstm_params.output_layer_norm_weights());
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.forget_layer_norm_weights(), lstm_params.cell_layer_norm_weights(), lstm_params.output_layer_norm_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(),
+ lstm_params.cell_layer_norm_weights(),
+ lstm_params.output_layer_norm_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.forget_layer_norm_weights(),
+ lstm_params.cell_layer_norm_weights(),
+ lstm_params.output_layer_norm_weights());
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_layer_norm_weights()->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_layer_norm_weights()->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_layer_norm_weights()->num_dimensions() > 1);
@@ -446,7 +571,7 @@ Status NELSTMLayer::validate(const ITensorInfo *input,
}
// Check peephole optimization
- if(lstm_params.has_peephole_opt())
+ if (lstm_params.has_peephole_opt())
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_output_weights(), lstm_params.cell_to_forget_weights());
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() > 1);
@@ -466,33 +591,39 @@ Status NELSTMLayer::validate(const ITensorInfo *input,
std::vector<const ITensorInfo *> inputs_vector;
inputs_vector.emplace_back(input);
inputs_vector.emplace_back(output_state_in);
- const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0);
+ const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0);
TensorInfo forget_gate_concat = TensorInfo(concat_shape, 1, input->data_type());
ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(inputs_vector, &forget_gate_concat, Window::DimX));
// Validate forget gate
- ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_forget_weights, (lstm_params.use_layer_norm()) ? nullptr : forget_gate_bias, &forget_gate));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(
+ input, input_to_forget_weights, (lstm_params.use_layer_norm()) ? nullptr : forget_gate_bias, &forget_gate));
- if(lstm_params.has_peephole_opt())
+ if (lstm_params.has_peephole_opt())
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
}
- if(lstm_params.use_layer_norm())
+ if (lstm_params.use_layer_norm())
{
ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&forget_gate));
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_ZERO));
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEPixelWiseMultiplication::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE));
}
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(
+ &forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
// Validate input gate
- if(!lstm_params.has_cifg_opt())
+ if (!lstm_params.has_cifg_opt())
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(),
- lstm_params.recurrent_to_input_weights(),
- lstm_params.input_gate_bias());
+ lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias());
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_to_input_weights()->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.recurrent_to_input_weights()->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_gate_bias()->num_dimensions() > 1);
@@ -500,88 +631,120 @@ Status NELSTMLayer::validate(const ITensorInfo *input,
std::vector<const ITensorInfo *> lstm_weights;
lstm_weights.emplace_back(lstm_params.input_to_input_weights());
lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights());
- TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0);
- TensorInfo lstm_gate_concat = TensorInfo(lstm_weights_concat_shape, 1, input->data_type());
+ TensorShape lstm_weights_concat_shape =
+ arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0);
+ TensorInfo lstm_gate_concat = TensorInfo(lstm_weights_concat_shape, 1, input->data_type());
ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(lstm_weights, &lstm_gate_concat, Window::DimX));
- ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, lstm_params.input_to_input_weights(), (lstm_params.use_layer_norm()) ? nullptr : lstm_params.input_gate_bias(), &input_gate));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(
+ input, lstm_params.input_to_input_weights(),
+ (lstm_params.use_layer_norm()) ? nullptr : lstm_params.input_gate_bias(), &input_gate));
- if(lstm_params.has_peephole_opt())
+ if (lstm_params.has_peephole_opt())
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights());
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_input_weights()->num_dimensions() > 1);
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE));
}
- if(lstm_params.use_layer_norm())
+ if (lstm_params.use_layer_norm())
{
ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&input_gate));
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(), &input_gate, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEPixelWiseMultiplication::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(),
+ &input_gate, ConvertPolicy::SATURATE));
}
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(
+ &input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
}
else
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticSubtraction::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEArithmeticSubtraction::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
}
// Validate cell state
- ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_cell_weights, (lstm_params.use_layer_norm()) ? nullptr : cell_bias, &cell_state_tmp));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &cell_state_tmp, 1.f, 0.f, GEMMInfo()));
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE));
- if(lstm_params.use_layer_norm())
+ ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(
+ input, input_to_cell_weights, (lstm_params.use_layer_norm()) ? nullptr : cell_bias, &cell_state_tmp));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &cell_state_tmp, 1.f, 0.f, GEMMInfo()));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE));
+ if (lstm_params.use_layer_norm())
{
ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&cell_state_tmp));
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp, 1, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_ZERO));
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp,
+ 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE));
}
ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_state_tmp, nullptr, activation_info));
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &input_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &forget_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE));
- if(cell_threshold != 0.f)
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &input_gate, &cell_state_tmp, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &forget_gate, &cell_state_tmp, 1,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE));
+ if (cell_threshold != 0.f)
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_state_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -cell_threshold,
- cell_threshold)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEActivationLayer::validate(&cell_state_tmp, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ cell_threshold, -cell_threshold)));
}
// Validate output gate tmp
std::vector<const ITensorInfo *> in_out_weights;
in_out_weights.emplace_back(input_to_output_weights);
in_out_weights.emplace_back(recurrent_to_output_weights);
- TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0);
- TensorInfo in_out_gate_concat = TensorInfo(in_out_weights_concat_shape, 1, input->data_type());
+ TensorShape in_out_weights_concat_shape =
+ arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0);
+ TensorInfo in_out_gate_concat = TensorInfo(in_out_weights_concat_shape, 1, input->data_type());
ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(in_out_weights, &in_out_gate_concat, Window::DimX));
- ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_output_weights, (lstm_params.use_layer_norm()) ? nullptr : output_gate_bias, &output_gate_tmp));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(
+ input, input_to_output_weights, (lstm_params.use_layer_norm()) ? nullptr : output_gate_bias, &output_gate_tmp));
- if(lstm_params.has_peephole_opt())
+ if (lstm_params.has_peephole_opt())
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_ZERO));
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp,
+ 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp,
+ ConvertPolicy::SATURATE));
}
- if(lstm_params.use_layer_norm())
+ if (lstm_params.use_layer_norm())
{
ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&output_gate_tmp));
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&output_gate_tmp, lstm_params.output_layer_norm_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_ZERO));
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, output_gate_bias, &output_gate_tmp, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEPixelWiseMultiplication::validate(&output_gate_tmp, lstm_params.output_layer_norm_weights(),
+ &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, output_gate_bias, &output_gate_tmp,
+ ConvertPolicy::SATURATE));
}
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(
+ &output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
// Validate output state
ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_state_tmp, &cell_state_tmp, activation_info));
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &output_gate_tmp, &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
- if(lstm_params.has_projection())
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(
+ &cell_state_tmp, &output_gate_tmp, &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ if (lstm_params.has_projection())
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(&output_gate_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out));
- if(projection_threshold != 0.f)
+ ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(&output_gate_tmp, lstm_params.projection_weights(),
+ lstm_params.projection_bias(), output_state_out));
+ if (projection_threshold != 0.f)
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output_state_out, output_state_out,
- ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold)));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(
+ output_state_out, output_state_out,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold,
+ projection_threshold)));
}
}
@@ -591,7 +754,7 @@ Status NELSTMLayer::validate(const ITensorInfo *input,
// Validate scratch concatenation
std::vector<const ITensorInfo *> inputs_vector_info_raw;
- if(!lstm_params.has_cifg_opt())
+ if (!lstm_params.has_cifg_opt())
{
inputs_vector_info_raw.push_back(&input_gate);
}
@@ -612,12 +775,12 @@ void NELSTMLayer::run()
_concat_inputs_forget_gate.run();
_fully_connected_forget_gate.run();
- if(_run_peephole_opt)
+ if (_run_peephole_opt)
{
_pixelwise_mul_forget_gate.run();
_accum_forget_gate1.run();
}
- if(_is_layer_norm_lstm)
+ if (_is_layer_norm_lstm)
{
_mean_std_norm_forget_gate.run();
_pixelwise_mul_forget_gate_coeff.run();
@@ -625,15 +788,17 @@ void NELSTMLayer::run()
}
_activation_forget_gate.run();
- if(_run_cifg_opt)
+ if (_run_cifg_opt)
{
- if(_ones.info()->data_type() == DataType::F16)
+ if (_ones.info()->data_type() == DataType::F16)
{
- std::fill_n(reinterpret_cast<half *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 1);
+ std::fill_n(reinterpret_cast<half *>(_ones.buffer()),
+ _ones.info()->total_size() / _ones.info()->element_size(), 1);
}
else
{
- std::fill_n(reinterpret_cast<float *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 1);
+ std::fill_n(reinterpret_cast<float *>(_ones.buffer()),
+ _ones.info()->total_size() / _ones.info()->element_size(), 1);
}
_subtract_input_gate.run();
}
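
The std::fill_n calls above write the literal 1 into the raw _ones buffer; the element count is total_size / element_size and the pointer is reinterpreted per data type (half for F16, float for F32). Generic sketch of the same pattern (illustrative):

#include <algorithm>
#include <cstddef>
#include <cstdint>

// Fill a raw byte buffer with T(1); count = bytes / sizeof(T), as in run().
template <typename T>
void fill_ones(std::uint8_t *buffer, std::size_t total_size_bytes)
{
    std::fill_n(reinterpret_cast<T *>(buffer), total_size_bytes / sizeof(T), T(1));
}
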
@@ -641,13 +806,13 @@ void NELSTMLayer::run()
{
_fully_connected_input_gate.run();
- if(_run_peephole_opt)
+ if (_run_peephole_opt)
{
_pixelwise_mul_input_gate.run();
_accum_input_gate1.run();
}
- if(_is_layer_norm_lstm)
+ if (_is_layer_norm_lstm)
{
_mean_std_norm_input_gate.run();
_pixelwise_mul_input_gate_coeff.run();
@@ -660,29 +825,30 @@ void NELSTMLayer::run()
_transpose_cell_state.run();
_gemm_cell_state1.run();
_accum_cell_state1.run();
- if(_is_layer_norm_lstm)
+ if (_is_layer_norm_lstm)
{
_mean_std_norm_cell_gate.run();
_pixelwise_mul_cell_gate_coeff.run();
_accum_cell_gate_bias.run();
}
+
_activation_cell_state.run();
_pixelwise_mul_cell_state1.run();
_pixelwise_mul_cell_state2.run();
_accum_cell_state2.run();
- if(_perform_cell_clipping)
+ if (_perform_cell_clipping)
{
_cell_clip.run();
}
_fully_connected_output.run();
- if(_run_peephole_opt)
+ if (_run_peephole_opt)
{
_pixelwise_mul_output_state1.run();
_accum_output1.run();
}
- if(_is_layer_norm_lstm)
+ if (_is_layer_norm_lstm)
{
_mean_std_norm_output_gate.run();
_pixelwise_mul_output_gate_coeff.run();
@@ -693,10 +859,10 @@ void NELSTMLayer::run()
_activation_output_state.run();
_pixelwise_mul_output_state2.run();
- if(_has_projection_weights)
+ if (_has_projection_weights)
{
_fully_connected_output_state.run();
- if(_perform_projection_clipping)
+ if (_perform_projection_clipping)
{
_projection_clip.run();
}
@@ -710,10 +876,10 @@ void NELSTMLayer::run()
void NELSTMLayer::prepare()
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
_concat_weights_forget_gate.run();
- if(!_run_cifg_opt)
+ if (!_run_cifg_opt)
{
_concat_weights_input_gate.run();
}
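
prepare() follows ACL's usual one-shot pattern: input-independent work, here the constant weight concatenations, runs exactly once behind the _is_prepared flag. Minimal sketch of the idiom (illustrative):

// One-shot preparation: constant-weight setup runs a single time.
struct PreparedFunction
{
    bool _is_prepared = false;
    void prepare()
    {
        if (!_is_prepared)
        {
            // e.g. run weight concatenation / transposition kernels here
            _is_prepared = true;
        }
    }
    void run()
    {
        prepare(); // cheap after the first call
        // ... per-iteration kernels ...
    }
};
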
diff --git a/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp b/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp
index 5c0f19a15c..41f9c3d700 100644
--- a/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp
+++ b/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp
@@ -24,17 +24,10 @@
#include "arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
-#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
-#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/helpers/AutoConfiguration.h"
#include <cmath>
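
The quantization infos in the next hunk's context are symmetric 16-bit fixed-point formats: with m integer bits the scale is 2^-(15 - m), so qsymm_0 at scale 1/32768 is Q0.15, and the 2^-12 output scale noted later corresponds to Q3.12. Conversion sketch (illustrative):

#include <algorithm>
#include <cmath>
#include <cstdint>

// QSYMM16 quantization: value = q * scale, q saturated to [-32768, 32767].
std::int16_t quantize_qsymm16(float v, float scale)
{
    const long q = std::lround(v / scale);
    return static_cast<std::int16_t>(std::min(32767l, std::max(-32768l, q)));
}

float dequantize_qsymm16(std::int16_t q, float scale)
{
    return static_cast<float>(q) * scale;
}
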
@@ -54,32 +47,104 @@ const QuantizationInfo qsymm_0(1.f / 32768.f, 0); // qsymm16 with 0 integer bit
NELSTMLayerQuantized::~NELSTMLayerQuantized() = default;
NELSTMLayerQuantized::NELSTMLayerQuantized(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _gemmlowp(), _output_stage(), _transpose_weights(), _concat_input_weights(), _concat_recurrent_weights(), _concat_weights(), _concat_inputs(),
- _concat_bias(), _sigmoid_forget_gate(), _sigmoid_input_gate(), _sigmoid_output_gate(), _tanh_modulation_gate(), _tanh_output_state(), _add1(), _add2(), _mul1(), _mul2(), _mul3(),
- _slice_input_tensor(), _slice_forget_tensor(), _slice_cell_tensor(), _slice_output_tensor(), _dequantize(), _quantize(), _input_to_input_weights(nullptr), _input_to_forget_weights(nullptr),
- _input_to_cell_weights(nullptr), _input_to_output_weights(nullptr), _recurrent_to_input_weights(nullptr), _recurrent_to_forget_weights(nullptr), _recurrent_to_cell_weights(nullptr),
- _recurrent_to_output_weights(nullptr), _input_gate_bias(nullptr), _forget_gate_bias(nullptr), _cell_bias(nullptr), _output_gate_bias(nullptr), _recurrent_weights(), _input_weights(), _weights(),
- _input(), _weights_transposed(), _output_highp(), _output_lowp(), _bias(), _forget_gate_input(), _input_gate_input(), _output_gate_input(), _input_modulation_gate_input(), _forget_gate_output(),
- _input_gate_output(), _output_gate_output(), _input_modulation_gate_output(), _cell_state1(), _cell_state2(), _output_state_tmp(), _output_state_out_symm(), _output_state_out_f32(),
+ : _memory_group(std::move(memory_manager)),
+ _gemmlowp(),
+ _output_stage(),
+ _transpose_weights(),
+ _concat_input_weights(),
+ _concat_recurrent_weights(),
+ _concat_weights(),
+ _concat_inputs(),
+ _concat_bias(),
+ _sigmoid_forget_gate(),
+ _sigmoid_input_gate(),
+ _sigmoid_output_gate(),
+ _tanh_modulation_gate(),
+ _tanh_output_state(),
+ _add1(),
+ _add2(),
+ _mul1(),
+ _mul2(),
+ _mul3(),
+ _slice_input_tensor(),
+ _slice_forget_tensor(),
+ _slice_cell_tensor(),
+ _slice_output_tensor(),
+ _dequantize(),
+ _quantize(),
+ _input_to_input_weights(nullptr),
+ _input_to_forget_weights(nullptr),
+ _input_to_cell_weights(nullptr),
+ _input_to_output_weights(nullptr),
+ _recurrent_to_input_weights(nullptr),
+ _recurrent_to_forget_weights(nullptr),
+ _recurrent_to_cell_weights(nullptr),
+ _recurrent_to_output_weights(nullptr),
+ _input_gate_bias(nullptr),
+ _forget_gate_bias(nullptr),
+ _cell_bias(nullptr),
+ _output_gate_bias(nullptr),
+ _recurrent_weights(),
+ _input_weights(),
+ _weights(),
+ _input(),
+ _weights_transposed(),
+ _output_highp(),
+ _output_lowp(),
+ _bias(),
+ _forget_gate_input(),
+ _input_gate_input(),
+ _output_gate_input(),
+ _input_modulation_gate_input(),
+ _forget_gate_output(),
+ _input_gate_output(),
+ _output_gate_output(),
+ _input_modulation_gate_output(),
+ _cell_state1(),
+ _cell_state2(),
+ _output_state_tmp(),
+ _output_state_out_symm(),
+ _output_state_out_f32(),
_is_prepared(false)
{
}
void NELSTMLayerQuantized::configure(const ITensor *input,
- const ITensor *input_to_input_weights, const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights,
- const ITensor *recurrent_to_input_weights, const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights,
- const ITensor *input_gate_bias, const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias,
- ITensor *cell_state_in, const ITensor *output_state_in,
- ITensor *cell_state_out, ITensor *output_state_out)
+ const ITensor *input_to_input_weights,
+ const ITensor *input_to_forget_weights,
+ const ITensor *input_to_cell_weights,
+ const ITensor *input_to_output_weights,
+ const ITensor *recurrent_to_input_weights,
+ const ITensor *recurrent_to_forget_weights,
+ const ITensor *recurrent_to_cell_weights,
+ const ITensor *recurrent_to_output_weights,
+ const ITensor *input_gate_bias,
+ const ITensor *forget_gate_bias,
+ const ITensor *cell_bias,
+ const ITensor *output_gate_bias,
+ ITensor *cell_state_in,
+ const ITensor *output_state_in,
+ ITensor *cell_state_out,
+ ITensor *output_state_out)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
- recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
- input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out);
-
- ARM_COMPUTE_ERROR_THROW_ON(NELSTMLayerQuantized::validate(input->info(), input_to_input_weights->info(), input_to_forget_weights->info(), input_to_cell_weights->info(),
- input_to_output_weights->info(),
- recurrent_to_input_weights->info(), recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
- input_gate_bias->info(), forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info()));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights,
+ input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights,
+ recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias,
+ forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in,
+ cell_state_out, output_state_out);
+
+ ARM_COMPUTE_ERROR_THROW_ON(NELSTMLayerQuantized::validate(
+ input->info(), input_to_input_weights->info(), input_to_forget_weights->info(), input_to_cell_weights->info(),
+ input_to_output_weights->info(), recurrent_to_input_weights->info(), recurrent_to_forget_weights->info(),
+ recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), input_gate_bias->info(),
+ forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(),
+ output_state_in->info(), cell_state_out->info(), output_state_out->info()));
+
+ ARM_COMPUTE_LOG_PARAMS(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights,
+ input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights,
+ recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias,
+ cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out,
+ output_state_out);
const int input_size = input->info()->dimension(0);
const int batch_size = input->info()->dimension(1);
@@ -87,8 +152,10 @@ void NELSTMLayerQuantized::configure(const ITensor *input,
const QuantizationInfo qweights = input_to_input_weights->info()->quantization_info(); // Weights quantization
- auto_init_if_empty(*cell_state_out->info(), TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QSYMM16, qsymm_4));
- auto_init_if_empty(*output_state_out->info(), TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QASYMM8, qasymm));
+ auto_init_if_empty(*cell_state_out->info(),
+ TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QSYMM16, qsymm_4));
+ auto_init_if_empty(*output_state_out->info(),
+ TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QASYMM8, qasymm));
_input_to_input_weights = input_to_input_weights;
_input_to_forget_weights = input_to_forget_weights;
@@ -104,34 +171,41 @@ void NELSTMLayerQuantized::configure(const ITensor *input,
_output_gate_bias = output_gate_bias;
// Weights concatenation
- std::vector<const ITensor *> inputs_weights_vector{ input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights };
- std::vector<const ITensor *> recurrent_weights_vector{ recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights };
+ std::vector<const ITensor *> inputs_weights_vector{input_to_input_weights, input_to_forget_weights,
+ input_to_cell_weights, input_to_output_weights};
+ std::vector<const ITensor *> recurrent_weights_vector{recurrent_to_input_weights, recurrent_to_forget_weights,
+ recurrent_to_cell_weights, recurrent_to_output_weights};
- _input_weights.allocator()->init(TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
+ _input_weights.allocator()->init(
+ TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
_concat_input_weights.configure(inputs_weights_vector, &_input_weights, Window::DimY);
- _recurrent_weights.allocator()->init(TensorInfo(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
+ _recurrent_weights.allocator()->init(
+ TensorInfo(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
_concat_recurrent_weights.configure(recurrent_weights_vector, &_recurrent_weights, Window::DimY);
- std::vector<const ITensor *> weights_vector{ &_recurrent_weights, &_input_weights };
- _weights.allocator()->init(TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
+ std::vector<const ITensor *> weights_vector{&_recurrent_weights, &_input_weights};
+ _weights.allocator()->init(
+ TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
_concat_weights.configure(weights_vector, &_weights, Window::DimX);
_transpose_weights.configure(&_weights, &_weights_transposed);
// Input concatenation
- std::vector<const ITensor *> input_vector{ input, output_state_in };
+ std::vector<const ITensor *> input_vector{input, output_state_in};
_memory_group.manage(&_input);
- _input.allocator()->init(TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm));
+ _input.allocator()->init(
+ TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm));
_concat_inputs.configure(input_vector, &_input, Window::DimX);
// Bias concatenation
- std::vector<const ITensor *> bias_vector{ input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias };
+ std::vector<const ITensor *> bias_vector{input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias};
_bias.allocator()->init(TensorInfo(TensorShape(4 * output_size), 1, DataType::S32));
_concat_bias.configure(bias_vector, &_bias, Window::DimX);
// Invert the offset for gemmlowp
_input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, -qasymm.uniform().offset));
- _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset));
+ _weights_transposed.info()->set_quantization_info(
+ QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset));
// Run gemmlowp
_memory_group.manage(&_output_highp);
@@ -141,7 +215,8 @@ void NELSTMLayerQuantized::configure(const ITensor *input,
// Set the offset back
_input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset));
- _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset));
+ _weights_transposed.info()->set_quantization_info(
+ QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset));
    // multiplier = (input_scale * weights_scale) / output_scale, where output_scale = 2^(-12)
_output_lowp.allocator()->init(TensorInfo(_output_highp.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_3));
@@ -152,69 +227,91 @@ void NELSTMLayerQuantized::configure(const ITensor *input,
quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
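    // Worked numbers (illustrative assumption): with input_scale = 0.0039 and
    // weights_scale = 0.0040, multiplier = 4096 * 0.0039 * 0.0040 ~= 0.0639, and
    // calculate_quantized_multiplier() decomposes it as
    //   multiplier ~= (output_multiplier / 2^31) * 2^(-output_shift)
    // with output_multiplier in [2^30, 2^31); here roughly 1.098e9 with output_shift = 3.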
_memory_group.manage(&_output_lowp);
- _output_stage.configure(&_output_highp, &_bias, &_output_lowp, output_multiplier, output_shift);
+
+ GEMMLowpOutputStageInfo info;
+ info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+ info.gemmlowp_multiplier = output_multiplier;
+ info.gemmlowp_shift = output_shift;
+ info.output_data_type = DataType::QSYMM16;
+ _output_stage.configure(&_output_highp, &_bias, &_output_lowp, info);
_output_highp.allocator()->allocate();
_bias.allocator()->allocate();
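    // Per-element semantics of the stage configured above (illustrative sketch):
    //   out = saturate_qsymm16(rounding_high_mul(acc + bias, gemmlowp_multiplier) >> gemmlowp_shift)
    // where rounding_high_mul(a, m) ~= (int64(a) * m + 2^30) >> 31, the usual
    // QUANTIZE_DOWN_FIXEDPOINT fixed-point requantization.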
// Get the gate tensors
- if(batch_size > 1)
+ if (batch_size > 1)
{
_memory_group.manage(&_input_gate_input);
- _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, { 0, 0 }, { output_size, batch_size });
+ _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, {0, 0}, {output_size, batch_size});
_memory_group.manage(&_forget_gate_input);
- _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size });
+ _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, {output_size, 0},
+ {2 * output_size, batch_size});
_memory_group.manage(&_input_modulation_gate_input);
- _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size });
+ _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, {2 * output_size, 0},
+ {3 * output_size, batch_size});
_memory_group.manage(&_output_gate_input);
- _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size });
+ _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, {3 * output_size, 0},
+ {4 * output_size, batch_size});
_output_lowp.allocator()->allocate();
}
else
{
_memory_group.manage(&_input_gate_input);
- _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, { 0 }, { output_size });
+ _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, {0}, {output_size});
_memory_group.manage(&_forget_gate_input);
- _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, { output_size }, { 2 * output_size });
+ _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, {output_size}, {2 * output_size});
_memory_group.manage(&_input_modulation_gate_input);
- _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, { 2 * output_size }, { 3 * output_size });
+ _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, {2 * output_size},
+ {3 * output_size});
_memory_group.manage(&_output_gate_input);
- _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, { 3 * output_size }, { 4 * output_size });
+ _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, {3 * output_size}, {4 * output_size});
_output_lowp.allocator()->allocate();
}
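    // Layout recap (illustrative): the single GEMM packs the four gate
    // pre-activations along dimension 0 as [input | forget | modulation | output],
    // so gate g occupies columns [g * output_size, (g + 1) * output_size), which
    // is exactly what the slices above extract in both the batched and 1D cases.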
// Forget gate
_memory_group.manage(&_forget_gate_output);
- _forget_gate_output.allocator()->init(TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
- _sigmoid_forget_gate.configure(&_forget_gate_input, &_forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _forget_gate_output.allocator()->init(
+ TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+ _sigmoid_forget_gate.configure(&_forget_gate_input, &_forget_gate_output,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
_forget_gate_input.allocator()->allocate();
// Input gate
_memory_group.manage(&_input_gate_output);
- _input_gate_output.allocator()->init(TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
- _sigmoid_input_gate.configure(&_input_gate_input, &_input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _input_gate_output.allocator()->init(
+ TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+ _sigmoid_input_gate.configure(&_input_gate_input, &_input_gate_output,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
_input_gate_input.allocator()->allocate();
    // Input modulation gate
_memory_group.manage(&_input_modulation_gate_output);
- _input_modulation_gate_output.allocator()->init(TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
- _tanh_modulation_gate.configure(&_input_modulation_gate_input, &_input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f));
+ _input_modulation_gate_output.allocator()->init(
+ TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+ _tanh_modulation_gate.configure(&_input_modulation_gate_input, &_input_modulation_gate_output,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f));
_input_modulation_gate_input.allocator()->allocate();
// Output gate
_memory_group.manage(&_output_gate_output);
- _output_gate_output.allocator()->init(TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
- _sigmoid_output_gate.configure(&_output_gate_input, &_output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _output_gate_output.allocator()->init(
+ TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+ _sigmoid_output_gate.configure(&_output_gate_input, &_output_gate_output,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
_output_gate_input.allocator()->allocate();
// Long term memory
_memory_group.manage(&_cell_state1);
- _cell_state1.allocator()->init(TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4));
- _mul1.configure(&_forget_gate_output, cell_state_in, &_cell_state1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _cell_state1.allocator()->init(
+ TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4));
+ _mul1.configure(&_forget_gate_output, cell_state_in, &_cell_state1, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
_forget_gate_output.allocator()->allocate();
_memory_group.manage(&_cell_state2);
- _cell_state2.allocator()->init(TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4));
- _mul2.configure(&_input_gate_output, &_input_modulation_gate_output, &_cell_state2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _cell_state2.allocator()->init(
+ TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4));
+ _mul2.configure(&_input_gate_output, &_input_modulation_gate_output, &_cell_state2, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
_input_modulation_gate_output.allocator()->allocate();
_input_gate_output.allocator()->allocate();
@@ -224,18 +321,23 @@ void NELSTMLayerQuantized::configure(const ITensor *input,
// Short term memory
_memory_group.manage(&_output_state_tmp);
- _output_state_tmp.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
- _tanh_output_state.configure(cell_state_out, &_output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f));
+ _output_state_tmp.allocator()->init(
+ TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+ _tanh_output_state.configure(cell_state_out, &_output_state_tmp,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f));
_memory_group.manage(&_output_state_out_symm);
- _output_state_out_symm.allocator()->init(TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
- _mul3.configure(&_output_state_tmp, &_output_gate_output, &_output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _output_state_out_symm.allocator()->init(
+ TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+ _mul3.configure(&_output_state_tmp, &_output_gate_output, &_output_state_out_symm, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
_output_gate_output.allocator()->allocate();
_output_state_tmp.allocator()->allocate();
// Requantize the output state from QSYMM16 to QASYMM8
_memory_group.manage(&_output_state_out_f32);
- _output_state_out_f32.allocator()->init(TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32));
+ _output_state_out_f32.allocator()->init(
+ TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32));
_dequantize.configure(&_output_state_out_symm, &_output_state_out_f32);
_output_state_out_symm.allocator()->allocate();
@@ -244,15 +346,28 @@ void NELSTMLayerQuantized::configure(const ITensor *input,
}
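// Dataflow recap (illustrative, standard LSTM cell): with x_t the input and
// h_{t-1} the previous output state, the graph configured above evaluates
//   f = sigmoid(W_f [x_t, h_{t-1}] + b_f)    i = sigmoid(W_i [x_t, h_{t-1}] + b_i)
//   g = tanh(W_g [x_t, h_{t-1}] + b_g)       o = sigmoid(W_o [x_t, h_{t-1}] + b_o)
//   c_t = f * c_{t-1} + i * g                h_t = o * tanh(c_t)
// with all intermediates in QSYMM16 and h_t requantized to QASYMM8 at the end.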
Status NELSTMLayerQuantized::validate(const ITensorInfo *input,
- const ITensorInfo *input_to_input_weights, const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
- const ITensorInfo *recurrent_to_input_weights, const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
- const ITensorInfo *input_gate_bias, const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
- const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in,
- const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out)
+ const ITensorInfo *input_to_input_weights,
+ const ITensorInfo *input_to_forget_weights,
+ const ITensorInfo *input_to_cell_weights,
+ const ITensorInfo *input_to_output_weights,
+ const ITensorInfo *recurrent_to_input_weights,
+ const ITensorInfo *recurrent_to_forget_weights,
+ const ITensorInfo *recurrent_to_cell_weights,
+ const ITensorInfo *recurrent_to_output_weights,
+ const ITensorInfo *input_gate_bias,
+ const ITensorInfo *forget_gate_bias,
+ const ITensorInfo *cell_bias,
+ const ITensorInfo *output_gate_bias,
+ const ITensorInfo *cell_state_in,
+ const ITensorInfo *output_state_in,
+ const ITensorInfo *cell_state_out,
+ const ITensorInfo *output_state_out)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights,
- recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in,
- output_state_in, cell_state_out, output_state_out);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(
+ input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+ recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
+ input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out,
+ output_state_out);
const int input_size = input->dimension(0);
const int batch_size = input->dimension(1);
@@ -264,29 +379,51 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input,
ARM_COMPUTE_RETURN_ERROR_ON(input_gate_bias->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(output_state_in->num_dimensions() > 2);
- TensorInfo input_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(input_size, output_size)).set_data_type(DataType::QASYMM8));
- TensorInfo recurrent_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(output_size, output_size)).set_data_type(DataType::QASYMM8));
- TensorInfo bias_info(input_gate_bias->clone()->set_tensor_shape(TensorShape(output_size)).set_data_type(DataType::S32));
- TensorInfo output_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QASYMM8).set_quantization_info(qasymm));
- TensorInfo cell_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QSYMM16).set_quantization_info(qsymm_4));
+ TensorInfo input_weights_info(input_to_input_weights->clone()
+ ->set_tensor_shape(TensorShape(input_size, output_size))
+ .set_data_type(DataType::QASYMM8));
+ TensorInfo recurrent_weights_info(input_to_input_weights->clone()
+ ->set_tensor_shape(TensorShape(output_size, output_size))
+ .set_data_type(DataType::QASYMM8));
+ TensorInfo bias_info(
+ input_gate_bias->clone()->set_tensor_shape(TensorShape(output_size)).set_data_type(DataType::S32));
+ TensorInfo output_state_info(cell_state_in->clone()
+ ->set_tensor_shape(TensorShape(output_size, batch_size))
+ .set_data_type(DataType::QASYMM8)
+ .set_quantization_info(qasymm));
+ TensorInfo cell_state_info(cell_state_in->clone()
+ ->set_tensor_shape(TensorShape(output_size, batch_size))
+ .set_data_type(DataType::QSYMM16)
+ .set_quantization_info(qsymm_4));
// Shape checks
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input_weights_info, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&recurrent_weights_info, recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input_weights_info, input_to_input_weights, input_to_forget_weights,
+ input_to_cell_weights, input_to_output_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&recurrent_weights_info, recurrent_to_input_weights,
+ recurrent_to_forget_weights, recurrent_to_cell_weights,
+ recurrent_to_output_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias,
+ output_gate_bias);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_in);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_in);
// Data type checks
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input_weights_info, input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input_weights_info, input, input_to_input_weights,
+ input_to_forget_weights, input_to_cell_weights,
+ input_to_output_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_input_weights, recurrent_to_forget_weights,
+ recurrent_to_cell_weights, recurrent_to_output_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias,
+ output_gate_bias);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_in);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_in);
// Quantization checks
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input_weights_info, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input_weights_info, input_to_input_weights,
+ input_to_forget_weights, input_to_cell_weights,
+ input_to_output_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(recurrent_to_input_weights, recurrent_to_forget_weights,
+ recurrent_to_cell_weights, recurrent_to_output_weights);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_in);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&output_state_info, output_state_in);
@@ -308,7 +445,8 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input,
recurrent_weights_vector.emplace_back(recurrent_to_cell_weights);
recurrent_weights_vector.emplace_back(recurrent_to_output_weights);
const TensorInfo recurrent_weights(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights);
- ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(recurrent_weights_vector, &recurrent_weights, Window::DimY));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEConcatenateLayer::validate(recurrent_weights_vector, &recurrent_weights, Window::DimY));
// _concat_weights
std::vector<const ITensorInfo *> weights_vector;
@@ -318,7 +456,7 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input,
ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(weights_vector, &weights, Window::DimX));
// _transpose_weights
const TensorShape weights_transposed_shape(weights.tensor_shape()[1], weights.tensor_shape()[0]);
- TensorInfo weights_transposed = weights.clone()->set_is_resizable(true).set_tensor_shape(weights_transposed_shape);
+ TensorInfo weights_transposed = weights.clone()->set_is_resizable(true).set_tensor_shape(weights_transposed_shape);
ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(&weights, &weights_transposed));
// _concat_inputs
@@ -344,7 +482,8 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input,
// _gemmlowp
const TensorInfo output_highp(TensorShape(4 * output_size, batch_size), 1, DataType::S32);
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate(&input_concatenated, &weights_transposed, nullptr, &output_highp));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEGEMMLowpMatrixMultiplyCore::validate(&input_concatenated, &weights_transposed, nullptr, &output_highp));
// Set the offset back
input_concatenated.set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset));
@@ -355,78 +494,107 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input,
const float multiplier = 4096.f * qasymm.uniform().scale * qweights.uniform().scale;
int32_t output_multiplier = 0;
int32_t output_shift = 0;
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
// _output_stage
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::validate(&output_highp, &bias_concatenated, &output_lowp));
+ GEMMLowpOutputStageInfo info;
+ info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+ info.gemmlowp_multiplier = output_multiplier;
+ info.gemmlowp_shift = output_shift;
+ info.output_data_type = DataType::QSYMM16;
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&output_highp, &bias_concatenated, &output_lowp, info));
TensorInfo input_gate_input;
TensorInfo forget_gate_input;
TensorInfo input_modulation_gate_input;
TensorInfo output_gate_input;
- if(batch_size > 1)
+ if (batch_size > 1)
{
// _slice_input_tensor
input_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3);
- ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_gate_input, { 0, 0 }, { output_size, batch_size }));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NESlice::validate(&output_lowp, &input_gate_input, {0, 0}, {output_size, batch_size}));
// _slice_forget_tensor
forget_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3);
- ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size }));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NESlice::validate(&output_lowp, &forget_gate_input, {output_size, 0}, {2 * output_size, batch_size}));
// _slice_cell_tensor
input_modulation_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3);
- ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size }));
+ ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_modulation_gate_input, {2 * output_size, 0},
+ {3 * output_size, batch_size}));
// _slice_output_tensor
output_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3);
- ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size }));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NESlice::validate(&output_lowp, &output_gate_input, {3 * output_size, 0}, {4 * output_size, batch_size}));
}
else
{
// _slice_input_tensor
input_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3);
- ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_gate_input, { 0 }, { output_size }));
+ ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_gate_input, {0}, {output_size}));
// _slice_forget_tensor
forget_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3);
- ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &forget_gate_input, { output_size }, { 2 * output_size }));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NESlice::validate(&output_lowp, &forget_gate_input, {output_size}, {2 * output_size}));
// _slice_cell_tensor
input_modulation_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3);
- ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_modulation_gate_input, { 2 * output_size }, { 3 * output_size }));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NESlice::validate(&output_lowp, &input_modulation_gate_input, {2 * output_size}, {3 * output_size}));
// _slice_output_tensor
output_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3);
- ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &output_gate_input, { 3 * output_size }, { 4 * output_size }));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NESlice::validate(&output_lowp, &output_gate_input, {3 * output_size}, {4 * output_size}));
}
// _sigmoid_forget_gate
const TensorInfo forget_gate_output(forget_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&forget_gate_input, &forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEActivationLayer::validate(&forget_gate_input, &forget_gate_output,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
// _sigmoid_input_gate
const TensorInfo input_gate_output(input_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&input_gate_input, &input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(
+ &input_gate_input, &input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
// _tanh_modulation_gate
- const TensorInfo input_modulation_gate_output(input_modulation_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&input_modulation_gate_input, &input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)));
+ const TensorInfo input_modulation_gate_output(input_modulation_gate_input.tensor_shape(), 1, DataType::QSYMM16,
+ qsymm_0);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEActivationLayer::validate(&input_modulation_gate_input, &input_modulation_gate_output,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)));
// _sigmoid_output_gate
const TensorInfo output_gate_output(output_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&output_gate_input, &output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEActivationLayer::validate(&output_gate_input, &output_gate_output,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
// _mul_forget_gate_cell_state
const TensorInfo cell_state_tmp1(forget_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4);
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&forget_gate_output, cell_state_in, &cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(
+ &forget_gate_output, cell_state_in, &cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
// _mul_input_gate_input_mod_gate
const TensorInfo cell_state_tmp2(input_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4);
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&input_gate_output, &input_modulation_gate_output, &cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&input_gate_output, &input_modulation_gate_output,
+ &cell_state_tmp2, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO));
// _add_cell_state_tmps
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp1, &cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEArithmeticAddition::validate(&cell_state_tmp1, &cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE));
    // _tanh_output_state (tanh of the new cell state)
const TensorInfo output_state_tmp(cell_state_out->tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(cell_state_out, &output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEActivationLayer::validate(cell_state_out, &output_state_tmp,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)));
// _mul_output_state_tmp_output_gate
const TensorInfo output_state_out_symm(output_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&output_state_tmp, &output_gate_output, &output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&output_state_tmp, &output_gate_output,
+ &output_state_out_symm, 1, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO));
// _dequantize
const TensorInfo output_state_out_f32(output_state_out_symm.tensor_shape(), 1, DataType::F32);
@@ -435,14 +603,14 @@ Status NELSTMLayerQuantized::validate(const ITensorInfo *input,
// _quantize
ARM_COMPUTE_RETURN_ON_ERROR(NEQuantizationLayer::validate(&output_state_out_f32, output_state_out));
- if(cell_state_out->total_size() != 0)
+ if (cell_state_out->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_out);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_out);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_out);
}
- if(output_state_out->total_size() != 0)
+ if (output_state_out->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_out);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_out);
@@ -501,7 +669,7 @@ void NELSTMLayerQuantized::run()
void NELSTMLayerQuantized::prepare()
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
_input_weights.allocator()->allocate();
_concat_input_weights.run();
diff --git a/src/runtime/NEON/functions/NELogical.cpp b/src/runtime/NEON/functions/NELogical.cpp
index 171d84da19..0013a521d1 100644
--- a/src/runtime/NEON/functions/NELogical.cpp
+++ b/src/runtime/NEON/functions/NELogical.cpp
@@ -25,21 +25,22 @@
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NELogicalKernel.h"
namespace arm_compute
{
struct LogicalArgs
{
- std::unique_ptr<kernels::NELogicalKernel> kernel{ nullptr };
+ std::unique_ptr<kernels::NELogicalKernel> kernel{nullptr};
ITensorPack pack{};
};
struct NELogicalAnd::Impl : public LogicalArgs
{
};
-NELogicalAnd::NELogicalAnd()
- : _impl(std::make_unique<Impl>())
+NELogicalAnd::NELogicalAnd() : _impl(std::make_unique<Impl>())
{
}
NELogicalAnd::~NELogicalAnd() = default;
@@ -47,6 +48,7 @@ NELogicalAnd::~NELogicalAnd() = default;
void NELogicalAnd::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+ ARM_COMPUTE_LOG_PARAMS(input1, input2, output);
_impl->kernel = std::make_unique<kernels::NELogicalKernel>();
_impl->kernel->configure(input1->info(), input2->info(), output->info(), LogicalOperation::And);
@@ -70,8 +72,7 @@ void NELogicalAnd::run()
struct NELogicalOr::Impl : public LogicalArgs
{
};
-NELogicalOr::NELogicalOr()
- : _impl(std::make_unique<Impl>())
+NELogicalOr::NELogicalOr() : _impl(std::make_unique<Impl>())
{
}
NELogicalOr::~NELogicalOr() = default;
@@ -79,6 +80,7 @@ NELogicalOr::~NELogicalOr() = default;
void NELogicalOr::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+ ARM_COMPUTE_LOG_PARAMS(input1, input2, output);
_impl->kernel = std::make_unique<kernels::NELogicalKernel>();
_impl->kernel->configure(input1->info(), input2->info(), output->info(), LogicalOperation::Or);
@@ -102,8 +104,7 @@ void NELogicalOr::run()
struct NELogicalNot::Impl : public LogicalArgs
{
};
-NELogicalNot::NELogicalNot()
- : _impl(std::make_unique<Impl>())
+NELogicalNot::NELogicalNot() : _impl(std::make_unique<Impl>())
{
}
NELogicalNot::~NELogicalNot() = default;
@@ -111,6 +112,7 @@ NELogicalNot::~NELogicalNot() = default;
void NELogicalNot::configure(const ITensor *input, ITensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_LOG_PARAMS(input, output);
_impl->kernel = std::make_unique<kernels::NELogicalKernel>();
_impl->kernel->configure(input->info(), nullptr, output->info(), LogicalOperation::Not);
diff --git a/src/runtime/NEON/functions/NEMatMul.cpp b/src/runtime/NEON/functions/NEMatMul.cpp
new file mode 100644
index 0000000000..31898bafc4
--- /dev/null
+++ b/src/runtime/NEON/functions/NEMatMul.cpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEMatMul.h"
+
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/cpu/operators/CpuMatMul.h"
+
+namespace arm_compute
+{
+struct NEMatMul::Impl
+{
+ const ITensor *lhs{nullptr};
+ const ITensor *rhs{nullptr};
+ ITensor *output{nullptr};
+ std::unique_ptr<cpu::CpuMatMul> op{nullptr};
+ MemoryGroup memory_group{};
+ WorkspaceData<Tensor> workspace_tensors{};
+ ITensorPack run_pack{};
+};
+
+NEMatMul::NEMatMul() : _impl(std::make_unique<Impl>())
+{
+}
+
+NEMatMul::~NEMatMul() = default;
+
+void NEMatMul::configure(ITensor *lhs,
+ ITensor *rhs,
+ ITensor *output,
+ const MatMulInfo &info,
+ const CpuMatMulSettings &settings,
+ const ActivationLayerInfo &act_info)
+{
+ _impl->lhs = lhs;
+ _impl->rhs = rhs;
+ _impl->output = output;
+
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_impl->lhs, _impl->rhs, _impl->output);
+ _impl->op = std::make_unique<cpu::CpuMatMul>();
+ _impl->op->configure(lhs->info(), rhs->info(), output->info(), info, settings, act_info);
+ _impl->run_pack = {{ACL_SRC_0, lhs}, {ACL_SRC_1, rhs}, {ACL_DST, output}};
+ _impl->workspace_tensors = manage_workspace<Tensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack);
+}
+
+Status NEMatMul::validate(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *output,
+ const MatMulInfo &info,
+ const CpuMatMulSettings &settings,
+ const ActivationLayerInfo &act_info)
+{
+ return cpu::CpuMatMul::validate(lhs, rhs, output, info, settings, act_info);
+}
+
+void NEMatMul::run()
+{
+ MemoryGroupResourceScope scope_mg(_impl->memory_group);
+ _impl->op->run(_impl->run_pack);
+}
+} // namespace arm_compute
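A minimal usage sketch of the new function (illustrative: tensor allocation is omitted, and the default-constructed MatMulInfo and CpuMatMulSettings are assumptions beyond what the diff itself shows):

#include "arm_compute/runtime/NEON/functions/NEMatMul.h"
#include "arm_compute/runtime/Tensor.h"

void run_matmul(arm_compute::Tensor &lhs, arm_compute::Tensor &rhs, arm_compute::Tensor &dst)
{
    arm_compute::MatMulInfo        info{};     // assumed defaults (no transposes)
    arm_compute::CpuMatMulSettings settings{}; // assumed defaults
    arm_compute::NEMatMul          matmul;
    // configure() builds the run pack and workspace; run() executes cpu::CpuMatMul.
    matmul.configure(&lhs, &rhs, &dst, info, settings, arm_compute::ActivationLayerInfo());
    matmul.run();
}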
diff --git a/src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp b/src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp
index 656777d726..c3861afd2c 100644
--- a/src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp
+++ b/src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2021 Arm Limited.
+ * Copyright (c) 2020-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,36 +25,66 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/NEON/functions/NEFill.h"
-#include "src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h"
+#include "src/cpu/operators/CpuMaxUnpooling.h"
namespace arm_compute
{
+struct NEMaxUnpoolingLayer::Impl
+{
+ const ITensor *src{nullptr};
+ const ITensor *indices{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuMaxUnpooling> op{nullptr};
+};
+
NEMaxUnpoolingLayer::~NEMaxUnpoolingLayer() = default;
-NEMaxUnpoolingLayer::NEMaxUnpoolingLayer()
- : _fill_func(), _unpooling_layer_kernel()
+NEMaxUnpoolingLayer::NEMaxUnpoolingLayer() : _fill_func(), _impl()
{
}
-void NEMaxUnpoolingLayer::configure(ITensor *input, ITensor *indices, ITensor *output, const PoolingLayerInfo &pool_info)
+void NEMaxUnpoolingLayer::configure(ITensor *input,
+ ITensor *indices,
+ ITensor *output,
+ const PoolingLayerInfo &pool_info)
{
+ ARM_COMPUTE_LOG_PARAMS(input, indices, output, pool_info);
+
const PixelValue zero_value(0.f);
- _fill_func = std::make_unique<NEFill>();
- _unpooling_layer_kernel = std::make_unique<NEMaxUnpoolingLayerKernel>();
+ _fill_func = std::make_unique<NEFill>();
+ _impl = std::make_unique<Impl>();
+ _impl->src = input;
+ _impl->indices = indices;
+ _impl->dst = output;
+
+ _impl->op = std::make_unique<cpu::CpuMaxUnpooling>();
_fill_func->configure(output, zero_value);
- _unpooling_layer_kernel->configure(input, indices, output, pool_info);
+ _impl->op->configure(input->info(), indices->info(), output->info(), pool_info);
}
-Status NEMaxUnpoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
+Status NEMaxUnpoolingLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *indices,
+ const ITensorInfo *output,
+ const PoolingLayerInfo &pool_info)
{
- return NEMaxUnpoolingLayerKernel::validate(input, indices, output, pool_info);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, indices);
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuMaxUnpooling::validate(input, indices, output, pool_info));
+ return Status{};
}
void NEMaxUnpoolingLayer::run()
{
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC_0, _impl->src);
+ pack.add_tensor(TensorType::ACL_SRC_1, _impl->indices);
+ pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+
_fill_func->run();
- NEScheduler::get().schedule(_unpooling_layer_kernel.get(), Window::DimY);
+ _impl->op->run(pack);
}
} // namespace arm_compute
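Note on ordering (illustrative): the zero-fill runs before the unpooling operator because max-unpooling only scatters the pooled values to their recorded indices; every untouched output element must already be zero.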
diff --git a/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp b/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp
index 02de983b77..dec0dde56d 100644
--- a/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include "arm_compute/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.h"
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h"
namespace arm_compute
@@ -31,6 +32,8 @@ NEMeanStdDevNormalizationLayer::~NEMeanStdDevNormalizationLayer() = default;
void NEMeanStdDevNormalizationLayer::configure(ITensor *input, ITensor *output, float epsilon)
{
+ ARM_COMPUTE_LOG_PARAMS(input, output, epsilon);
+
auto k = std::make_unique<NEMeanStdDevNormalizationKernel>();
k->configure(input, output, epsilon);
_kernel = std::move(k);
diff --git a/src/runtime/NEON/functions/NENormalizationLayer.cpp b/src/runtime/NEON/functions/NENormalizationLayer.cpp
index 9dcb157c03..d6d2e9dc46 100644
--- a/src/runtime/NEON/functions/NENormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NENormalizationLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,6 +29,8 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NENormalizationLayerKernel.h"
namespace arm_compute
@@ -43,6 +45,7 @@ NENormalizationLayer::NENormalizationLayer(std::shared_ptr<IMemoryManager> memor
void NENormalizationLayer::configure(const ITensor *input, ITensor *output, const NormalizationLayerInfo &norm_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_LOG_PARAMS(input, output, norm_info);
TensorInfo tensor_info(input->info()->tensor_shape(), 1, input->info()->data_type());
_input_squared.allocator()->init(tensor_info);
@@ -59,13 +62,16 @@ void NENormalizationLayer::configure(const ITensor *input, ITensor *output, cons
_input_squared.allocator()->allocate();
}
-Status NENormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info)
+Status NENormalizationLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const NormalizationLayerInfo &norm_info)
{
// Perform validation step
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ON_ERROR(NENormalizationLayerKernel::validate(input, input, output, norm_info));
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(input, input, output, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(input, input, output, 1.0f, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO));
return Status{};
}
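// For reference (illustrative): the layer implements local response
// normalisation, out(x) = in(x) / (k + (alpha / n) * sum_{y in window} in(y)^2)^beta,
// which is why validation also covers the pixel-wise self-multiplication that
// produces the squared-input tensor consumed by the kernel.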
@@ -76,4 +82,4 @@ void NENormalizationLayer::run()
_multiply_f.run();
NEScheduler::get().schedule(_norm_kernel.get(), Window::DimY);
}
-} \ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEPReluLayer.cpp b/src/runtime/NEON/functions/NEPReluLayer.cpp
index a05b545e9a..963e68bac7 100644
--- a/src/runtime/NEON/functions/NEPReluLayer.cpp
+++ b/src/runtime/NEON/functions/NEPReluLayer.cpp
@@ -24,7 +24,8 @@
#include "arm_compute/runtime/NEON/functions/NEPReluLayer.h"
#include "arm_compute/core/ITensor.h"
-#include "src/runtime/cpu/operators/CpuPRelu.h"
+
+#include "src/cpu/operators/CpuPRelu.h"
namespace arm_compute
{
@@ -32,17 +33,16 @@ using OperatorType = cpu::CpuPRelu;
struct NEPReluLayer::Impl
{
- const ITensor *src_0{ nullptr };
- const ITensor *src_1{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<OperatorType> op{ nullptr };
+ const ITensor *src_0{nullptr};
+ const ITensor *src_1{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<OperatorType> op{nullptr};
};
-NEPReluLayer::NEPReluLayer()
- : _impl(std::make_unique<Impl>())
+NEPReluLayer::NEPReluLayer() : _impl(std::make_unique<Impl>())
{
}
-NEPReluLayer::NEPReluLayer(NEPReluLayer &&) = default;
+NEPReluLayer::NEPReluLayer(NEPReluLayer &&) = default;
NEPReluLayer &NEPReluLayer::operator=(NEPReluLayer &&) = default;
NEPReluLayer::~NEPReluLayer() = default;
diff --git a/src/runtime/NEON/functions/NEPadLayer.cpp b/src/runtime/NEON/functions/NEPadLayer.cpp
index 531b06de64..253566df0f 100644
--- a/src/runtime/NEON/functions/NEPadLayer.cpp
+++ b/src/runtime/NEON/functions/NEPadLayer.cpp
@@ -23,12 +23,13 @@
*/
#include "arm_compute/runtime/NEON/functions/NEPadLayer.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/NEON/kernels/NEPadLayerKernel.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/NEON/kernels/NEPadLayerKernel.h"
namespace arm_compute
{
@@ -37,9 +38,9 @@ namespace
uint32_t last_padding_dimension(const PaddingList &padding)
{
int last_padding_dim = padding.size() - 1;
- for(; last_padding_dim >= 0; --last_padding_dim)
+ for (; last_padding_dim >= 0; --last_padding_dim)
{
- if(padding[last_padding_dim].first > 0 || padding[last_padding_dim].second > 0)
+ if (padding[last_padding_dim].first > 0 || padding[last_padding_dim].second > 0)
{
break;
}
@@ -51,11 +52,22 @@ uint32_t last_padding_dimension(const PaddingList &padding)
NEPadLayer::~NEPadLayer() = default;
NEPadLayer::NEPadLayer()
- : _copy_function(), _pad_kernel(), _mode(), _padding(), _num_dimensions(0), _slice_functions(), _concat_functions(), _slice_results(), _concat_results()
+ : _copy_function(),
+ _pad_kernel(),
+ _mode(),
+ _padding(),
+ _num_dimensions(0),
+ _slice_functions(),
+ _concat_functions(),
+ _slice_results(),
+ _concat_results()
{
}
-void NEPadLayer::configure_constant_mode(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value)
+void NEPadLayer::configure_constant_mode(ITensor *input,
+ ITensor *output,
+ const PaddingList &padding,
+ const PixelValue constant_value)
{
_pad_kernel = std::make_unique<NEPadLayerKernel>();
_pad_kernel->configure(input, output, padding, constant_value, PaddingMode::CONSTANT);
@@ -84,20 +96,20 @@ void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *outpu
Coordinates ends_after{};
Coordinates strides{};
ITensor *prev = input;
- for(uint32_t i = 0; i < _num_dimensions; ++i)
+ for (uint32_t i = 0; i < _num_dimensions; ++i)
{
// Values in strides from the previous dimensions need to be set to 1 to avoid reversing again.
- if(i > 0)
+ if (i > 0)
{
strides.set(i - 1, 1);
}
- if(_padding[i].first > 0 || _padding[i].second > 0)
+ if (_padding[i].first > 0 || _padding[i].second > 0)
{
// Set the starts, ends, and strides values for the current dimension.
// Due to the bit masks passed to strided slice, the values below the current dimension in
// starts and ends will be ignored so do not need to be modified.
- if(_mode == PaddingMode::REFLECT)
+ if (_mode == PaddingMode::REFLECT)
{
starts_before.set(i, _padding[i].first);
ends_before.set(i, 0);
@@ -123,11 +135,12 @@ void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *outpu
// Reflect the input values for the padding before and after the input.
std::vector<const ITensor *> concat_vector;
- if(_padding[i].first > 0)
+ if (_padding[i].first > 0)
{
- if(i < prev->info()->num_dimensions())
+ if (i < prev->info()->num_dimensions())
{
- _slice_functions[2 * i].configure(prev, &_slice_results[2 * i], starts_before, ends_before, strides, begin_mask_before, end_mask_before);
+ _slice_functions[2 * i].configure(prev, &_slice_results[2 * i], starts_before, ends_before, strides,
+ begin_mask_before, end_mask_before);
concat_vector.emplace_back(&_slice_results[2 * i]);
}
else
@@ -137,11 +150,12 @@ void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *outpu
}
}
concat_vector.push_back(prev);
- if(_padding[i].second > 0)
+ if (_padding[i].second > 0)
{
- if(i < prev->info()->num_dimensions())
+ if (i < prev->info()->num_dimensions())
{
- _slice_functions[2 * i + 1].configure(prev, &_slice_results[2 * i + 1], starts_after, ends_after, strides, begin_mask_after, end_mask_after);
+ _slice_functions[2 * i + 1].configure(prev, &_slice_results[2 * i + 1], starts_after, ends_after,
+ strides, begin_mask_after, end_mask_after);
concat_vector.emplace_back(&_slice_results[2 * i + 1]);
}
else
@@ -152,8 +166,13 @@ void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *outpu
}
// Concatenate the padding before and after with the input.
ITensor *out = (i == _num_dimensions - 1) ? output : &_concat_results[i];
+ out->info()->set_quantization_info(output->info()->quantization_info());
+ for (auto &v : concat_vector)
+ {
+ v->info()->set_quantization_info(input->info()->quantization_info());
+ }
_concat_functions[i].configure(concat_vector, out, i);
- if(i != _num_dimensions - 1)
+ if (i != _num_dimensions - 1)
{
_concat_results[i].allocator()->allocate();
}
@@ -164,22 +183,28 @@ void NEPadLayer::configure_reflect_symmetric_mode(ITensor *input, ITensor *outpu
}
}
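// Worked example (illustrative): REFLECT padding {2, 1} applied to the row
// [a b c d] slices [c b] before and [c] after (the edge element is excluded),
// concatenating to [c b a b c d c]; SYMMETRIC shifts the slice window to
// include the edge, producing [b a a b c d d].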
-void NEPadLayer::configure(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value, const PaddingMode mode)
+void NEPadLayer::configure(ITensor *input,
+ ITensor *output,
+ const PaddingList &padding,
+ const PixelValue constant_value,
+ const PaddingMode mode)
{
ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), padding, constant_value, mode));
+ ARM_COMPUTE_LOG_PARAMS(input, output, padding, constant_value, mode);
_padding = padding;
_mode = mode;
- const TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), _padding);
+ const TensorShape padded_shape =
+ misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), _padding);
auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(padded_shape));
// Find the last dimension requiring padding so that it is known when to write to output and whether any padding is applied.
_num_dimensions = last_padding_dimension(padding) + 1;
- if(_num_dimensions > 0)
+ if (_num_dimensions > 0)
{
- switch(_mode)
+ switch (_mode)
{
case PaddingMode::CONSTANT:
{
@@ -203,19 +228,23 @@ void NEPadLayer::configure(ITensor *input, ITensor *output, const PaddingList &p
}
}
-Status NEPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, const PixelValue constant_value, const PaddingMode mode)
+Status NEPadLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PaddingList &padding,
+ const PixelValue constant_value,
+ const PaddingMode mode)
{
ARM_COMPUTE_UNUSED(constant_value);
const TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding);
- if(output->total_size() > 0)
+ if (output->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), padded_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
- switch(mode)
+ switch (mode)
{
case PaddingMode::CONSTANT:
{
@@ -224,9 +253,9 @@ Status NEPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output,
case PaddingMode::REFLECT:
case PaddingMode::SYMMETRIC:
{
- for(uint32_t i = 0; i < padding.size(); ++i)
+ for (uint32_t i = 0; i < padding.size(); ++i)
{
- if(mode == PaddingMode::REFLECT)
+ if (mode == PaddingMode::REFLECT)
{
ARM_COMPUTE_RETURN_ERROR_ON(padding[i].first >= input->dimension(i));
ARM_COMPUTE_RETURN_ERROR_ON(padding[i].second >= input->dimension(i));
@@ -249,9 +278,9 @@ Status NEPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output,
void NEPadLayer::run()
{
- if(_num_dimensions > 0)
+ if (_num_dimensions > 0)
{
- switch(_mode)
+ switch (_mode)
{
case PaddingMode::CONSTANT:
{
@@ -261,15 +290,15 @@ void NEPadLayer::run()
case PaddingMode::REFLECT:
case PaddingMode::SYMMETRIC:
{
- for(uint32_t i = 0; i < _num_dimensions; ++i)
+ for (uint32_t i = 0; i < _num_dimensions; ++i)
{
- if(_padding[i].first > 0 || _padding[i].second > 0)
+ if (_padding[i].first > 0 || _padding[i].second > 0)
{
- if(_padding[i].first > 0 && _slice_results[2 * i].info()->total_size() > 0)
+ if (_padding[i].first > 0 && _slice_results[2 * i].info()->total_size() > 0)
{
_slice_functions[2 * i].run();
}
- if(_padding[i].second > 0 && _slice_results[2 * i + 1].info()->total_size() > 0)
+ if (_padding[i].second > 0 && _slice_results[2 * i + 1].info()->total_size() > 0)
{
_slice_functions[2 * i + 1].run();
}
diff --git a/src/runtime/NEON/functions/NEPermute.cpp b/src/runtime/NEON/functions/NEPermute.cpp
index f707fad757..80cd04ce6c 100644
--- a/src/runtime/NEON/functions/NEPermute.cpp
+++ b/src/runtime/NEON/functions/NEPermute.cpp
@@ -24,19 +24,19 @@
#include "arm_compute/runtime/NEON/functions/NEPermute.h"
#include "arm_compute/core/Validate.h"
-#include "src/runtime/cpu/operators/CpuPermute.h"
+
+#include "src/cpu/operators/CpuPermute.h"
namespace arm_compute
{
struct NEPermute::Impl
{
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuPermute> op{ nullptr };
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuPermute> op{nullptr};
};
-NEPermute::NEPermute()
- : _impl(std::make_unique<Impl>())
+NEPermute::NEPermute() : _impl(std::make_unique<Impl>())
{
}
diff --git a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
index 3a2f1984b4..97155a9e74 100644
--- a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
+++ b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
@@ -24,7 +24,8 @@
#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
#include "arm_compute/core/ITensor.h"
-#include "src/runtime/cpu/operators/CpuMul.h"
+
+#include "src/cpu/operators/CpuMul.h"
#include <utility>
@@ -32,32 +33,42 @@ namespace arm_compute
{
struct NEPixelWiseMultiplication::Impl
{
- const ITensor *src_0{ nullptr };
- const ITensor *src_1{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuMul> op{ nullptr };
+ const ITensor *src_0{nullptr};
+ const ITensor *src_1{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuMul> op{nullptr};
};
-NEPixelWiseMultiplication::NEPixelWiseMultiplication()
- : _impl(std::make_unique<Impl>())
+NEPixelWiseMultiplication::NEPixelWiseMultiplication() : _impl(std::make_unique<Impl>())
{
}
NEPixelWiseMultiplication::~NEPixelWiseMultiplication() = default;
-Status NEPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+Status NEPixelWiseMultiplication::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
const ActivationLayerInfo &act_info)
{
return cpu::CpuMul::validate(input1, input2, output, scale, overflow_policy, rounding_policy, act_info);
}
-void NEPixelWiseMultiplication::configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+void NEPixelWiseMultiplication::configure(const ITensor *input1,
+ const ITensor *input2,
+ ITensor *output,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
const ActivationLayerInfo &act_info)
{
_impl->src_0 = input1;
_impl->src_1 = input2;
_impl->dst = output;
_impl->op = std::make_unique<cpu::CpuMul>();
- _impl->op->configure(input1->info(), input2->info(), output->info(), scale, overflow_policy, rounding_policy, act_info);
+ _impl->op->configure(input1->info(), input2->info(), output->info(), scale, overflow_policy, rounding_policy,
+ act_info);
}
void NEPixelWiseMultiplication::run()
@@ -71,24 +82,29 @@ void NEPixelWiseMultiplication::run()
struct NEComplexPixelWiseMultiplication::Impl
{
- ITensor *src_0{ nullptr };
- ITensor *src_1{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuComplexMul> op{ nullptr };
+ ITensor *src_0{nullptr};
+ ITensor *src_1{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuComplexMul> op{nullptr};
};
-NEComplexPixelWiseMultiplication::NEComplexPixelWiseMultiplication()
- : _impl(std::make_unique<Impl>())
+NEComplexPixelWiseMultiplication::NEComplexPixelWiseMultiplication() : _impl(std::make_unique<Impl>())
{
}
NEComplexPixelWiseMultiplication::~NEComplexPixelWiseMultiplication() = default;
-Status NEComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status NEComplexPixelWiseMultiplication::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info)
{
return cpu::CpuComplexMul::validate(input1, input2, output, act_info);
}
-void NEComplexPixelWiseMultiplication::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
+void NEComplexPixelWiseMultiplication::configure(ITensor *input1,
+ ITensor *input2,
+ ITensor *output,
+ const ActivationLayerInfo &act_info)
{
_impl->src_0 = input1;
_impl->src_1 = input2;
diff --git a/src/runtime/NEON/functions/NEPooling3dLayer.cpp b/src/runtime/NEON/functions/NEPooling3dLayer.cpp
new file mode 100644
index 0000000000..e017e8c21d
--- /dev/null
+++ b/src/runtime/NEON/functions/NEPooling3dLayer.cpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEPooling3dLayer.h"
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/cpu/operators/CpuPool3d.h"
+
+namespace arm_compute
+{
+struct NEPooling3dLayer::Impl
+{
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuPool3d> op{nullptr};
+ MemoryGroup memory_group{};
+ ITensorPack run_pack{};
+ WorkspaceData<Tensor> workspace_tensors{};
+};
+
+NEPooling3dLayer::~NEPooling3dLayer() = default;
+
+NEPooling3dLayer::NEPooling3dLayer(std::shared_ptr<IMemoryManager> memory_manager) : _impl(std::make_unique<Impl>())
+{
+ _impl->memory_group = MemoryGroup(std::move(memory_manager));
+}
+
+void NEPooling3dLayer::configure(const ITensor *input, ITensor *output, const Pooling3dLayerInfo &pool_info)
+{
+ _impl->src = input;
+ _impl->dst = output;
+ _impl->op = std::make_unique<cpu::CpuPool3d>();
+ _impl->op->configure(input->info(), output->info(), pool_info);
+
+ _impl->run_pack = {{TensorType::ACL_SRC, _impl->src}, {TensorType::ACL_DST_0, _impl->dst}};
+ _impl->workspace_tensors = manage_workspace<Tensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack);
+}
+
+Status
+NEPooling3dLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const Pooling3dLayerInfo &pool_info)
+{
+ return cpu::CpuPool3d::validate(input, output, pool_info);
+}
+
+void NEPooling3dLayer::run()
+{
+ MemoryGroupResourceScope scope_mg(_impl->memory_group);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(_impl->src, _impl->dst);
+ _impl->op->run(_impl->run_pack);
+}
+
+} // namespace arm_compute
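For readers of this new file, a minimal usage sketch of the public API it adds. This is not part of the patch: the shapes, the NDHWC layout, and the global-pooling Pooling3dLayerInfo overload are illustrative assumptions.

#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEPooling3dLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void pooling3d_usage_sketch()
{
    // NDHWC source: 16 channels, 8x8 spatial, depth 4, batch 1.
    Tensor src{}, dst{};
    src.allocator()->init(
        TensorInfo(TensorShape(16U, 8U, 8U, 4U, 1U), 1, DataType::F32).set_data_layout(DataLayout::NDHWC));
    dst.allocator()->init(
        TensorInfo(TensorShape(16U, 1U, 1U, 1U, 1U), 1, DataType::F32).set_data_layout(DataLayout::NDHWC));

    // Assumed global-pooling constructor, mirroring the 2D PoolingLayerInfo.
    const Pooling3dLayerInfo pool_info(PoolingType::AVG);

    NEPooling3dLayer pool{};
    ARM_COMPUTE_ERROR_THROW_ON(NEPooling3dLayer::validate(src.info(), dst.info(), pool_info));
    pool.configure(&src, &dst, pool_info);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    pool.run(); // dispatches cpu::CpuPool3d through the ITensorPack built in configure()
}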
diff --git a/src/runtime/NEON/functions/NEPoolingLayer.cpp b/src/runtime/NEON/functions/NEPoolingLayer.cpp
index 8d267a32c0..eb9125be3c 100644
--- a/src/runtime/NEON/functions/NEPoolingLayer.cpp
+++ b/src/runtime/NEON/functions/NEPoolingLayer.cpp
@@ -26,17 +26,18 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/Tensor.h"
+
#include "src/core/helpers/MemoryHelpers.h"
-#include "src/runtime/cpu/operators/CpuPool2d.h"
+#include "src/cpu/operators/CpuPool2d.h"
namespace arm_compute
{
struct NEPoolingLayer::Impl
{
- ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- ITensor *indices{ nullptr };
- std::unique_ptr<cpu::CpuPool2d> op{ nullptr };
+ ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ ITensor *indices{nullptr};
+ std::unique_ptr<cpu::CpuPool2d> op{nullptr};
MemoryGroup memory_group{};
ITensorPack run_pack{};
WorkspaceData<Tensor> workspace_tensors{};
@@ -44,8 +45,7 @@ struct NEPoolingLayer::Impl
NEPoolingLayer::~NEPoolingLayer() = default;
-NEPoolingLayer::NEPoolingLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _impl(std::make_unique<Impl>())
+NEPoolingLayer::NEPoolingLayer(std::shared_ptr<IMemoryManager> memory_manager) : _impl(std::make_unique<Impl>())
{
_impl->memory_group = MemoryGroup(std::move(memory_manager));
}
@@ -58,11 +58,16 @@ void NEPoolingLayer::configure(ITensor *input, ITensor *output, const PoolingLay
_impl->op = std::make_unique<cpu::CpuPool2d>();
_impl->op->configure(input->info(), output->info(), pool_info, (indices) ? indices->info() : nullptr);
- _impl->run_pack = { { TensorType::ACL_SRC, _impl->src }, { TensorType::ACL_DST_0, _impl->dst }, { TensorType::ACL_DST_1, _impl->indices } };
+ _impl->run_pack = {{TensorType::ACL_SRC, _impl->src},
+ {TensorType::ACL_DST_0, _impl->dst},
+ {TensorType::ACL_DST_1, _impl->indices}};
_impl->workspace_tensors = manage_workspace<Tensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack);
}
-Status NEPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
+Status NEPoolingLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PoolingLayerInfo &pool_info,
+ const ITensorInfo *indices)
{
return cpu::CpuPool2d::validate(input, output, pool_info, indices);
}
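As a companion to the reflowed configure() above, a usage sketch (not from this patch) of the optional indices output that populates ACL_DST_1. Shapes and pool settings are illustrative; indices are only produced for 2x2 max pooling.

#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void pooling_with_indices_sketch()
{
    // 4x4 single-channel NCHW input, 2x2 max pooling with stride 2.
    Tensor src{}, dst{}, indices{};
    src.allocator()->init(TensorInfo(TensorShape(4U, 4U, 1U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(2U, 2U, 1U), 1, DataType::F32));
    indices.allocator()->init(TensorInfo(TensorShape(2U, 2U, 1U), 1, DataType::U32));

    const PoolingLayerInfo pool_info(PoolingType::MAX, 2, DataLayout::NCHW, PadStrideInfo(2, 2, 0, 0));

    NEPoolingLayer pool{}; // default (null) memory manager
    ARM_COMPUTE_ERROR_THROW_ON(NEPoolingLayer::validate(src.info(), dst.info(), pool_info, indices.info()));
    pool.configure(&src, &dst, pool_info, &indices);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    indices.allocator()->allocate();
    pool.run(); // indices then hold the position of each pooled maximum
}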
diff --git a/src/runtime/NEON/functions/NEPriorBoxLayer.cpp b/src/runtime/NEON/functions/NEPriorBoxLayer.cpp
index 0c71706586..dbb6bf9df1 100644
--- a/src/runtime/NEON/functions/NEPriorBoxLayer.cpp
+++ b/src/runtime/NEON/functions/NEPriorBoxLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,21 +27,31 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEPriorBoxLayerKernel.h"
namespace arm_compute
{
-void NEPriorBoxLayer::configure(const ITensor *input1, const ITensor *input2, ITensor *output, const PriorBoxLayerInfo &info)
+void NEPriorBoxLayer::configure(const ITensor *input1,
+ const ITensor *input2,
+ ITensor *output,
+ const PriorBoxLayerInfo &info)
{
+ ARM_COMPUTE_LOG_PARAMS(input1, input2, output, info);
+
auto k = std::make_unique<NEPriorBoxLayerKernel>();
k->configure(input1, input2, output, info);
_kernel = std::move(k);
}
-Status NEPriorBoxLayer::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info)
+Status NEPriorBoxLayer::validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const PriorBoxLayerInfo &info)
{
return NEPriorBoxLayerKernel::validate(input1, input2, output, info);
}
diff --git a/src/runtime/NEON/functions/NEQLSTMLayer.cpp b/src/runtime/NEON/functions/NEQLSTMLayer.cpp
index 85d62ac058..dd78d10d16 100644
--- a/src/runtime/NEON/functions/NEQLSTMLayer.cpp
+++ b/src/runtime/NEON/functions/NEQLSTMLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,34 +23,38 @@
*/
#include "arm_compute/runtime/NEON/functions/NEQLSTMLayer.h"
+#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/QuantizationInfo.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/InfoHelpers.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
-#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
-#include "src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h"
+#include "src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h"
namespace arm_compute
{
using namespace arm_compute::utils::info_helpers;
namespace
{
-Status validate_mm(GEMMLowpOutputStageInfo &gemmlowp_info, const ITensorInfo *mm_input, const ITensorInfo *mm_weights, const ITensorInfo *bias,
- float gemmlowp_scale, const TensorInfo *mm_res_info, const TensorInfo *outstage_tensor_info)
+Status validate_mm(GEMMLowpOutputStageInfo &gemmlowp_info,
+ const ITensorInfo *mm_input,
+ const ITensorInfo *mm_weights,
+ const ITensorInfo *bias,
+ float gemmlowp_scale,
+ const TensorInfo *mm_res_info,
+ const TensorInfo *outstage_tensor_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate(mm_input, mm_weights, nullptr, mm_res_info));
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(mm_res_info, bias, outstage_tensor_info, gemmlowp_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(
+ gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEGEMMLowpOutputStage::validate(mm_res_info, bias, outstage_tensor_info, gemmlowp_info));
return Status{};
}
} // namespace
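The reflowed calls above feed a float rescale factor into quantization::calculate_quantized_multiplier, which splits it into the gemmlowp_multiplier/gemmlowp_shift pair consumed by NEGEMMLowpOutputStage. A standalone sketch of that decomposition (illustrative, not the library code): scale ~= multiplier * 2^(-31) * 2^(-shift), with the multiplier a Q0.31 value in [2^30, 2^31).

#include <cmath>
#include <cstdint>
#include <cstdio>

// Sketch of the multiplier/shift decomposition; function name is ours.
void decompose_scale(float scale, int32_t *multiplier, int32_t *shift)
{
    int   exponent = 0;
    float mantissa = std::frexp(scale, &exponent); // scale = mantissa * 2^exponent, mantissa in [0.5, 1)
    // Round the mantissa to a Q0.31 fixed-point integer in [2^30, 2^31].
    auto q = static_cast<int64_t>(std::round(static_cast<double>(mantissa) * (1ll << 31)));
    if (q == (1ll << 31))
    {
        q /= 2;
        ++exponent;
    }
    *multiplier = static_cast<int32_t>(q);
    *shift      = -exponent; // positive => right shift, negative => left shift
}

int main()
{
    int32_t m = 0, s = 0;
    decompose_scale(0.00784313f, &m, &s); // e.g. an int8 input*weight rescale
    std::printf("multiplier=%d shift=%d\n", m, s);
    return 0;
}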
@@ -59,10 +63,7 @@ Status NEQLSTMLayer::validate_layer_norm(const ITensorInfo &in, const ITensorInf
{
// Output quantization scale will be different, but ignored here
// since it will be configured at configure() stage.
- const TensorInfo out
- {
- in
- };
+ const TensorInfo out{in};
return NEQLSTMLayerNormalizationKernel::validate(&in, &out, &weight, &bias);
}
@@ -92,6 +93,8 @@ Status NEQLSTMLayer::TensorCopyKernel::validate(const ITensorInfo &src, const IT
void NEQLSTMLayer::TensorCopyKernel::configure(ITensor &src, ITensor &dst)
{
ARM_COMPUTE_ERROR_THROW_ON(NEQLSTMLayer::TensorCopyKernel::validate(*src.info(), *dst.info()));
+ ARM_COMPUTE_LOG_PARAMS(src, dst);
+
_src = &src;
_dst = &dst;
_row_size = std::min(_src->info()->tensor_shape().x(), _dst->info()->tensor_shape().x());
@@ -100,39 +103,108 @@ void NEQLSTMLayer::TensorCopyKernel::configure(ITensor &src, ITensor &dst)
void NEQLSTMLayer::TensorCopyKernel::run()
{
- Iterator input_iter{ _src, _window };
- Iterator output_iter{ _dst, _window };
+ Iterator input_iter{_src, _window};
+ Iterator output_iter{_dst, _window};
- execute_window_loop(_window, [&](const Coordinates &)
- {
- memcpy(output_iter.ptr(), input_iter.ptr(), _row_size);
- },
- input_iter, output_iter);
+ execute_window_loop(
+ _window, [&](const Coordinates &) { memcpy(output_iter.ptr(), input_iter.ptr(), _row_size); }, input_iter,
+ output_iter);
}
NEQLSTMLayer::~NEQLSTMLayer() = default;
NEQLSTMLayer::NEQLSTMLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(), _transpose_input_to_forget_weights(), _transpose_input_to_cell_weights(), _transpose_input_to_output_weights(), _transpose_input_to_input_weights(),
- _transpose_recurrent_to_forget_weights(), _transpose_recurrent_to_cell_weights(), _transpose_recurrent_to_output_weights(), _transpose_recurrent_to_input_weights(), _transpose_projection_weights(),
- _input_to_input_reduction(), _recurrent_to_input_reduction(), _input_to_forget_reduction(), _recurrent_to_forget_reduction(), _input_to_cell_reduction(), _recurrent_to_cell_reduction(),
- _input_to_output_reduction(), _recurrent_to_output_reduction(), _projection_reduction(), _projection_bias_add(), _mm_input_to_forget(), _mm_recurrent_to_forget(), _pixelwise_mul_cell_to_forget(),
- _input_to_forget_outstage(), _recurrent_to_forget_outstage(), _cell_to_forget_outstage(), _accumulate_input_recurrent_forget(), _accumulate_cell_forget(), _forget_gate_sigmoid(), _mm_input_to_cell(),
- _input_to_cell_outstage(), _mm_recurrent_to_cell(), _recurrent_to_cell_outstage(), _accumulate_input_recurrent_modulation(), _cell_gate_tanh(), _input_gate_sub(), _mm_input_to_input(),
- _input_to_input_outstage(), _mm_recurrent_to_input(), _recurrent_to_input_outstage(), _accumulate_input_recurrent_input(), _pixelwise_mul_cell_to_input(), _cell_to_input_outstage(),
- _accumulate_cell_input(), _input_gate_sigmoid(), _pixelwise_mul_forget_cell(), _pixelwise_mul_input_cell(), _add_forget_cell(), _cell_clip(), _mm_input_to_output(), _input_to_output_outstage(),
- _mm_recurrent_to_output(), _recurrent_to_output_outstage(), _accumulate_input_recurrent_output(), _pixelwise_mul_cell_to_output(), _cell_to_output_outstage(), _accumulate_cell_to_output(),
- _output_gate_sigmoid(), _hidden_tanh(), _pixelwise_mul_hidden(), _hidden_outstage(), _mm_projection(), _projection_outstage(), _accumulate_projection(), _projection_clip(), _projection_bias_copy(),
- _projection_output_to_accumulate_copy(), _projection_accumulate_to_output_copy(), _hidden_to_output_copy(), _layer_norms(), _copy_output(), _layer_norm_weights(), _layer_norm_bias(),
+ : _memory_group(),
+ _dequantize_input_to_forget_weights(),
+ _quantize_input_to_forget_weights(),
+ _transpose_input_to_forget_weights(),
+ _transpose_input_to_cell_weights(),
+ _transpose_input_to_output_weights(),
+ _transpose_input_to_input_weights(),
+ _transpose_recurrent_to_forget_weights(),
+ _transpose_recurrent_to_cell_weights(),
+ _transpose_recurrent_to_output_weights(),
+ _transpose_recurrent_to_input_weights(),
+ _transpose_projection_weights(),
+ _input_to_input_reduction(),
+ _recurrent_to_input_reduction(),
+ _input_to_forget_reduction(),
+ _recurrent_to_forget_reduction(),
+ _input_to_cell_reduction(),
+ _recurrent_to_cell_reduction(),
+ _input_to_output_reduction(),
+ _recurrent_to_output_reduction(),
+ _projection_reduction(),
+ _projection_bias_add(),
+ _mm_input_to_forget(),
+ _mm_recurrent_to_forget(),
+ _pixelwise_mul_cell_to_forget(),
+ _input_to_forget_outstage(),
+ _recurrent_to_forget_outstage(),
+ _cell_to_forget_outstage(),
+ _accumulate_input_recurrent_forget(),
+ _accumulate_cell_forget(),
+ _forget_gate_sigmoid(),
+ _mm_input_to_cell(),
+ _input_to_cell_outstage(),
+ _mm_recurrent_to_cell(),
+ _recurrent_to_cell_outstage(),
+ _accumulate_input_recurrent_modulation(),
+ _cell_gate_tanh(),
+ _input_gate_sub(),
+ _mm_input_to_input(),
+ _input_to_input_outstage(),
+ _mm_recurrent_to_input(),
+ _recurrent_to_input_outstage(),
+ _accumulate_input_recurrent_input(),
+ _pixelwise_mul_cell_to_input(),
+ _cell_to_input_outstage(),
+ _accumulate_cell_input(),
+ _input_gate_sigmoid(),
+ _pixelwise_mul_forget_cell(),
+ _pixelwise_mul_input_cell(),
+ _add_forget_cell(),
+ _cell_clip(),
+ _mm_input_to_output(),
+ _input_to_output_outstage(),
+ _mm_recurrent_to_output(),
+ _recurrent_to_output_outstage(),
+ _accumulate_input_recurrent_output(),
+ _pixelwise_mul_cell_to_output(),
+ _cell_to_output_outstage(),
+ _accumulate_cell_to_output(),
+ _output_gate_sigmoid(),
+ _hidden_tanh(),
+ _pixelwise_mul_hidden(),
+ _hidden_outstage(),
+ _mm_projection(),
+ _projection_outstage(),
+ _accumulate_projection(),
+ _projection_clip(),
+ _projection_bias_copy(),
+ _projection_output_to_accumulate_copy(),
+ _projection_accumulate_to_output_copy(),
+ _hidden_to_output_copy(),
+ _layer_norms(),
+ _copy_output(),
+ _layer_norm_weights(),
+ _layer_norm_bias(),
_layer_norm_output()
{
_memory_group = MemoryGroup(std::move(memory_manager));
}
-void NEQLSTMLayer::configure_mm(NEGEMMLowpMatrixMultiplyCore &mm, NEGEMMLowpOutputStage &outstage, GEMMLowpOutputStageInfo &gemmlowp_info,
- const ITensor *mm_input, const ITensor *mm_weights, const ITensor *bias,
- Tensor *mm_res, Tensor *outstage_res, float gemmlowp_scale,
- const TensorInfo &mm_res_info, const TensorInfo &outstage_tensor_info)
+void NEQLSTMLayer::configure_mm(NEGEMMLowpMatrixMultiplyCore &mm,
+ NEGEMMLowpOutputStage &outstage,
+ GEMMLowpOutputStageInfo &gemmlowp_info,
+ const ITensor *mm_input,
+ const ITensor *mm_weights,
+ const ITensor *bias,
+ Tensor *mm_res,
+ Tensor *outstage_res,
+ float gemmlowp_scale,
+ const TensorInfo &mm_res_info,
+ const TensorInfo &outstage_tensor_info)
{
_memory_group.manage(mm_res);
_memory_group.manage(outstage_res);
@@ -144,33 +216,88 @@ void NEQLSTMLayer::configure_mm(NEGEMMLowpMatrixMultiplyCore &mm, NEGEMMLowpOutp
mm.configure(mm_input, mm_weights, nullptr, mm_res);
// Configure output stage
- quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);
+ quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier,
+ &gemmlowp_info.gemmlowp_shift);
outstage.configure(mm_res, bias, outstage_res, gemmlowp_info);
mm_res->allocator()->allocate();
}
-void NEQLSTMLayer::configure(const ITensor *input,
- const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights,
- const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights,
- const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias,
- const ITensor *cell_state_in, ITensor *output_state_in,
- ITensor *cell_state_out, ITensor *output_state_out, ITensor *output,
+void NEQLSTMLayer::configure(const ITensor *input,
+ const ITensor *input_to_forget_weights,
+ const ITensor *input_to_cell_weights,
+ const ITensor *input_to_output_weights,
+ const ITensor *recurrent_to_forget_weights,
+ const ITensor *recurrent_to_cell_weights,
+ const ITensor *recurrent_to_output_weights,
+ const ITensor *forget_gate_bias,
+ const ITensor *cell_bias,
+ const ITensor *output_gate_bias,
+ const ITensor *cell_state_in,
+ ITensor *output_state_in,
+ ITensor *cell_state_out,
+ ITensor *output_state_out,
+ ITensor *output,
const LSTMParams<ITensor> &lstm_params)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
- forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out);
+ forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in,
+ cell_state_out, output_state_out);
+
+ ARM_COMPUTE_LOG_PARAMS(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+ recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
+ forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in,
+ cell_state_out, output_state_out);
// Set lstm parameters
LSTMParams<ITensorInfo> lstm_params_info{};
build_lstm_params_tensor_info(lstm_params, &lstm_params_info);
- // Validate
- ARM_COMPUTE_ERROR_THROW_ON(NEQLSTMLayer::validate(input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(),
- recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
- forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(),
- cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(), output->info(),
- lstm_params_info));
+ _input_to_forget_weights_transposed.info()->set_quantization_info(
+ input_to_forget_weights->info()->quantization_info());
+ _input_to_cell_weights_transposed.info()->set_quantization_info(input_to_cell_weights->info()->quantization_info());
+ _input_to_output_weights_transposed.info()->set_quantization_info(
+ input_to_output_weights->info()->quantization_info());
+ _recurrent_to_forget_weights_transposed.info()->set_quantization_info(
+ recurrent_to_forget_weights->info()->quantization_info());
+ _recurrent_to_cell_weights_transposed.info()->set_quantization_info(
+ recurrent_to_cell_weights->info()->quantization_info());
+ _recurrent_to_output_weights_transposed.info()->set_quantization_info(
+ recurrent_to_output_weights->info()->quantization_info());
+
+ if (input_to_forget_weights->info()->data_type() == DataType::QASYMM8_SIGNED)
+ {
+ _convert_input_to_forget_weights_to_qsymm8 = true;
+ // Set up the dequantize output tensor to go from QASYMM8_SIGNED -> F32
+
+ _input_to_forget_weights_f32.allocator()->init(
+ TensorInfo(input_to_forget_weights->info()->tensor_shape(), 1, DataType::F32)
+ .set_data_layout(input_to_forget_weights->info()->data_layout()));
+ // Set up the quantize output tensor to go from F32 -> QSYMM8
+ _input_to_forget_weights_symm8.allocator()->init(
+ (TensorInfo(input_to_forget_weights->info()->tensor_shape(), 1, DataType::QSYMM8)
+ .set_data_layout(input_to_forget_weights->info()->data_layout())
+ .set_quantization_info(input_to_forget_weights->info()->quantization_info())));
+
+ _dequantize_input_to_forget_weights.configure(input_to_forget_weights, &_input_to_forget_weights_f32);
+ _quantize_input_to_forget_weights.configure(&_input_to_forget_weights_f32, &_input_to_forget_weights_symm8);
+
+ ARM_COMPUTE_ERROR_THROW_ON(NEQLSTMLayer::validate(
+ input->info(), _input_to_forget_weights_symm8.info(), input_to_cell_weights->info(),
+ input_to_output_weights->info(), recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(),
+ recurrent_to_output_weights->info(), forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(),
+ cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(),
+ output->info(), lstm_params_info));
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR_THROW_ON(NEQLSTMLayer::validate(
+ input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(),
+ input_to_output_weights->info(), recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(),
+ recurrent_to_output_weights->info(), forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(),
+ cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(),
+ output->info(), lstm_params_info));
+ }
const int batch_size = input->info()->dimension(1);
const int num_units = input_to_output_weights->info()->dimension(1);
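The branch added above converts QASYMM8_SIGNED input-to-forget weights to QSYMM8 before validation. A standalone sketch of that two-step path, assuming the new members are an NEDequantizationLayer/NEQuantizationLayer pair (their types are not shown in this hunk); shapes and quantization parameters are illustrative.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h"
#include "arm_compute/runtime/NEON/functions/NEQuantizationLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void convert_weights_sketch()
{
    // Asymmetric signed 8-bit weights -> F32 -> symmetric 8-bit weights.
    Tensor w_qasymm8s{}, w_f32{}, w_qsymm8{};
    w_qasymm8s.allocator()->init(
        TensorInfo(TensorShape(32U, 16U), 1, DataType::QASYMM8_SIGNED, QuantizationInfo(0.05f, 3)));
    w_f32.allocator()->init(TensorInfo(TensorShape(32U, 16U), 1, DataType::F32));
    w_qsymm8.allocator()->init(TensorInfo(TensorShape(32U, 16U), 1, DataType::QSYMM8, QuantizationInfo(0.05f)));

    NEDequantizationLayer dequantize{};
    NEQuantizationLayer   quantize{};
    dequantize.configure(&w_qasymm8s, &w_f32);
    quantize.configure(&w_f32, &w_qsymm8);

    w_qasymm8s.allocator()->allocate();
    w_f32.allocator()->allocate();
    w_qsymm8.allocator()->allocate();

    // One-off conversion, e.g. at prepare() time.
    dequantize.run();
    quantize.run();
}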
@@ -181,7 +308,9 @@ void NEQLSTMLayer::configure(const ITensor *input,
const UniformQuantizationInfo qoutput_state_in = output_state_in->info()->quantization_info().uniform();
_projection_bias = lstm_params.projection_bias();
- _input_to_forget_weights = input_to_forget_weights;
+ _input_to_forget_weights = (input_to_forget_weights->info()->data_type() == DataType::QASYMM8_SIGNED)
+ ? &_input_to_forget_weights_symm8
+ : input_to_forget_weights;
_input_to_cell_weights = input_to_cell_weights;
_input_to_output_weights = input_to_output_weights;
_recurrent_to_forget_weights = recurrent_to_forget_weights;
@@ -191,7 +320,7 @@ void NEQLSTMLayer::configure(const ITensor *input,
// Layer normalization
_has_layer_norm = lstm_params.use_layer_norm();
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
set_layer_norm_weight(lstm_params.forget_layer_norm_weights(), LayerNormGate::Forget);
set_layer_norm_weight(lstm_params.cell_layer_norm_weights(), LayerNormGate::Cell);
@@ -213,44 +342,59 @@ void NEQLSTMLayer::configure(const ITensor *input,
// Calculate quantized parameters for clipping.
int16_t quantized_cell_clip = 0;
- if(lstm_params.cell_clip() > 0.0f)
+ if (lstm_params.cell_clip() > 0.0f)
{
quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in);
}
_has_cell_clipping = quantized_cell_clip > 0;
// Precompute effective bias for optimizing the matmul computations.
- if(!_has_cifg)
+ if (!_has_cifg)
{
_input_to_input_weights = lstm_params.input_to_input_weights();
_recurrent_to_input_weights = lstm_params.recurrent_to_input_weights();
- _input_to_input_reduction = std::make_unique<NEGEMMLowpMatrixAReductionKernel>();
- _recurrent_to_input_reduction = std::make_unique<NEGEMMLowpMatrixAReductionKernel>();
- _input_to_input_reduction->configure(_input_to_input_weights, &_input_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
- _recurrent_to_input_reduction->configure(_recurrent_to_input_weights, &_recurrent_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+ _input_to_input_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>();
+ _recurrent_to_input_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>();
+ _input_to_input_reduction->configure(_input_to_input_weights->info(), _input_to_input_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+ _recurrent_to_input_reduction->configure(
+ _recurrent_to_input_weights->info(), _recurrent_to_input_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
}
- _input_to_forget_reduction = std::make_unique<NEGEMMLowpMatrixAReductionKernel>();
- _recurrent_to_forget_reduction = std::make_unique<NEGEMMLowpMatrixAReductionKernel>();
- _input_to_cell_reduction = std::make_unique<NEGEMMLowpMatrixAReductionKernel>();
- _recurrent_to_cell_reduction = std::make_unique<NEGEMMLowpMatrixAReductionKernel>();
- _input_to_output_reduction = std::make_unique<NEGEMMLowpMatrixAReductionKernel>();
- _recurrent_to_output_reduction = std::make_unique<NEGEMMLowpMatrixAReductionKernel>();
-
- _recurrent_to_cell_reduction->configure(input_to_forget_weights, &_input_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
- _recurrent_to_forget_reduction->configure(recurrent_to_forget_weights, &_recurrent_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
- _input_to_cell_reduction->configure(input_to_cell_weights, &_input_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
- _recurrent_to_cell_reduction->configure(recurrent_to_cell_weights, &_recurrent_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
- _input_to_output_reduction->configure(input_to_output_weights, &_input_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
- _recurrent_to_output_reduction->configure(recurrent_to_output_weights, &_recurrent_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
- if(_has_projection)
+ _input_to_forget_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>();
+ _recurrent_to_forget_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>();
+ _input_to_cell_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>();
+ _recurrent_to_cell_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>();
+ _input_to_output_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>();
+ _recurrent_to_output_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>();
+
+ _input_to_forget_reduction->configure(input_to_forget_weights->info(), _input_to_forget_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+ _recurrent_to_forget_reduction->configure(
+ recurrent_to_forget_weights->info(), _recurrent_to_forget_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+ _input_to_cell_reduction->configure(input_to_cell_weights->info(), _input_to_cell_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+ _recurrent_to_cell_reduction->configure(
+ recurrent_to_cell_weights->info(), _recurrent_to_cell_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+ _input_to_output_reduction->configure(input_to_output_weights->info(), _input_to_output_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+ _recurrent_to_output_reduction->configure(
+ recurrent_to_output_weights->info(), _recurrent_to_output_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+ if (_has_projection)
{
- _projection_reduction = std::make_unique<NEGEMMLowpMatrixAReductionKernel>();
- _projection_reduction->configure(_projection_weights, &_projection_eff_bias, GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true));
- if(_projection_bias != nullptr)
+ _projection_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>();
+ _projection_reduction->configure(
+ _projection_weights->info(), _projection_eff_bias.info(),
+ GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true));
+ if (_projection_bias != nullptr)
{
- _projection_bias_add.configure(_projection_bias, &_projection_eff_bias, &_projection_eff_bias, ConvertPolicy::SATURATE);
+ _projection_bias_add.configure(_projection_bias, &_projection_eff_bias, &_projection_eff_bias,
+ ConvertPolicy::SATURATE);
}
}
@@ -258,15 +402,19 @@ void NEQLSTMLayer::configure(const ITensor *input,
_transpose_input_to_forget_weights.configure(input_to_forget_weights, &_input_to_forget_weights_transposed);
_transpose_input_to_cell_weights.configure(input_to_cell_weights, &_input_to_cell_weights_transposed);
_transpose_input_to_output_weights.configure(input_to_output_weights, &_input_to_output_weights_transposed);
- _transpose_recurrent_to_forget_weights.configure(recurrent_to_forget_weights, &_recurrent_to_forget_weights_transposed);
+ _transpose_recurrent_to_forget_weights.configure(recurrent_to_forget_weights,
+ &_recurrent_to_forget_weights_transposed);
_transpose_recurrent_to_cell_weights.configure(recurrent_to_cell_weights, &_recurrent_to_cell_weights_transposed);
- _transpose_recurrent_to_output_weights.configure(recurrent_to_output_weights, &_recurrent_to_output_weights_transposed);
- if(!_has_cifg)
+ _transpose_recurrent_to_output_weights.configure(recurrent_to_output_weights,
+ &_recurrent_to_output_weights_transposed);
+ if (!_has_cifg)
{
- _transpose_input_to_input_weights.configure(lstm_params.input_to_input_weights(), &_input_to_input_weights_transposed);
- _transpose_recurrent_to_input_weights.configure(lstm_params.recurrent_to_input_weights(), &_recurrent_to_input_weights_transposed);
+ _transpose_input_to_input_weights.configure(lstm_params.input_to_input_weights(),
+ &_input_to_input_weights_transposed);
+ _transpose_recurrent_to_input_weights.configure(lstm_params.recurrent_to_input_weights(),
+ &_recurrent_to_input_weights_transposed);
}
- if(_has_projection)
+ if (_has_projection)
{
_transpose_projection_weights.configure(_projection_weights, &_projection_weights_transposed);
}
@@ -279,40 +427,52 @@ void NEQLSTMLayer::configure(const ITensor *input,
const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32);
// Forget gate.
- const TensorInfo forget_gate_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0));
- const float input_to_forget_scale = input_to_forget_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale();
- configure_mm(_mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info,
- input, &_input_to_forget_weights_transposed, &_input_to_forget_eff_bias,
- &_mm_input_to_forget_res, &_input_to_forget_outstage_res, input_to_forget_scale,
- mm_out_info, forget_gate_outstage_info);
-
- const float recurrent_to_forget_scale = recurrent_to_forget_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale();
- configure_mm(_mm_recurrent_to_forget, _recurrent_to_forget_outstage, gemmlowp_info,
- output_state_in, &_recurrent_to_forget_weights_transposed, &_recurrent_to_forget_eff_bias,
- &_mm_recurrent_to_forget_res, &_recurrent_to_forget_outstage_res, recurrent_to_forget_scale,
- mm_out_info, forget_gate_outstage_info);
-
- _accumulate_input_recurrent_forget.configure(&_input_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE);
+ const TensorInfo forget_gate_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.forget_intermediate_scale(), 0));
+ const float input_to_forget_scale = input_to_forget_weights->info()->quantization_info().uniform().scale *
+ qinput.scale / lstm_params.forget_intermediate_scale();
+ configure_mm(_mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info, input,
+ &_input_to_forget_weights_transposed, &_input_to_forget_eff_bias, &_mm_input_to_forget_res,
+ &_input_to_forget_outstage_res, input_to_forget_scale, mm_out_info, forget_gate_outstage_info);
+
+ const float recurrent_to_forget_scale = recurrent_to_forget_weights->info()->quantization_info().uniform().scale *
+ qoutput_state_in.scale / lstm_params.forget_intermediate_scale();
+ configure_mm(_mm_recurrent_to_forget, _recurrent_to_forget_outstage, gemmlowp_info, output_state_in,
+ &_recurrent_to_forget_weights_transposed, &_recurrent_to_forget_eff_bias, &_mm_recurrent_to_forget_res,
+ &_recurrent_to_forget_outstage_res, recurrent_to_forget_scale, mm_out_info, forget_gate_outstage_info);
+
+ _accumulate_input_recurrent_forget.configure(&_input_to_forget_outstage_res, &_recurrent_to_forget_outstage_res,
+ &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE);
_input_to_forget_outstage_res.allocator()->allocate();
- if(_has_peephole)
+ if (_has_peephole)
{
_mul_cell_to_forget_res.allocator()->init(TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32));
_memory_group.manage(&_mul_cell_to_forget_res);
- _pixelwise_mul_cell_to_forget.configure(cell_state_in, lstm_params.cell_to_forget_weights(), &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
- _cell_to_forget_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_forget_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)));
+ _pixelwise_mul_cell_to_forget.configure(cell_state_in, lstm_params.cell_to_forget_weights(),
+ &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
+ _cell_to_forget_outstage_res.allocator()->init(
+ TensorInfo(_mul_cell_to_forget_res.info()->tensor_shape(), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)));
_memory_group.manage(&_cell_to_forget_outstage_res);
- const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->info()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale();
- quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);
- _cell_to_forget_outstage.configure(&_mul_cell_to_forget_res, nullptr, &_cell_to_forget_outstage_res, gemmlowp_info);
+ const float cell_to_forget_scale =
+ std::pow(2, cell_shift) *
+ lstm_params.cell_to_forget_weights()->info()->quantization_info().uniform().scale /
+ lstm_params.forget_intermediate_scale();
+ quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier,
+ &gemmlowp_info.gemmlowp_shift);
+ _cell_to_forget_outstage.configure(&_mul_cell_to_forget_res, nullptr, &_cell_to_forget_outstage_res,
+ gemmlowp_info);
_mul_cell_to_forget_res.allocator()->allocate();
- _accumulate_cell_forget.configure(&_recurrent_to_forget_outstage_res, &_cell_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE);
+ _accumulate_cell_forget.configure(&_recurrent_to_forget_outstage_res, &_cell_to_forget_outstage_res,
+ &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE);
_cell_to_forget_outstage_res.allocator()->allocate();
}
Tensor *forget_activation_input = &_recurrent_to_forget_outstage_res;
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
configure_layer_norm(LayerNormGate::Forget, forget_activation_input);
forget_activation_input->allocator()->allocate();
@@ -321,33 +481,36 @@ void NEQLSTMLayer::configure(const ITensor *input,
// Output quantization info of Sigmoid and Tanh activations
const QuantizationInfo sigmoid_tanh_outqinfo(1.f / 32768.f, 0);
- const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
+ const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
_memory_group.manage(&_forget_gate);
_forget_gate.allocator()->init(forget_gate_info);
- _forget_gate_sigmoid.configure(forget_activation_input, &_forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _forget_gate_sigmoid.configure(forget_activation_input, &_forget_gate,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
forget_activation_input->allocator()->allocate();
// Modulation gate.
- const TensorInfo cell_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));
- const float input_to_cell_scale = input_to_cell_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale();
- configure_mm(_mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info,
- input, &_input_to_cell_weights_transposed, &_input_to_cell_eff_bias,
- &_mm_input_to_cell_res, &_input_to_cell_outstage_res, input_to_cell_scale,
+ const TensorInfo cell_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));
+ const float input_to_cell_scale = input_to_cell_weights->info()->quantization_info().uniform().scale *
+ qinput.scale / lstm_params.cell_intermediate_scale();
+ configure_mm(_mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info, input, &_input_to_cell_weights_transposed,
+ &_input_to_cell_eff_bias, &_mm_input_to_cell_res, &_input_to_cell_outstage_res, input_to_cell_scale,
mm_out_info, cell_outstage_info);
- const float recurrent_to_cell_scale = recurrent_to_cell_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale();
- configure_mm(_mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info,
- output_state_in, &_recurrent_to_cell_weights_transposed, &_recurrent_to_cell_eff_bias,
- &_mm_recurrent_to_cell_res, &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale,
- mm_out_info, cell_outstage_info);
+ const float recurrent_to_cell_scale = recurrent_to_cell_weights->info()->quantization_info().uniform().scale *
+ qoutput_state_in.scale / lstm_params.cell_intermediate_scale();
+ configure_mm(_mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info, output_state_in,
+ &_recurrent_to_cell_weights_transposed, &_recurrent_to_cell_eff_bias, &_mm_recurrent_to_cell_res,
+ &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale, mm_out_info, cell_outstage_info);
- _accumulate_input_recurrent_modulation.configure(&_input_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, ConvertPolicy::SATURATE);
+ _accumulate_input_recurrent_modulation.configure(&_input_to_cell_outstage_res, &_recurrent_to_cell_outstage_res,
+ &_recurrent_to_cell_outstage_res, ConvertPolicy::SATURATE);
_input_to_cell_outstage_res.allocator()->allocate();
Tensor *cell_activation_input = &_recurrent_to_cell_outstage_res;
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
configure_layer_norm(LayerNormGate::Cell, cell_activation_input);
cell_activation_input->allocator()->allocate();
@@ -358,14 +521,15 @@ void NEQLSTMLayer::configure(const ITensor *input,
_memory_group.manage(&_cell_gate);
_cell_gate.allocator()->init(cell_gate_info);
- _cell_gate_tanh.configure(cell_activation_input, &_cell_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
+ _cell_gate_tanh.configure(cell_activation_input, &_cell_gate,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
cell_activation_input->allocator()->allocate();
// Input gate.
const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
_input_gate.allocator()->init(input_gate_info);
_memory_group.manage(&_input_gate);
- if(_has_cifg)
+ if (_has_cifg)
{
_ones.allocator()->init(*_forget_gate.info());
_input_gate_sub.configure(&_ones, &_forget_gate, &_input_gate, ConvertPolicy::SATURATE);
@@ -373,104 +537,137 @@ void NEQLSTMLayer::configure(const ITensor *input,
}
else
{
- const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0));
- const float input_to_input_scale = _input_to_input_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale();
- configure_mm(_mm_input_to_input, _input_to_input_outstage, gemmlowp_info,
- input, &_input_to_input_weights_transposed, &_input_to_input_eff_bias,
- &_mm_input_to_input_res, &_input_to_input_outstage_res, input_to_input_scale,
- mm_out_info, input_outstage_info);
-
- const float recurrent_to_input_scale = _recurrent_to_input_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale();
- configure_mm(_mm_recurrent_to_input, _recurrent_to_input_outstage, gemmlowp_info,
- output_state_in, &_recurrent_to_input_weights_transposed, &_recurrent_to_input_eff_bias,
+ const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.input_intermediate_scale(), 0));
+ const float input_to_input_scale = _input_to_input_weights->info()->quantization_info().uniform().scale *
+ qinput.scale / lstm_params.input_intermediate_scale();
+ configure_mm(_mm_input_to_input, _input_to_input_outstage, gemmlowp_info, input,
+ &_input_to_input_weights_transposed, &_input_to_input_eff_bias, &_mm_input_to_input_res,
+ &_input_to_input_outstage_res, input_to_input_scale, mm_out_info, input_outstage_info);
+
+ const float recurrent_to_input_scale =
+ _recurrent_to_input_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale /
+ lstm_params.input_intermediate_scale();
+ configure_mm(_mm_recurrent_to_input, _recurrent_to_input_outstage, gemmlowp_info, output_state_in,
+ &_recurrent_to_input_weights_transposed, &_recurrent_to_input_eff_bias,
&_mm_recurrent_to_input_res, &_recurrent_to_input_outstage_res, recurrent_to_input_scale,
mm_out_info, input_outstage_info);
- _accumulate_input_recurrent_input.configure(&_input_to_input_outstage_res, &_recurrent_to_input_outstage_res, &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE);
+ _accumulate_input_recurrent_input.configure(&_input_to_input_outstage_res, &_recurrent_to_input_outstage_res,
+ &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE);
_input_to_input_outstage_res.allocator()->allocate();
- if(_has_peephole)
+ if (_has_peephole)
{
- _mul_cell_to_input_res.allocator()->init(TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32));
+ _mul_cell_to_input_res.allocator()->init(
+ TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32));
_memory_group.manage(&_mul_cell_to_input_res);
- _pixelwise_mul_cell_to_input.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
- const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->info()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale();
- quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);
- _cell_to_input_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_input_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0)));
+ _pixelwise_mul_cell_to_input.configure(cell_state_in, lstm_params.cell_to_input_weights(),
+ &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
+ const float cell_to_input_scale =
+ std::pow(2, cell_shift) *
+ lstm_params.cell_to_input_weights()->info()->quantization_info().uniform().scale /
+ lstm_params.input_intermediate_scale();
+ quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier,
+ &gemmlowp_info.gemmlowp_shift);
+ _cell_to_input_outstage_res.allocator()->init(
+ TensorInfo(_mul_cell_to_input_res.info()->tensor_shape(), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.input_intermediate_scale(), 0)));
_memory_group.manage(&_cell_to_input_outstage_res);
- _cell_to_input_outstage.configure(&_mul_cell_to_input_res, nullptr, &_cell_to_input_outstage_res, gemmlowp_info);
+ _cell_to_input_outstage.configure(&_mul_cell_to_input_res, nullptr, &_cell_to_input_outstage_res,
+ gemmlowp_info);
_mul_cell_to_input_res.allocator()->allocate();
- _accumulate_cell_input.configure(&_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res, &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE);
+ _accumulate_cell_input.configure(&_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res,
+ &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE);
_cell_to_input_outstage_res.allocator()->allocate();
}
Tensor *input_activation_input = &_recurrent_to_input_outstage_res;
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
configure_layer_norm(LayerNormGate::Input, input_activation_input);
input_activation_input->allocator()->allocate();
input_activation_input = &get_layer_norm_output(LayerNormGate::Input);
}
- _input_gate_sigmoid.configure(input_activation_input, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _input_gate_sigmoid.configure(input_activation_input, &_input_gate,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
input_activation_input->allocator()->allocate();
}
// Cell.
// TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication
- _pixelwise_mul_forget_cell.configure(&_forget_gate, cell_state_in, &_forget_gate, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _pixelwise_mul_forget_cell.configure(&_forget_gate, cell_state_in, &_forget_gate, 1.f, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
const float cell_gate_scale = _cell_gate.info()->quantization_info().uniform().scale;
const float mul_input_cell_scale = cell_gate_scale * std::pow(2, 15 + cell_shift);
- const TensorInfo mul_input_cell_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(mul_input_cell_scale, 0));
+ const TensorInfo mul_input_cell_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+ QuantizationInfo(mul_input_cell_scale, 0));
_memory_group.manage(&_mul_input_cell_res);
_mul_input_cell_res.allocator()->init(mul_input_cell_info);
- _pixelwise_mul_input_cell.configure(&_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _pixelwise_mul_input_cell.configure(&_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
_cell_gate.allocator()->allocate();
_add_forget_cell.configure(&_forget_gate, &_mul_input_cell_res, cell_state_out, ConvertPolicy::SATURATE);
_mul_input_cell_res.allocator()->allocate();
_forget_gate.allocator()->allocate();
- if(_has_cell_clipping)
+ if (_has_cell_clipping)
{
- _cell_clip.configure(cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip, quantized_cell_clip));
+ _cell_clip.configure(cell_state_out, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ -quantized_cell_clip, quantized_cell_clip));
}
// Output gate.
- const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0));
- const float input_to_output_scale = input_to_output_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale();
- configure_mm(_mm_input_to_output, _input_to_output_outstage, gemmlowp_info,
- input, &_input_to_output_weights_transposed, &_input_to_output_eff_bias,
- &_mm_input_to_output_res, &_input_to_output_outstage_res, input_to_output_scale,
- mm_out_info, output_outstage_info);
-
- const float recurrent_to_output_scale = recurrent_to_output_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale();
- configure_mm(_mm_recurrent_to_output, _recurrent_to_output_outstage, gemmlowp_info,
- output_state_in, &_recurrent_to_output_weights_transposed, &_recurrent_to_output_eff_bias,
- &_mm_recurrent_to_output_res, &_recurrent_to_output_outstage_res, recurrent_to_output_scale,
- mm_out_info, output_outstage_info);
-
- _accumulate_input_recurrent_output.configure(&_recurrent_to_output_outstage_res, &_input_to_output_outstage_res, &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE);
+ const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.output_intermediate_scale(), 0));
+ const float input_to_output_scale = input_to_output_weights->info()->quantization_info().uniform().scale *
+ qinput.scale / lstm_params.output_intermediate_scale();
+ configure_mm(_mm_input_to_output, _input_to_output_outstage, gemmlowp_info, input,
+ &_input_to_output_weights_transposed, &_input_to_output_eff_bias, &_mm_input_to_output_res,
+ &_input_to_output_outstage_res, input_to_output_scale, mm_out_info, output_outstage_info);
+
+ const float recurrent_to_output_scale = recurrent_to_output_weights->info()->quantization_info().uniform().scale *
+ qoutput_state_in.scale / lstm_params.output_intermediate_scale();
+ configure_mm(_mm_recurrent_to_output, _recurrent_to_output_outstage, gemmlowp_info, output_state_in,
+ &_recurrent_to_output_weights_transposed, &_recurrent_to_output_eff_bias, &_mm_recurrent_to_output_res,
+ &_recurrent_to_output_outstage_res, recurrent_to_output_scale, mm_out_info, output_outstage_info);
+
+ _accumulate_input_recurrent_output.configure(&_recurrent_to_output_outstage_res, &_input_to_output_outstage_res,
+ &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE);
_input_to_output_outstage_res.allocator()->allocate();
- if(_has_peephole)
+ if (_has_peephole)
{
// TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication
// Here we are not using the output stage because all operations are done in float
_mul_cell_to_output_res.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::S32));
_memory_group.manage(&_mul_cell_to_output_res);
- _pixelwise_mul_cell_to_output.configure(cell_state_out, lstm_params.cell_to_output_weights(), &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
-
- const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->info()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale();
- quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);
- _cell_to_output_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_output_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0)));
+ _pixelwise_mul_cell_to_output.configure(cell_state_out, lstm_params.cell_to_output_weights(),
+ &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
+
+ const float cell_to_output_scale =
+ std::pow(2, cell_shift) *
+ lstm_params.cell_to_output_weights()->info()->quantization_info().uniform().scale /
+ lstm_params.output_intermediate_scale();
+ quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier,
+ &gemmlowp_info.gemmlowp_shift);
+ _cell_to_output_outstage_res.allocator()->init(
+ TensorInfo(_mul_cell_to_output_res.info()->tensor_shape(), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.output_intermediate_scale(), 0)));
_memory_group.manage(&_cell_to_output_outstage_res);
- _cell_to_output_outstage.configure(&_mul_cell_to_output_res, nullptr, &_cell_to_output_outstage_res, gemmlowp_info);
+ _cell_to_output_outstage.configure(&_mul_cell_to_output_res, nullptr, &_cell_to_output_outstage_res,
+ gemmlowp_info);
_mul_cell_to_output_res.allocator()->allocate();
- _accumulate_cell_to_output.configure(&_recurrent_to_output_outstage_res, &_cell_to_output_outstage_res, &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE);
+ _accumulate_cell_to_output.configure(&_recurrent_to_output_outstage_res, &_cell_to_output_outstage_res,
+ &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE);
_cell_to_output_outstage_res.allocator()->allocate();
}
Tensor *output_activation_input = &_recurrent_to_output_outstage_res;
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
configure_layer_norm(LayerNormGate::Output, output_activation_input);
output_activation_input->allocator()->allocate();
@@ -480,20 +677,24 @@ void NEQLSTMLayer::configure(const ITensor *input,
_memory_group.manage(&_output_gate);
_output_gate.allocator()->init(output_gate_info);
- _output_gate_sigmoid.configure(output_activation_input, &_output_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+ _output_gate_sigmoid.configure(output_activation_input, &_output_gate,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
output_activation_input->allocator()->allocate();
// Hidden.
- _hidden_tanh.configure(cell_state_out, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
+ _hidden_tanh.configure(cell_state_out, &_input_gate,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
// TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication
_memory_group.manage(&_hidden_mul_res);
const TensorInfo hidden_mul_res(_input_gate.info()->tensor_shape(), 1, DataType::S32);
_hidden_mul_res.allocator()->init(hidden_mul_res);
- _pixelwise_mul_hidden.configure(&_output_gate, &_input_gate, &_hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ _pixelwise_mul_hidden.configure(&_output_gate, &_input_gate, &_hidden_mul_res, 1.f, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO);
_output_gate.allocator()->allocate();
_input_gate.allocator()->allocate();
const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15);
- quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true);
+ quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier,
+ &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true);
gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero();
gemmlowp_info.output_data_type = output_state_in->info()->data_type();
@@ -502,7 +703,7 @@ void NEQLSTMLayer::configure(const ITensor *input,
_memory_group.manage(&_hidden_gate);
- if(_projection_tensor_copy_required)
+ if (_projection_tensor_copy_required)
{
_hidden_gate.allocator()->init(*output_state_out->info());
_hidden_gate.info()->set_tensor_shape(_hidden_mul_res.info()->tensor_shape());
@@ -513,27 +714,26 @@ void NEQLSTMLayer::configure(const ITensor *input,
_hidden_mul_res.allocator()->allocate();
// Projection.
- if(_has_projection)
+ if (_has_projection)
{
const TensorInfo projection_outstage_info(*output_state_out->info());
- const UniformQuantizationInfo qprojection = _projection_weights->info()->quantization_info().uniform();
- const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;
- gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset;
- gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest();
- gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max();
- gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED;
-
- TensorInfo projection_mm_out_info{ mm_out_info };
+ const UniformQuantizationInfo qprojection = _projection_weights->info()->quantization_info().uniform();
+ const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;
+ gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset;
+ gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest();
+ gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max();
+ gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED;
+
+ TensorInfo projection_mm_out_info{mm_out_info};
projection_mm_out_info.set_tensor_shape(TensorShape(output_size, batch_size));
- configure_mm(_mm_projection, _projection_outstage, gemmlowp_info,
- hidden_gate_result, &_projection_weights_transposed, &_projection_eff_bias,
- &_mm_projection_res, &_projection_outstage_res, projection_scale,
- projection_mm_out_info, projection_outstage_info);
+ configure_mm(_mm_projection, _projection_outstage, gemmlowp_info, hidden_gate_result,
+ &_projection_weights_transposed, &_projection_eff_bias, &_mm_projection_res,
+ &_projection_outstage_res, projection_scale, projection_mm_out_info, projection_outstage_info);
ITensor *accumulate_destination = output_state_out;
- if(_projection_tensor_copy_required)
+ if (_projection_tensor_copy_required)
{
_hidden_gate.allocator()->allocate();
_projection_accumulate_res.allocator()->init(*output_state_in->info());
@@ -542,30 +742,34 @@ void NEQLSTMLayer::configure(const ITensor *input,
accumulate_destination = &_projection_accumulate_res;
}
- _accumulate_projection.configure(&_projection_outstage_res, accumulate_destination, accumulate_destination, ConvertPolicy::SATURATE);
+ _accumulate_projection.configure(&_projection_outstage_res, accumulate_destination, accumulate_destination,
+ ConvertPolicy::SATURATE);
_projection_outstage_res.allocator()->allocate();
- if(_projection_tensor_copy_required)
+ if (_projection_tensor_copy_required)
{
_projection_accumulate_to_output_copy.configure(_projection_accumulate_res, *output_state_out);
_projection_accumulate_res.allocator()->allocate();
}
- int8_t quantized_projection_clip{ 0 };
- if(lstm_params.projection_clip() > 0.0f)
+ int8_t quantized_projection_clip{0};
+ if (lstm_params.projection_clip() > 0.0f)
{
- quantized_projection_clip = utility::clamp<int8_t>(lstm_params.projection_clip() / qprojection.scale, -128, 127);
+ quantized_projection_clip =
+ utility::clamp<int8_t>(lstm_params.projection_clip() / qprojection.scale, -128, 127);
}
- if(quantized_projection_clip > 0)
+ if (quantized_projection_clip > 0)
{
- _projection_clip.configure(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip, quantized_projection_clip));
+ _projection_clip.configure(output_state_out, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ -quantized_projection_clip, quantized_projection_clip));
_has_projection_clipping = true;
}
}
else
{
- if(_projection_tensor_copy_required)
+ if (_projection_tensor_copy_required)
{
_hidden_to_output_copy.configure(_hidden_gate, *output_state_out);
_hidden_gate.allocator()->allocate();
@@ -576,17 +780,27 @@ void NEQLSTMLayer::configure(const ITensor *input,
_copy_output.configure(output_state_out, output);
}
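
The projection clip applied above first expresses the floating-point threshold in steps of the projection output scale and saturates it to the int8 range before it can serve as a LU_BOUNDED_RELU bound. A self-contained sketch of that conversion (the scale is illustrative):

#include <algorithm>
#include <cstdint>

// Sketch: map a float clip threshold into units of the projection output
// scale, saturating to the QASYMM8_SIGNED range as utility::clamp does.
int8_t quantize_projection_clip(float projection_clip, float projection_scale)
{
    const float steps = projection_clip / projection_scale;
    return static_cast<int8_t>(std::min(std::max(steps, -128.f), 127.f));
}

// e.g. quantize_projection_clip(8.f, 0.05f) == 127 (160 saturates to int8 max)
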
-Status NEQLSTMLayer::validate(const ITensorInfo *input,
- const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
- const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
- const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
- const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in,
- const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out, const ITensorInfo *output,
+Status NEQLSTMLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *input_to_forget_weights,
+ const ITensorInfo *input_to_cell_weights,
+ const ITensorInfo *input_to_output_weights,
+ const ITensorInfo *recurrent_to_forget_weights,
+ const ITensorInfo *recurrent_to_cell_weights,
+ const ITensorInfo *recurrent_to_output_weights,
+ const ITensorInfo *forget_gate_bias,
+ const ITensorInfo *cell_bias,
+ const ITensorInfo *output_gate_bias,
+ const ITensorInfo *cell_state_in,
+ const ITensorInfo *output_state_in,
+ const ITensorInfo *cell_state_out,
+ const ITensorInfo *output_state_out,
+ const ITensorInfo *output,
const LSTMParams<ITensorInfo> &lstm_params)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights,
- recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in,
- cell_state_out, output_state_out, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+ recurrent_to_forget_weights, recurrent_to_cell_weights,
+ recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias,
+ cell_state_in, output_state_in, cell_state_out, output_state_out, output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() != 2, "Input must have exactly 2 dimensions");
@@ -598,14 +812,28 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input,
ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->num_dimensions() != 2);
ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->dimension(0) != input_size);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_output_weights, input_to_forget_weights, input_to_cell_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_output_weights, input_to_forget_weights,
+ input_to_cell_weights);
ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->num_dimensions() != 2);
ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->dimension(1) != num_units);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_to_forget_weights, 1, DataType::QSYMM8);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
- recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_output_weights, recurrent_to_forget_weights,
+ recurrent_to_cell_weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_to_forget_weights, 1, DataType::QASYMM8_SIGNED,
+ DataType::QSYMM8);
+ // If the input_to_forget_weights data type is DataType::QSYMM8, then it can never match the other weights, as they are all DataType::QASYMM8_SIGNED
+ if (input_to_forget_weights->data_type() == DataType::QSYMM8)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_cell_weights, input_to_output_weights,
+ recurrent_to_forget_weights, recurrent_to_cell_weights,
+ recurrent_to_output_weights);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, input_to_cell_weights,
+ input_to_output_weights, recurrent_to_forget_weights,
+ recurrent_to_cell_weights, recurrent_to_output_weights);
+ }
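
The split above exists because the input-to-forget weights are now allowed a data type of their own (QSYMM8, converted to the common representation in prepare()), so they are excluded from the homogeneity check in that case; the remaining weights must still share one data type. Reduced to a predicate, the rule looks roughly like this (hypothetical helper, not library code):

#include <algorithm>
#include <vector>

enum class DataType { QASYMM8_SIGNED, QSYMM8 };

// Hypothetical restatement of the check: input_to_forget may differ only when
// it is QSYMM8; every other weight tensor must agree on a single data type.
bool weight_types_consistent(DataType input_to_forget, const std::vector<DataType> &others)
{
    if (others.empty())
        return true;
    const bool others_match =
        std::all_of(others.begin(), others.end(), [&](DataType t) { return t == others.front(); });
    if (input_to_forget == DataType::QSYMM8)
        return others_match;
    return others_match && input_to_forget == others.front();
}
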
ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->num_dimensions() != 1);
ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->dimension(0) != num_units);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(forget_gate_bias, cell_bias, output_gate_bias);
@@ -623,20 +851,25 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input,
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_in);
// Check whether peephole weights are all there or none
- if(lstm_params.has_peephole_opt())
+ if (lstm_params.has_peephole_opt())
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights());
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1,
+ DataType::QSYMM16);
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() != 1);
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->dimension(0) != num_units);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights());
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(),
+ lstm_params.cell_to_output_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(),
+ lstm_params.cell_to_output_weights());
- if(!lstm_params.has_cifg_opt())
+ if (!lstm_params.has_cifg_opt())
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights());
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights());
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(),
+ lstm_params.cell_to_input_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(),
+ lstm_params.cell_to_input_weights());
}
}
@@ -650,7 +883,7 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input,
// Calculate quantized parameters for clipping.
int16_t quantized_cell_clip = 0;
- if(lstm_params.cell_clip() > 0.0f)
+ if (lstm_params.cell_clip() > 0.0f)
{
quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in);
}
@@ -658,49 +891,90 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input,
// Precompute effective bias for optimizing the matmul computations.
const TensorInfo eff_bias_info(TensorShape(num_units), 1, DataType::S32);
const TensorInfo projection_eff_bias_info(TensorShape(output_size), 1, DataType::S32);
- if(!lstm_params.has_cifg_opt())
+ if (!lstm_params.has_cifg_opt())
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(lstm_params.input_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(lstm_params.recurrent_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset,
- true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(
+ lstm_params.input_to_input_weights(), &eff_bias_info,
+ GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(
+ lstm_params.recurrent_to_input_weights(), &eff_bias_info,
+ GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
}
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(recurrent_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(recurrent_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(recurrent_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
- if(lstm_params.has_projection())
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(
+ input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(
+ recurrent_to_forget_weights, &eff_bias_info,
+ GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(
+ input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(
+ recurrent_to_cell_weights, &eff_bias_info,
+ GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(
+ input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(
+ recurrent_to_output_weights, &eff_bias_info,
+ GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
+ if (lstm_params.has_projection())
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(lstm_params.projection_weights(), &projection_eff_bias_info, GEMMLowpReductionKernelInfo(output_size, false,
- lstm_params.hidden_state_zero(),
- true)));
- if(lstm_params.projection_bias() != nullptr)
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(
+ lstm_params.projection_weights(), &projection_eff_bias_info,
+ GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true)));
+ if (lstm_params.projection_bias() != nullptr)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.projection_bias(), 1, DataType::S32);
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(lstm_params.projection_bias(), &projection_eff_bias_info, &projection_eff_bias_info, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEArithmeticAddition::validate(lstm_params.projection_bias(), &projection_eff_bias_info,
+ &projection_eff_bias_info, ConvertPolicy::SATURATE));
}
}
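
The reduction kernels validated above compute, per output row, the sum of the weights scaled by minus the input zero-point; adding that to the bias once yields an "effective bias" that absorbs the constant offset term of every asymmetric GEMMLowp, so it never has to be recomputed at inference time. In scalar form, assuming a plain row-major matrix:

#include <cstddef>
#include <cstdint>
#include <vector>

// Sketch: fold the asymmetric-input offset term into the bias once, so that
//   sum_c W[r][c] * (x[c] - offset) + bias[r] == sum_c W[r][c] * x[c] + eff[r]
// with eff[r] = bias[r] - offset * sum_c W[r][c].
std::vector<int32_t> effective_bias(const std::vector<std::vector<int8_t>> &W,
                                    const std::vector<int32_t> &bias, int32_t input_offset)
{
    std::vector<int32_t> eff(bias);
    for (std::size_t r = 0; r < W.size(); ++r)
    {
        int32_t row_sum = 0;
        for (int8_t w : W[r])
            row_sum += w;
        eff[r] -= input_offset * row_sum;
    }
    return eff;
}
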
- const TensorInfo input_weights_transposed(TensorShape(num_units, input_size), 1, input_to_forget_weights->data_type(), input_to_forget_weights->quantization_info());
- const TensorInfo recurrent_weights_transposed(TensorShape(num_units, output_size), 1, recurrent_to_forget_weights->data_type(), recurrent_to_forget_weights->quantization_info());
+ const TensorInfo input_weights_transposed(TensorShape(num_units, input_size), 1, input_to_cell_weights->data_type(),
+ input_to_cell_weights->quantization_info());
+ const TensorInfo input_to_output_weights_transposed(TensorShape(num_units, input_size), 1,
+ input_to_output_weights->data_type(),
+ input_to_output_weights->quantization_info());
+ const TensorInfo recurrent_to_forget_weights_transposed(TensorShape(num_units, output_size), 1,
+ recurrent_to_forget_weights->data_type(),
+ recurrent_to_forget_weights->quantization_info());
+ const TensorInfo recurrent_to_cell_weights_transposed(TensorShape(num_units, output_size), 1,
+ recurrent_to_cell_weights->data_type(),
+ recurrent_to_cell_weights->quantization_info());
+ const TensorInfo recurrent_to_output_weights_transposed(TensorShape(num_units, output_size), 1,
+ recurrent_to_output_weights->data_type(),
+ recurrent_to_output_weights->quantization_info());
+ const TensorInfo recurrent_weights_transposed(TensorShape(num_units, output_size), 1,
+ recurrent_to_forget_weights->data_type(),
+ recurrent_to_forget_weights->quantization_info());
- // Validate weights transpose
- ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(input_to_forget_weights, &input_weights_transposed));
ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(input_to_cell_weights, &input_weights_transposed));
- ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(input_to_output_weights, &input_weights_transposed));
- ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(recurrent_to_forget_weights, &recurrent_weights_transposed));
- ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(recurrent_to_cell_weights, &recurrent_weights_transposed));
- ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(recurrent_to_output_weights, &recurrent_weights_transposed));
- if(!lstm_params.has_cifg_opt())
+ ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(input_to_output_weights, &input_to_output_weights_transposed));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NETranspose::validate(recurrent_to_forget_weights, &recurrent_to_forget_weights_transposed));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NETranspose::validate(recurrent_to_cell_weights, &recurrent_to_cell_weights_transposed));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NETranspose::validate(recurrent_to_output_weights, &recurrent_to_output_weights_transposed));
+ if (!lstm_params.has_cifg_opt())
{
- ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(lstm_params.input_to_input_weights(), &input_weights_transposed));
- ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(lstm_params.recurrent_to_input_weights(), &recurrent_weights_transposed));
+ const TensorInfo recurrent_to_input_weights_transposed(
+ TensorShape(num_units, output_size), 1, recurrent_to_forget_weights->data_type(),
+ lstm_params.recurrent_to_input_weights()->quantization_info());
+ const TensorInfo input_to_input_weights_transposed(TensorShape(num_units, input_size), 1,
+ lstm_params.input_to_input_weights()->data_type(),
+ lstm_params.input_to_input_weights()->quantization_info());
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NETranspose::validate(lstm_params.input_to_input_weights(), &input_to_input_weights_transposed));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NETranspose::validate(lstm_params.recurrent_to_input_weights(), &recurrent_to_input_weights_transposed));
}
- if(lstm_params.has_projection())
+ if (lstm_params.has_projection())
{
- const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, lstm_params.projection_weights()->data_type(), lstm_params.projection_weights()->quantization_info());
- ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(lstm_params.projection_weights(), &projection_weights_transposed));
+ const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1,
+ lstm_params.projection_weights()->data_type(),
+ lstm_params.projection_weights()->quantization_info());
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NETranspose::validate(lstm_params.projection_weights(), &projection_weights_transposed));
}
GEMMLowpOutputStageInfo gemmlowp_info;
@@ -713,28 +987,42 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input,
// Forget gate.
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_intermediate_scale() == 0);
- const TensorInfo forget_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0));
+ const TensorInfo forget_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.forget_intermediate_scale(), 0));
const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32);
- const float input_to_forget_scale = input_to_forget_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_forget_scale, &mm_out_info, &forget_outstage_info));
+ const float input_to_forget_scale = input_to_forget_weights->quantization_info().uniform().scale * qinput.scale /
+ lstm_params.forget_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info,
+ input_to_forget_scale, &mm_out_info, &forget_outstage_info));
- const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_forget_scale, &mm_out_info, &forget_outstage_info));
+ const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale *
+ qoutput_state_in.scale / lstm_params.forget_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed,
+ &eff_bias_info, recurrent_to_forget_scale, &mm_out_info,
+ &forget_outstage_info));
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info,
+ &forget_outstage_info, ConvertPolicy::SATURATE));
- if(lstm_params.has_peephole_opt())
+ if (lstm_params.has_peephole_opt())
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16);
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_ZERO));
- const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info));
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1,
+ DataType::QSYMM16);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f,
+ ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ const float cell_to_forget_scale = std::pow(2, cell_shift) *
+ lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale /
+ lstm_params.forget_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(
+ cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info,
+ &forget_outstage_info, ConvertPolicy::SATURATE));
}
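
Every per-gate factor in this function follows the same pattern: the matmul result's scale is the product of the weight and activation scales, and it must be rescaled to the gate's intermediate scale before being turned into a multiplier/shift pair. As a one-liner (names illustrative):

// Sketch: the rescale factor each gate's output stage applies; in the layer
// the three scales come from the tensors' quantization info and LSTMParams.
inline float gate_rescale(float weight_scale, float activation_scale, float intermediate_scale)
{
    return weight_scale * activation_scale / intermediate_scale;
}
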
- if(has_layer_norm)
+ if (has_layer_norm)
{
const ITensorInfo *w_info = lstm_params.forget_layer_norm_weights();
const ITensorInfo *b_info = forget_gate_bias;
@@ -743,22 +1031,31 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input,
// Output quantization info of Sigmoid and Tanh activations
const QuantizationInfo sigmoid_tanh_outqinfo(1.f / 32768.f, 0);
- const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
+ const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&forget_outstage_info, &forget_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEActivationLayer::validate(&forget_outstage_info, &forget_gate_info,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
// Modulation gate.
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_intermediate_scale() == 0);
- const TensorInfo cell_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));
- const float input_to_cell_scale = input_to_cell_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_cell_scale, &mm_out_info, &cell_outstage_info));
-
- const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_cell_scale, &mm_out_info, &cell_outstage_info));
-
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_outstage_info, &cell_outstage_info, &cell_outstage_info, ConvertPolicy::SATURATE));
-
- if(has_layer_norm)
+ const TensorInfo cell_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));
+ const float input_to_cell_scale = input_to_cell_weights->quantization_info().uniform().scale * qinput.scale /
+ lstm_params.cell_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info,
+ input_to_cell_scale, &mm_out_info, &cell_outstage_info));
+
+ const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale *
+ qoutput_state_in.scale / lstm_params.cell_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed,
+ &eff_bias_info, recurrent_to_cell_scale, &mm_out_info,
+ &cell_outstage_info));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_outstage_info, &cell_outstage_info,
+ &cell_outstage_info, ConvertPolicy::SATURATE));
+
+ if (has_layer_norm)
{
const ITensorInfo *w_info = lstm_params.cell_layer_norm_weights();
const ITensorInfo *b_info = cell_bias;
@@ -766,85 +1063,134 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input,
}
const TensorInfo cell_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_outstage_info, &cell_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEActivationLayer::validate(&cell_outstage_info, &cell_gate_info,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
// Input gate.
const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
- if(lstm_params.has_cifg_opt())
+ if (lstm_params.has_cifg_opt())
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr, "Input gate bias must not be present when CIFG is used");
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticSubtraction::validate(&input_gate_info, &forget_gate_info, &forget_gate_info, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr,
+ "Input gate bias must not be present when CIFG is used");
+ ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticSubtraction::validate(&input_gate_info, &forget_gate_info,
+ &forget_gate_info, ConvertPolicy::SATURATE));
}
else
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias());
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(),
+ lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias());
+
+ // If the input_to_forget_weights data type is DataType::QSYMM8, then it can never match the other weights, as they are all DataType::QASYMM8_SIGNED
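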
+ if (input_to_forget_weights->data_type() == DataType::QSYMM8)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.input_to_input_weights(),
+ lstm_params.recurrent_to_input_weights());
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights,
+ lstm_params.input_to_input_weights(),
+ lstm_params.recurrent_to_input_weights());
+ }
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_forget_weights, lstm_params.input_to_input_weights());
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_forget_weights, lstm_params.recurrent_to_input_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_forget_weights,
+ lstm_params.recurrent_to_input_weights());
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(forget_gate_bias, lstm_params.input_gate_bias());
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(forget_gate_bias, lstm_params.input_gate_bias());
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_intermediate_scale() == 0);
- const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0));
- const float input_to_input_scale = lstm_params.input_to_input_weights()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_input_scale, &mm_out_info, &input_outstage_info));
-
- const float recurrent_to_input_scale = lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_input_scale, &mm_out_info, &input_outstage_info));
-
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE));
-
- if(lstm_params.has_peephole_opt())
+ const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.input_intermediate_scale(), 0));
+ const float input_to_input_scale = lstm_params.input_to_input_weights()->quantization_info().uniform().scale *
+ qinput.scale / lstm_params.input_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info,
+ input_to_input_scale, &mm_out_info, &input_outstage_info));
+
+ const float recurrent_to_input_scale =
+ lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale /
+ lstm_params.input_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed,
+ &eff_bias_info, recurrent_to_input_scale, &mm_out_info,
+ &input_outstage_info));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_outstage_info, &input_outstage_info,
+ &input_outstage_info, ConvertPolicy::SATURATE));
+
+ if (lstm_params.has_peephole_opt())
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_ZERO));
- const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info));
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info,
+ 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ const float cell_to_input_scale = std::pow(2, cell_shift) *
+ lstm_params.cell_to_input_weights()->quantization_info().uniform().scale /
+ lstm_params.input_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(
+ cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_outstage_info, &input_outstage_info,
+ &input_outstage_info, ConvertPolicy::SATURATE));
}
- if(has_layer_norm)
+ if (has_layer_norm)
{
const ITensorInfo *w_info = lstm_params.input_layer_norm_weights();
const ITensorInfo *b_info = lstm_params.input_gate_bias();
ARM_COMPUTE_RETURN_ON_ERROR(validate_layer_norm(input_outstage_info, *w_info, *b_info));
}
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&input_outstage_info, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEActivationLayer::validate(&input_outstage_info, &input_gate_info,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
}
// Cell.
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE));
- if(quantized_cell_clip > 0)
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(
+ &forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(
+ &input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEArithmeticAddition::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE));
+ if (quantized_cell_clip > 0)
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip,
- quantized_cell_clip)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEActivationLayer::validate(cell_state_out, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ -quantized_cell_clip, quantized_cell_clip)));
}
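
The three checks preceding this clip validate the standard cell update: the new cell state is forget_gate * old_cell + input_gate * cell_gate, optionally clipped symmetrically. In real arithmetic one step looks like this (a float sketch; the layer performs the same dataflow with saturating QSYMM16 operations):

#include <algorithm>

// Sketch of the cell-state recurrence in float; cell_clip <= 0 disables
// clipping, matching the quantized_cell_clip > 0 guard above.
float cell_update(float forget_gate, float input_gate, float cell_gate,
                  float cell_state, float cell_clip)
{
    float c = forget_gate * cell_state + input_gate * cell_gate;
    if (cell_clip > 0.f)
        c = std::min(std::max(c, -cell_clip), cell_clip);
    return c;
}
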
// Output gate.
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_intermediate_scale() == 0);
- const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0));
- const float input_to_output_scale = input_to_output_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_output_scale, &mm_out_info, &output_outstage_info));
-
- const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale();
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_output_scale, &mm_out_info, &output_outstage_info));
-
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE));
- if(lstm_params.has_peephole_opt())
+ const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+ QuantizationInfo(lstm_params.output_intermediate_scale(), 0));
+ const float input_to_output_scale = input_to_output_weights->quantization_info().uniform().scale * qinput.scale /
+ lstm_params.output_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info,
+ input_to_output_scale, &mm_out_info, &output_outstage_info));
+
+ const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale *
+ qoutput_state_in.scale / lstm_params.output_intermediate_scale();
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed,
+ &eff_bias_info, recurrent_to_output_scale, &mm_out_info,
+ &output_outstage_info));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_outstage_info, &output_outstage_info,
+ &output_outstage_info, ConvertPolicy::SATURATE));
+ if (lstm_params.has_peephole_opt())
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1, DataType::QSYMM16);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1,
+ DataType::QSYMM16);
// TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication
// Here we are not using the output stage because all operations are done in float
// const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale();
// ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE,
- RoundingPolicy::TO_ZERO));
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(
+ cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE,
+ RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_outstage_info, &output_outstage_info,
+ &output_outstage_info, ConvertPolicy::SATURATE));
}
- if(has_layer_norm)
+ if (has_layer_norm)
{
const ITensorInfo *w_info = lstm_params.output_layer_norm_weights();
const ITensorInfo *b_info = output_gate_bias;
@@ -852,85 +1198,103 @@ Status NEQLSTMLayer::validate(const ITensorInfo *input,
}
const TensorInfo output_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&output_outstage_info, &output_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEActivationLayer::validate(&output_outstage_info, &output_gate_info,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
// Hidden.
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(cell_state_out, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEActivationLayer::validate(cell_state_out, &input_gate_info,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
const TensorInfo hidden_mul_res(TensorShape(num_units, batch_size), 1, DataType::S32);
const TensorInfo hidden_out_info(TensorShape(num_units, batch_size), 1, DataType::QASYMM8_SIGNED);
- ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(
+ &output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.hidden_state_scale() == 0);
const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15);
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier,
+ &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true));
gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero();
gemmlowp_info.output_data_type = hidden_out_info.data_type();
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&hidden_mul_res, nullptr, &hidden_out_info, gemmlowp_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEGEMMLowpOutputStage::validate(&hidden_mul_res, nullptr, &hidden_out_info, gemmlowp_info));
const bool projection_tensor_copy_required = num_units != output_size;
// Projection.
- if(lstm_params.has_projection())
+ if (lstm_params.has_projection())
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_forget_weights, lstm_params.projection_weights());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_forget_weights,
+ lstm_params.projection_weights());
ARM_COMPUTE_RETURN_ERROR_ON(qoutput_state_in.scale == 0);
- const UniformQuantizationInfo qprojection = lstm_params.projection_weights()->quantization_info().uniform();
- const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(projection_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
+ const UniformQuantizationInfo qprojection = lstm_params.projection_weights()->quantization_info().uniform();
+ const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;
+ ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(
+ projection_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset;
gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest();
gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max();
gemmlowp_info.output_data_type = DataType::QASYMM8_SIGNED;
const TensorInfo projection_outstage_info(*output_state_out);
- const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, lstm_params.projection_weights()->data_type(), lstm_params.projection_weights()->quantization_info());
+ const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1,
+ lstm_params.projection_weights()->data_type(),
+ lstm_params.projection_weights()->quantization_info());
- TensorInfo projection_mm_out_info{ mm_out_info };
+ TensorInfo projection_mm_out_info{mm_out_info};
projection_mm_out_info.set_tensor_shape(TensorShape(output_size, batch_size));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, &hidden_out_info, &projection_weights_transposed, &projection_eff_bias_info, projection_scale, &projection_mm_out_info,
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, &hidden_out_info, &projection_weights_transposed,
+ &projection_eff_bias_info, projection_scale, &projection_mm_out_info,
&projection_outstage_info));
- if(projection_tensor_copy_required)
+ if (projection_tensor_copy_required)
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEQLSTMLayer::TensorCopyKernel::validate(*output_state_in, projection_outstage_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEQLSTMLayer::TensorCopyKernel::validate(*output_state_in, projection_outstage_info));
}
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(output_state_out, output_state_out, output_state_out, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(output_state_out, output_state_out, output_state_out,
+ ConvertPolicy::SATURATE));
- if(projection_tensor_copy_required)
+ if (projection_tensor_copy_required)
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEQLSTMLayer::TensorCopyKernel::validate(projection_outstage_info, *output_state_out));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEQLSTMLayer::TensorCopyKernel::validate(projection_outstage_info, *output_state_out));
}
- int8_t quantized_projection_clip{ 0 };
- if(lstm_params.projection_clip() > 0.0f)
+ int8_t quantized_projection_clip{0};
+ if (lstm_params.projection_clip() > 0.0f)
{
quantized_projection_clip = quantize_qasymm8_signed(lstm_params.projection_clip(), qprojection);
}
- if(quantized_projection_clip > 0)
+ if (quantized_projection_clip > 0)
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip,
- quantized_projection_clip)));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(
+ output_state_out, nullptr,
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+ -quantized_projection_clip, quantized_projection_clip)));
}
}
else
{
- if(projection_tensor_copy_required)
+ if (projection_tensor_copy_required)
{
ARM_COMPUTE_RETURN_ON_ERROR(NEQLSTMLayer::TensorCopyKernel::validate(hidden_out_info, *output_state_out));
}
}
- if(cell_state_out->total_size() > 0)
+ if (cell_state_out->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(cell_state_in, cell_state_out);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(cell_state_in, cell_state_out);
}
- if(output_state_out->total_size() > 0)
+ if (output_state_out->total_size() > 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_out);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output_state_in, output_state_out);
@@ -955,14 +1319,14 @@ void NEQLSTMLayer::run()
_recurrent_to_forget_outstage.run();
_accumulate_input_recurrent_forget.run();
- if(_has_peephole)
+ if (_has_peephole)
{
_pixelwise_mul_cell_to_forget.run();
_cell_to_forget_outstage.run();
_accumulate_cell_forget.run();
}
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Forget).get(), Window::DimY);
}
@@ -977,7 +1341,7 @@ void NEQLSTMLayer::run()
_recurrent_to_cell_outstage.run();
_accumulate_input_recurrent_modulation.run();
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Cell).get(), Window::DimY);
}
@@ -985,7 +1349,7 @@ void NEQLSTMLayer::run()
_cell_gate_tanh.run();
// Input gate
- if(_has_cifg)
+ if (_has_cifg)
{
_input_gate_sub.run();
}
@@ -997,14 +1361,14 @@ void NEQLSTMLayer::run()
_recurrent_to_input_outstage.run();
_accumulate_input_recurrent_input.run();
- if(_has_peephole)
+ if (_has_peephole)
{
_pixelwise_mul_cell_to_input.run();
_cell_to_input_outstage.run();
_accumulate_cell_input.run();
}
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Input).get(), Window::DimY);
}
@@ -1017,7 +1381,7 @@ void NEQLSTMLayer::run()
_pixelwise_mul_input_cell.run();
_add_forget_cell.run();
- if(_has_cell_clipping)
+ if (_has_cell_clipping)
{
_cell_clip.run();
}
@@ -1028,14 +1392,14 @@ void NEQLSTMLayer::run()
_mm_recurrent_to_output.run();
_recurrent_to_output_outstage.run();
_accumulate_input_recurrent_output.run();
- if(_has_peephole)
+ if (_has_peephole)
{
_pixelwise_mul_cell_to_output.run();
_cell_to_output_outstage.run();
_accumulate_cell_to_output.run();
}
- if(_has_layer_norm)
+ if (_has_layer_norm)
{
NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Output).get(), Window::DimY);
}
@@ -1048,31 +1412,31 @@ void NEQLSTMLayer::run()
_hidden_outstage.run();
// Projection.
- if(_has_projection)
+ if (_has_projection)
{
_mm_projection.run();
_projection_outstage.run();
- if(_projection_tensor_copy_required)
+ if (_projection_tensor_copy_required)
{
_projection_output_to_accumulate_copy.run();
}
_accumulate_projection.run();
- if(_projection_tensor_copy_required)
+ if (_projection_tensor_copy_required)
{
_projection_accumulate_to_output_copy.run();
}
- if(_has_projection_clipping)
+ if (_has_projection_clipping)
{
_projection_clip.run();
}
}
else
{
- if(_projection_tensor_copy_required)
+ if (_projection_tensor_copy_required)
{
_hidden_to_output_copy.run();
}
@@ -1084,8 +1448,16 @@ void NEQLSTMLayer::run()
void NEQLSTMLayer::prepare()
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
+ if (_convert_input_to_forget_weights_to_qsymm8)
+ {
+ _input_to_forget_weights_f32.allocator()->allocate();
+ _input_to_forget_weights_symm8.allocator()->allocate();
+ _dequantize_input_to_forget_weights.run();
+ _quantize_input_to_forget_weights.run();
+ }
+
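
The new conversion path above materializes a symmetric copy of asymmetric input-to-forget weights once at prepare() time, by dequantizing to float and requantizing with a zero offset. A scalar sketch of that round trip (scales and offset are illustrative; the layer performs it through the dequantize and quantize runs shown above):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

// Sketch: QASYMM8_SIGNED -> float -> QSYMM8 round trip for a weight buffer.
std::vector<int8_t> asymm_to_symm(const std::vector<int8_t> &w,
                                  float in_scale, int32_t in_offset, float out_scale)
{
    std::vector<int8_t> out(w.size());
    for (std::size_t i = 0; i < w.size(); ++i)
    {
        const float   real = in_scale * (static_cast<float>(w[i]) - in_offset);   // dequantize
        const int32_t q    = static_cast<int32_t>(std::lround(real / out_scale)); // zero offset
        out[i] = static_cast<int8_t>(std::min(std::max(q, -128), 127));           // saturate
    }
    return out;
}
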
// Pre-transpose weights to be used in GEMM.
_input_to_forget_weights_transposed.allocator()->allocate();
_input_to_cell_weights_transposed.allocator()->allocate();
@@ -1101,16 +1473,25 @@ void NEQLSTMLayer::prepare()
_transpose_recurrent_to_output_weights.run();
// Precompute effective biases
- if(_has_cifg)
+ if (_has_cifg)
{
- std::fill_n(reinterpret_cast<int16_t *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 32767);
+ std::fill_n(reinterpret_cast<int16_t *>(_ones.buffer()),
+ _ones.info()->total_size() / _ones.info()->element_size(), 32767);
}
else
{
_input_to_input_eff_bias.allocator()->allocate();
_recurrent_to_input_eff_bias.allocator()->allocate();
- NEScheduler::get().schedule(_input_to_input_reduction.get(), Window::DimY);
- NEScheduler::get().schedule(_recurrent_to_input_reduction.get(), Window::DimY);
+
+ ITensorPack packII = {{TensorType::ACL_SRC, _input_to_input_weights},
+ {TensorType::ACL_DST, &_input_to_input_eff_bias}};
+ NEScheduler::get().schedule_op(_input_to_input_reduction.get(), Window::DimY,
+ _input_to_input_reduction->window(), packII);
+
+ ITensorPack packRI = {{TensorType::ACL_SRC, _recurrent_to_input_weights},
+ {TensorType::ACL_DST, &_recurrent_to_input_eff_bias}};
+ NEScheduler::get().schedule_op(_recurrent_to_input_reduction.get(), Window::DimY,
+ _recurrent_to_input_reduction->window(), packRI);
_input_to_input_weights_transposed.allocator()->allocate();
_recurrent_to_input_weights_transposed.allocator()->allocate();
@@ -1125,18 +1506,45 @@ void NEQLSTMLayer::prepare()
_recurrent_to_cell_eff_bias.allocator()->allocate();
_input_to_output_eff_bias.allocator()->allocate();
_recurrent_to_output_eff_bias.allocator()->allocate();
- NEScheduler::get().schedule(_input_to_forget_reduction.get(), Window::DimY);
- NEScheduler::get().schedule(_recurrent_to_forget_reduction.get(), Window::DimY);
- NEScheduler::get().schedule(_input_to_cell_reduction.get(), Window::DimY);
- NEScheduler::get().schedule(_recurrent_to_cell_reduction.get(), Window::DimY);
- NEScheduler::get().schedule(_input_to_output_reduction.get(), Window::DimY);
- NEScheduler::get().schedule(_recurrent_to_output_reduction.get(), Window::DimY);
-
- if(_has_projection)
+
+ ITensorPack packIF = {{TensorType::ACL_SRC, _input_to_forget_weights},
+ {TensorType::ACL_DST, &_input_to_forget_eff_bias}};
+ NEScheduler::get().schedule_op(_input_to_forget_reduction.get(), Window::DimY,
+ _input_to_forget_reduction->window(), packIF);
+
+ ITensorPack packRF = {{TensorType::ACL_SRC, _recurrent_to_forget_weights},
+ {TensorType::ACL_DST, &_recurrent_to_forget_eff_bias}};
+ NEScheduler::get().schedule_op(_recurrent_to_forget_reduction.get(), Window::DimY,
+ _recurrent_to_forget_reduction->window(), packRF);
+
+ ITensorPack packIC = {{TensorType::ACL_SRC, _input_to_cell_weights},
+ {TensorType::ACL_DST, &_input_to_cell_eff_bias}};
+ NEScheduler::get().schedule_op(_input_to_cell_reduction.get(), Window::DimY, _input_to_cell_reduction->window(),
+ packIC);
+
+ ITensorPack packRC = {{TensorType::ACL_SRC, _recurrent_to_cell_weights},
+ {TensorType::ACL_DST, &_recurrent_to_cell_eff_bias}};
+ NEScheduler::get().schedule_op(_recurrent_to_cell_reduction.get(), Window::DimY,
+ _recurrent_to_cell_reduction->window(), packRC);
+
+ ITensorPack packIO = {{TensorType::ACL_SRC, _input_to_output_weights},
+ {TensorType::ACL_DST, &_input_to_output_eff_bias}};
+ NEScheduler::get().schedule_op(_input_to_output_reduction.get(), Window::DimY,
+ _input_to_output_reduction->window(), packIO);
+
+ ITensorPack packRO = {{TensorType::ACL_SRC, _recurrent_to_output_weights},
+ {TensorType::ACL_DST, &_recurrent_to_output_eff_bias}};
+ NEScheduler::get().schedule_op(_recurrent_to_output_reduction.get(), Window::DimY,
+ _recurrent_to_output_reduction->window(), packRO);
+
+ if (_has_projection)
{
_projection_eff_bias.allocator()->allocate();
- NEScheduler::get().schedule(_projection_reduction.get(), Window::DimY);
- if(_projection_bias != nullptr)
+ ITensorPack pack = {{TensorType::ACL_SRC, _projection_weights},
+ {TensorType::ACL_DST, &_projection_eff_bias}};
+ NEScheduler::get().schedule_op(_projection_reduction.get(), Window::DimY, _projection_reduction->window(),
+ pack);
+ if (_projection_bias != nullptr)
{
_projection_bias_add.run();
_projection_bias->mark_as_unused();
@@ -1146,7 +1554,7 @@ void NEQLSTMLayer::prepare()
_transpose_projection_weights.run();
_projection_weights->mark_as_unused();
- if(!_projection_tensor_copy_required)
+ if (!_projection_tensor_copy_required)
{
_hidden_gate.mark_as_unused();
_projection_accumulate_res.mark_as_unused();
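
All of the reduction dispatches above share one shape: the kernels are now stateless cpu::kernels operators, so the source and destination tensors travel with each call in an ITensorPack instead of being captured at configure() time. The pattern, isolated into a helper (a sketch against the library headers this diff targets, not a complete translation unit):

#include "arm_compute/core/CPP/ICPPKernel.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"

using namespace arm_compute;

// Sketch: dispatch a stateless CPU kernel. Because the kernel holds no tensor
// pointers, src and dst are supplied per call through the pack.
void schedule_reduction(ICPPKernel *kernel, const ITensor *src, ITensor *dst)
{
    ITensorPack pack = {{TensorType::ACL_SRC, src}, {TensorType::ACL_DST, dst}};
    NEScheduler::get().schedule_op(kernel, Window::DimY, kernel->window(), pack);
}
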
diff --git a/src/runtime/NEON/functions/NEQuantizationLayer.cpp b/src/runtime/NEON/functions/NEQuantizationLayer.cpp
index e607917615..9b72783c97 100644
--- a/src/runtime/NEON/functions/NEQuantizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEQuantizationLayer.cpp
@@ -26,19 +26,19 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/Tensor.h"
-#include "src/runtime/cpu/operators/CpuQuantize.h"
+
+#include "src/cpu/operators/CpuQuantize.h"
namespace arm_compute
{
struct NEQuantizationLayer::Impl
{
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuQuantize> op{ nullptr };
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuQuantize> op{nullptr};
};
-NEQuantizationLayer::NEQuantizationLayer()
- : _impl(std::make_unique<Impl>())
+NEQuantizationLayer::NEQuantizationLayer() : _impl(std::make_unique<Impl>())
{
}
NEQuantizationLayer::~NEQuantizationLayer() = default;
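
The reshaped constructor above is the standard pImpl arrangement used across these ported functions: the public class keeps a single unique_ptr to an Impl holding raw src/dst pointers plus the backend cpu::CpuQuantize operator, and the out-of-line "= default" destructor is what lets Impl stay incomplete in the public header. In outline (stand-in types, not the actual header):

#include <memory>

// Stand-ins for ITensor and cpu::CpuQuantize, only to make the sketch compile.
struct Tensor {};
struct CpuQuantizeOp {};

// Sketch of the function-over-operator pattern: the public class owns one
// Impl; the destructor is defaulted after Impl is complete.
class QuantizeFunction
{
public:
    QuantizeFunction();
    ~QuantizeFunction();
private:
    struct Impl;
    std::unique_ptr<Impl> _impl;
};

struct QuantizeFunction::Impl
{
    const Tensor                  *src{nullptr};
    Tensor                        *dst{nullptr};
    std::unique_ptr<CpuQuantizeOp> op{nullptr};
};

QuantizeFunction::QuantizeFunction() : _impl(std::make_unique<Impl>()) {}
QuantizeFunction::~QuantizeFunction() = default;
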
diff --git a/src/runtime/NEON/functions/NERNNLayer.cpp b/src/runtime/NEON/functions/NERNNLayer.cpp
index d59f7da0dd..2824693800 100644
--- a/src/runtime/NEON/functions/NERNNLayer.cpp
+++ b/src/runtime/NEON/functions/NERNNLayer.cpp
@@ -27,31 +27,37 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
-#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
-#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+
+#include "src/common/utils/Log.h"
namespace arm_compute
{
NERNNLayer::~NERNNLayer() = default;
NERNNLayer::NERNNLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_f(), _activation(), _fully_connected(memory_manager), _copy_f(), _fully_connected_out(), _gemm_output(), _add_output(),
+ : _memory_group(std::move(memory_manager)),
+ _gemm_state_f(),
+ _add_f(),
+ _activation(),
+ _fully_connected(memory_manager),
+ _copy_f(),
+ _fully_connected_out(),
+ _gemm_output(),
+ _add_output(),
_is_prepared(false)
{
}
-Status NERNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *recurrent_weights, const ITensorInfo *bias, const ITensorInfo *hidden_state,
- const ITensorInfo *output, const ActivationLayerInfo &info)
+Status NERNNLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *recurrent_weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *hidden_state,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
@@ -68,23 +74,34 @@ Status NERNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights
ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), hidden_state->tensor_shape());
- auto shape_info = TensorInfo(misc::shape_calculator::compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, input->data_type());
+ auto shape_info =
+ TensorInfo(misc::shape_calculator::compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1,
+ input->data_type());
ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, weights, bias, &shape_info));
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NEArithmeticAddition::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&shape_info, &shape_info, info));
return Status{};
}
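
The composition validated above mirrors the RNN recurrence itself: the next hidden state is an activation applied to a fully connected input term plus a recurrent GEMM term, h' = act(W x + b + R h). A float sketch of one step, assuming plain row-major matrices:

#include <cmath>
#include <cstddef>
#include <vector>

// Sketch: one vanilla-RNN step, h' = act(W*x + b + R*h), matching the
// FC + GEMM + add + activation chain validated above.
std::vector<float> rnn_step(const std::vector<std::vector<float>> &W, // [units][inputs]
                            const std::vector<std::vector<float>> &R, // [units][units]
                            const std::vector<float> &b,
                            const std::vector<float> &x,
                            const std::vector<float> &h)
{
    std::vector<float> out(b);
    for (std::size_t u = 0; u < out.size(); ++u)
    {
        for (std::size_t i = 0; i < x.size(); ++i)
            out[u] += W[u][i] * x[i];
        for (std::size_t j = 0; j < h.size(); ++j)
            out[u] += R[u][j] * h[j];
        out[u] = std::tanh(out[u]); // stand-in for the configurable activation
    }
    return out;
}
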
-void NERNNLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *recurrent_weights, const ITensor *bias, ITensor *hidden_state, ITensor *output,
+void NERNNLayer::configure(const ITensor *input,
+ const ITensor *weights,
+ const ITensor *recurrent_weights,
+ const ITensor *bias,
+ ITensor *hidden_state,
+ ITensor *output,
ActivationLayerInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
- ARM_COMPUTE_ERROR_THROW_ON(NERNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(), bias->info(), hidden_state->info(), output->info(), info));
+ ARM_COMPUTE_ERROR_THROW_ON(NERNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(),
+ bias->info(), hidden_state->info(), output->info(), info));
+ ARM_COMPUTE_LOG_PARAMS(input, weights, recurrent_weights, bias, hidden_state, output, info);
const int idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
- TensorShape shape = misc::shape_calculator::compute_rnn_shape(recurrent_weights->info(), hidden_state->info()->dimension(idx_height));
+ TensorShape shape = misc::shape_calculator::compute_rnn_shape(recurrent_weights->info(),
+ hidden_state->info()->dimension(idx_height));
_is_prepared = false;
@@ -132,7 +149,7 @@ void NERNNLayer::run()
void NERNNLayer::prepare()
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
_fully_connected.prepare();
_gemm_state_f.prepare();
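
The recurring change across the files in this patch is visible above: each public configure() gains an ARM_COMPUTE_LOG_PARAMS(...) call after its pointer and validate checks, tracing arguments at configure time via src/common/utils/Log.h. Below is a minimal sketch of this kind of compile-time-gated trace macro, assuming a hypothetical ENABLE_TRACE switch; the library's real macro lives in Log.h and differs in detail.

#include <iostream>

// Hypothetical stand-in for a configure-time trace hook: stringize the
// argument list and print it, or compile to nothing when tracing is off.
#if defined(ENABLE_TRACE)
#define TRACE_PARAMS(...) ::trace_params(#__VA_ARGS__)
#else
#define TRACE_PARAMS(...) (void)0
#endif

inline void trace_params(const char *names)
{
    std::cout << "configure(" << names << ")\n";
}

void configure_example(const void *input, void *output)
{
    TRACE_PARAMS(input, output); // no-op unless ENABLE_TRACE is defined
}

Because the macro collapses to (void)0 in release builds, sprinkling it through every configure() costs nothing when logging is disabled.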
diff --git a/src/runtime/NEON/functions/NEROIAlignLayer.cpp b/src/runtime/NEON/functions/NEROIAlignLayer.cpp
index a946358e18..68bb5d5ef3 100644
--- a/src/runtime/NEON/functions/NEROIAlignLayer.cpp
+++ b/src/runtime/NEON/functions/NEROIAlignLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,20 +23,29 @@
*/
#include "arm_compute/runtime/NEON/functions/NEROIAlignLayer.h"
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEFillBorderKernel.h"
#include "src/core/NEON/kernels/NEROIAlignLayerKernel.h"
namespace arm_compute
{
-Status NEROIAlignLayer::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status NEROIAlignLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info)
{
ARM_COMPUTE_RETURN_ON_ERROR(NEROIAlignLayerKernel::validate(input, rois, output, pool_info));
return Status{};
}
-void NEROIAlignLayer::configure(const ITensor *input, const ITensor *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info)
+void NEROIAlignLayer::configure(const ITensor *input,
+ const ITensor *rois,
+ ITensor *output,
+ const ROIPoolingLayerInfo &pool_info)
{
+ ARM_COMPUTE_LOG_PARAMS(input, rois, output, pool_info);
+
// Configure ROI pooling kernel
auto k = std::make_unique<NEROIAlignLayerKernel>();
k->configure(input, rois, output, pool_info);
diff --git a/src/runtime/NEON/functions/NEROIPoolingLayer.cpp b/src/runtime/NEON/functions/NEROIPoolingLayer.cpp
index f9434059ea..babec4aa92 100644
--- a/src/runtime/NEON/functions/NEROIPoolingLayer.cpp
+++ b/src/runtime/NEON/functions/NEROIPoolingLayer.cpp
@@ -22,26 +22,36 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h"
+
#include "arm_compute/core/Helpers.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEROIPoolingLayerKernel.h"
namespace arm_compute
{
NEROIPoolingLayer::~NEROIPoolingLayer() = default;
-NEROIPoolingLayer::NEROIPoolingLayer()
- : _roi_kernel()
+NEROIPoolingLayer::NEROIPoolingLayer() : _roi_kernel()
{
}
-Status NEROIPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status NEROIPoolingLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ const ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info)
{
return NEROIPoolingLayerKernel::validate(input, rois, output, pool_info);
}
-void NEROIPoolingLayer::configure(const ITensor *input, const ITensor *rois, const ITensor *output, const ROIPoolingLayerInfo &pool_info)
+void NEROIPoolingLayer::configure(const ITensor *input,
+ const ITensor *rois,
+ const ITensor *output,
+ const ROIPoolingLayerInfo &pool_info)
{
+ ARM_COMPUTE_LOG_PARAMS(input, rois, output, pool_info);
+
_roi_kernel = std::make_unique<NEROIPoolingLayerKernel>();
_roi_kernel->configure(input, rois, output, pool_info);
}
@@ -50,4 +60,4 @@ void NEROIPoolingLayer::run()
{
NEScheduler::get().schedule(_roi_kernel.get(), Window::DimX);
}
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NERange.cpp b/src/runtime/NEON/functions/NERange.cpp
index 56ef2bf657..95492df126 100644
--- a/src/runtime/NEON/functions/NERange.cpp
+++ b/src/runtime/NEON/functions/NERange.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,19 +24,21 @@
#include "arm_compute/runtime/NEON/functions/NERange.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NERangeKernel.h"
namespace arm_compute
{
NERange::~NERange() = default;
-NERange::NERange()
- : _kernel()
+NERange::NERange() : _kernel()
{
}
void NERange::configure(ITensor *output, const float start, const float end, const float step)
{
+ ARM_COMPUTE_LOG_PARAMS(output, start, end, step);
_kernel = std::make_unique<NERangeKernel>();
_kernel->configure(output, start, end, step);
}
@@ -50,4 +52,4 @@ void NERange::run()
{
NEScheduler::get().schedule(_kernel.get(), Window::DimX);
}
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEReduceMean.cpp b/src/runtime/NEON/functions/NEReduceMean.cpp
index b50a925f44..a23db87059 100644
--- a/src/runtime/NEON/functions/NEReduceMean.cpp
+++ b/src/runtime/NEON/functions/NEReduceMean.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,23 +24,25 @@
#include "arm_compute/runtime/NEON/functions/NEReduceMean.h"
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/kernels/NEReductionOperationKernel.h"
#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/NEON/kernels/NEReductionOperationKernel.h"
namespace arm_compute
{
namespace
{
-Status validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
+Status
+validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
{
ARM_COMPUTE_UNUSED(keep_dims);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8,
+ DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() < 1);
ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
@@ -48,29 +50,36 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax
const int input_dims = input->num_dimensions();
Coordinates axis_local = reduction_axis;
- for(unsigned int i = 0; i < axis_local.num_dimensions(); ++i)
+ for (unsigned int i = 0; i < axis_local.num_dimensions(); ++i)
{
//axis: The dimensions to reduce. Must be in the range [-rank(input_tensor), rank(input_tensor)).
ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] < (-static_cast<int>(input->num_dimensions())));
ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] >= static_cast<int>(input->num_dimensions()));
}
- if(output->tensor_shape().total_size() != 0)
+ if (output->tensor_shape().total_size() != 0)
{
// Only validate if not using auto_init for the output tensor
TensorShape out_shape = input->tensor_shape();
// Validate output_shape only if not using auto_init
convert_negative_axis(axis_local, input_dims);
+
+// Suppress warning produced by a compiler bug in GCC
+// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104165
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Warray-bounds"
std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
- for(unsigned int i = 0; i < reduction_ops; ++i)
+#pragma GCC diagnostic pop
+
+ for (unsigned int i = 0; i < reduction_ops; ++i)
{
ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3);
ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) > input->num_dimensions() - 1);
- if(output->total_size() > 0 && keep_dims)
+ if (output->total_size() > 0 && keep_dims)
{
ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1);
}
- if(keep_dims)
+ if (keep_dims)
{
out_shape.set(axis_local[i], 1);
}
@@ -79,19 +88,11 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax
ARM_COMPUTE_RETURN_ERROR_ON(i > static_cast<unsigned int>(axis_local[i]));
const unsigned int remove_index = axis_local[i] - i;
ARM_COMPUTE_RETURN_ERROR_ON(remove_index >= out_shape.num_dimensions());
- out_shape.remove_dimension(remove_index);
+ out_shape.remove_dimension(remove_index, false);
}
}
const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
- const bool requant = is_data_type_quantized(input->data_type()) && input->quantization_info() != output->quantization_info();
- if(requant)
- {
- TensorInfo input_no_quant(input->clone()->set_data_type(DataType::F32));
- NEDequantizationLayer::validate(input, &input_no_quant);
- TensorInfo output_no_quant(output->clone()->set_data_type(DataType::F32));
- NEQuantizationLayer::validate(&output_no_quant, output);
- }
}
return Status{};
}
@@ -100,25 +101,34 @@ Status validate_config(const ITensorInfo *input, const Coordinates &reduction_ax
NEReduceMean::~NEReduceMean() = default;
NEReduceMean::NEReduceMean(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), _dequant(), _requant(), _reduction_ops(), _keep_dims(), _do_requant(), _input_no_quant(),
- _output_no_quant()
+ : _memory_group(std::move(memory_manager)),
+ _reduction_kernels(),
+ _reduced_outs(),
+ _reshape(),
+ _reduction_ops(),
+ _keep_dims()
{
}
-Status NEReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
+Status NEReduceMean::validate(const ITensorInfo *input,
+ const Coordinates &reduction_axis,
+ bool keep_dims,
+ const ITensorInfo *output)
{
return validate_config(input, reduction_axis, keep_dims, output);
}
void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, ITensor *output)
{
+ ARM_COMPUTE_LOG_PARAMS(input, reduction_axis, keep_dims, output);
+
// Perform validate step
ARM_COMPUTE_ERROR_THROW_ON(NEReduceMean::validate(input->info(), reduction_axis, keep_dims, output->info()));
// Output auto initialization if not yet initialized
- const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input->info(), reduction_axis, keep_dims);
+ const TensorShape output_shape =
+ arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input->info(), reduction_axis, keep_dims);
auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
- _do_requant = is_data_type_quantized(input->info()->data_type()) && input->info()->quantization_info() != output->info()->quantization_info();
_reduction_ops = reduction_axis.num_dimensions();
_reduction_kernels.resize(_reduction_ops);
_reduced_outs.resize(_reduction_ops - (keep_dims ? 1 : 0));
@@ -126,18 +136,6 @@ void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis,
ITensor *tmp_input = input;
ITensor *tmp_output = output;
- if(_do_requant)
- {
- _memory_group.manage(&_input_no_quant);
- _memory_group.manage(&_output_no_quant);
- TensorInfo output_no_quant_info = input->info()->clone()->set_tensor_shape(output_shape);
- output_no_quant_info.set_data_type(DataType::F32);
- auto_init_if_empty(*_output_no_quant.info(), output_no_quant_info);
- auto_init_if_empty(*_input_no_quant.info(), input->info()->clone()->set_data_type(DataType::F32));
- _dequant.configure(input, &_input_no_quant);
- tmp_input = &_input_no_quant;
- tmp_output = &_output_no_quant;
- }
Coordinates axis_local = reduction_axis;
const int input_dims = tmp_input->info()->num_dimensions();
@@ -145,70 +143,65 @@ void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis,
convert_negative_axis(axis_local, input_dims);
// Perform reduction for every axis
- for(int i = 0; i < _reduction_ops; ++i)
+ for (int i = 0; i < _reduction_ops; ++i)
{
- TensorShape out_shape = i == 0 ? tmp_input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
+ TensorShape out_shape =
+ i == 0 ? tmp_input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
out_shape.set(axis_local[i], 1);
auto in = (i == 0) ? tmp_input : (&_reduced_outs[i - 1]);
- if(i == _reduction_ops - 1 && keep_dims)
+ if (i == _reduction_ops - 1 && keep_dims)
{
_reduction_kernels[i].configure(in, tmp_output, axis_local[i], ReductionOperation::MEAN_SUM);
}
else
{
- _reduced_outs[i].allocator()->init(TensorInfo(out_shape, tmp_input->info()->num_channels(), tmp_input->info()->data_type(), tmp_input->info()->quantization_info()));
+ _reduced_outs[i].allocator()->init(TensorInfo(out_shape, tmp_output->info()->num_channels(),
+ tmp_output->info()->data_type(),
+ tmp_output->info()->quantization_info()));
_memory_group.manage(&_reduced_outs[i]);
_reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i], ReductionOperation::MEAN_SUM);
}
}
// Allocate intermediate tensors
- for(int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
+ for (int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
{
_reduced_outs[i].allocator()->allocate();
}
-
// Configure reshape layer if we want to drop the dimensions
- if(!keep_dims)
+ if (!keep_dims)
{
TensorShape out_shape = tmp_input->info()->tensor_shape();
// We have to sort the reduction axis vectors in order for remove_dimension
// to work properly
+
+// Suppress warning produced by a compiler bug in GCC
+// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104165
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Warray-bounds"
std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops);
- for(int i = 0; i < _reduction_ops; ++i)
+#pragma GCC diagnostic pop
+
+ for (int i = 0; i < _reduction_ops; ++i)
{
- out_shape.remove_dimension(axis_local[i] - i);
+ out_shape.remove_dimension(axis_local[i] - i, false);
}
auto_init_if_empty(*tmp_output->info(), tmp_input->info()->clone()->set_tensor_shape(out_shape));
_reshape.configure(&_reduced_outs[_reduction_ops - 1], tmp_output);
}
- if(_do_requant)
- {
- _requant.configure(&_output_no_quant, output);
- _input_no_quant.allocator()->allocate();
- _output_no_quant.allocator()->allocate();
- }
}
void NEReduceMean::run()
{
MemoryGroupResourceScope scope_mg(_memory_group);
- if(_do_requant)
- {
- _dequant.run();
- }
- for(auto &kernel : _reduction_kernels)
+ for (auto &kernel : _reduction_kernels)
{
kernel.run();
}
- if(!_keep_dims)
+ if (!_keep_dims)
{
_reshape.run();
}
- if(_do_requant)
- {
- _requant.run();
- }
}
} // namespace arm_compute
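
The push/ignored/pop pragmas added around both std::sort calls above work around a GCC false positive (-Warray-bounds, bug 104165). The idiom generalizes: scope the suppression to the single statement that trips the warning so the diagnostic stays live everywhere else. A self-contained sketch of the same pattern:

#include <algorithm>
#include <array>

int main()
{
    std::array<int, 4> axes{3, 1, 2, 0};

// Suppress only around the offending statement, then restore the
// diagnostic state so -Warray-bounds keeps firing elsewhere.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Warray-bounds"
    std::sort(axes.begin(), axes.end()); // false positive fires here on affected GCC versions
#pragma GCC diagnostic pop

    return axes[0]; // 0 after sorting
}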
diff --git a/src/runtime/NEON/functions/NEReductionOperation.cpp b/src/runtime/NEON/functions/NEReductionOperation.cpp
index 5d6f520a52..8540d750fc 100644
--- a/src/runtime/NEON/functions/NEReductionOperation.cpp
+++ b/src/runtime/NEON/functions/NEReductionOperation.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,8 +26,10 @@
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/NEON/kernels/NEReductionOperationKernel.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/NEON/kernels/NEReductionOperationKernel.h"
namespace arm_compute
{
@@ -41,7 +43,7 @@ namespace
*/
size_t reduction_window_split_dimension(unsigned int axis)
{
- switch(axis)
+ switch (axis)
{
case 0:
return Window::DimY;
@@ -58,13 +60,21 @@ size_t reduction_window_split_dimension(unsigned int axis)
NEReductionOperation::~NEReductionOperation() = default;
NEReductionOperation::NEReductionOperation(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(memory_manager), _reduction_kernel(), _reshape(), _output_internal(), _window_split(0), _reduction_axis(), _is_reshape_required(false)
+ : _memory_group(memory_manager),
+ _reduction_kernel(),
+ _reshape(),
+ _output_internal(),
+ _window_split(0),
+ _reduction_axis(),
+ _is_reshape_required(false)
{
}
-Status NEReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims)
+Status NEReductionOperation::validate(
+ const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions,
+ "Reduction axis greater than max number of dimensions");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
const auto is_reshape_required = !keep_dims;
@@ -73,9 +83,10 @@ Status NEReductionOperation::validate(const ITensorInfo *input, const ITensorInf
TensorInfo info_before_reshape;
- if(is_reshape_required)
+ if (is_reshape_required)
{
- const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims));
+ const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(
+ arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output);
auto shape_before_reshape = input->tensor_shape();
@@ -83,17 +94,20 @@ Status NEReductionOperation::validate(const ITensorInfo *input, const ITensorInf
const auto input_num_channles = input->num_channels();
const auto input_qinfo = input->quantization_info();
- const auto is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN);
- const auto output_data_type = is_arg_min_max ? DataType::S32 : output->data_type();
+ const auto is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN);
+ const auto output_data_type = is_arg_min_max ? DataType::S32 : output->data_type();
- info_before_reshape.set_data_type(output_data_type).set_tensor_shape(shape_before_reshape).set_num_channels(input_num_channles).set_quantization_info(input_qinfo);
+ info_before_reshape.set_data_type(output_data_type)
+ .set_tensor_shape(shape_before_reshape)
+ .set_num_channels(input_num_channles)
+ .set_quantization_info(input_qinfo);
output_internal = &info_before_reshape;
}
ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperationKernel::validate(input, output_internal, axis, op));
- if(is_reshape_required)
+ if (is_reshape_required)
{
ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayer::validate(output_internal, output));
}
@@ -101,28 +115,43 @@ Status NEReductionOperation::validate(const ITensorInfo *input, const ITensorInf
return Status{};
}
-void NEReductionOperation::configure(ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op, bool keep_dims)
+void NEReductionOperation::configure(
+ ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op, bool keep_dims)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_LOG_PARAMS(input, output, axis, op, keep_dims);
_is_reshape_required = !keep_dims;
auto *output_internal = output;
const auto is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN);
- if(_is_reshape_required)
+ if (_is_reshape_required)
{
- const auto output_internal_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis);
- const auto output_external_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false);
- const auto output_data_type = is_arg_min_max ? DataType::S32 : input->info()->data_type();
- const auto num_channels = input->info()->num_channels();
- const auto qinfo = input->info()->quantization_info();
-
- _output_internal.allocator()->init(input->info()->clone()->set_data_type(output_data_type).set_tensor_shape(output_internal_shape).reset_padding().set_is_resizable(true).set_num_channels(
- num_channels).set_quantization_info(qinfo));
+ const auto output_internal_shape =
+ arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis);
+ const auto output_external_shape =
+ arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false);
+ const auto output_data_type = is_arg_min_max ? DataType::S32 : input->info()->data_type();
+ const auto num_channels = input->info()->num_channels();
+ const auto qinfo = input->info()->quantization_info();
+
+ _output_internal.allocator()->init(input->info()
+ ->clone()
+ ->set_data_type(output_data_type)
+ .set_tensor_shape(output_internal_shape)
+ .reset_padding()
+ .set_is_resizable(true)
+ .set_num_channels(num_channels)
+ .set_quantization_info(qinfo));
_memory_group.manage(&_output_internal);
output_internal = &_output_internal;
- auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(output_data_type).set_tensor_shape(output_external_shape).reset_padding().set_is_resizable(true));
+ auto_init_if_empty(*output->info(), input->info()
+ ->clone()
+ ->set_data_type(output_data_type)
+ .set_tensor_shape(output_external_shape)
+ .reset_padding()
+ .set_is_resizable(true));
}
ARM_COMPUTE_ERROR_THROW_ON(NEReductionOperation::validate(input->info(), output->info(), axis, op, keep_dims));
@@ -133,7 +162,7 @@ void NEReductionOperation::configure(ITensor *input, ITensor *output, unsigned i
_window_split = reduction_window_split_dimension(axis);
_reduction_axis = axis;
- if(_is_reshape_required)
+ if (_is_reshape_required)
{
_reshape.configure(output_internal, output);
_output_internal.allocator()->allocate();
@@ -144,7 +173,7 @@ void NEReductionOperation::run()
{
MemoryGroupResourceScope scope_mg(_memory_group);
NEScheduler::get().schedule(_reduction_kernel.get(), _window_split);
- if(_is_reshape_required)
+ if (_is_reshape_required)
{
_reshape.run();
}
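
For reference, a minimal usage sketch of the keep_dims = false path above: the function reduces into a memory-managed internal tensor that still carries the reduced axis at size 1, then reshapes it away into the caller's output. The shapes and the SUM operation here are arbitrary example choices, not requirements of the API.

#include <algorithm>

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::F32)); // axis 0 already dropped

    NEReductionOperation reduce;
    reduce.configure(&src, &dst, /*axis=*/0, ReductionOperation::SUM, /*keep_dims=*/false);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    std::fill_n(reinterpret_cast<float *>(src.buffer()), 8 * 4, 1.f);

    reduce.run(); // each of the four outputs is 8.f
    return 0;
}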
diff --git a/src/runtime/NEON/functions/NERemap.cpp b/src/runtime/NEON/functions/NERemap.cpp
deleted file mode 100644
index d9fd987480..0000000000
--- a/src/runtime/NEON/functions/NERemap.cpp
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NERemap.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "src/core/NEON/kernels/NERemapKernel.h"
-
-#include <utility>
-
-namespace arm_compute
-{
-void NERemap::configure(ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_x, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_y, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MSG(policy == InterpolationPolicy::AREA, "Area interpolation is not supported");
-
- auto k = std::make_unique<NERemapKernel>();
- k->configure(input, map_x, map_y, output, policy, border_mode, constant_border_value);
- _kernel = std::move(k);
-}
-} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEReorderLayer.cpp b/src/runtime/NEON/functions/NEReorderLayer.cpp
new file mode 100644
index 0000000000..89cf575f38
--- /dev/null
+++ b/src/runtime/NEON/functions/NEReorderLayer.cpp
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__aarch64__)
+
+#include "arm_compute/runtime/NEON/functions/NEReorderLayer.h"
+
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/core/NEON/kernels/NEReorderKernel.h"
+
+namespace arm_compute
+{
+NEReorderLayer::~NEReorderLayer() = default;
+
+NEReorderLayer::NEReorderLayer() : _reorder_kernel(std::make_unique<NEReorderKernel>())
+{
+}
+
+void NEReorderLayer::configure(const ITensor *input,
+ ITensor *output,
+ arm_compute::WeightFormat input_wf,
+ arm_compute::WeightFormat output_wf)
+{
+ auto k = std::make_unique<NEReorderKernel>();
+ k->configure(input, output, input_wf, output_wf);
+ _reorder_kernel = std::move(k);
+}
+
+void NEReorderLayer::run()
+{
+ // Run Reorder
+ NEScheduler::get().schedule(_reorder_kernel.get(), Window::DimX);
+}
+
+Status NEReorderLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ arm_compute::WeightFormat input_wf,
+ arm_compute::WeightFormat output_wf)
+{
+ return NEReorderKernel::validate(input, output, input_wf, output_wf);
+}
+
+} // namespace arm_compute
+
+#endif // defined(__aarch64__)
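
A usage sketch for the new aarch64-only function: validate first, then configure and run. The OHWI to OHWIo4 format pair and the tensor shapes below are assumptions chosen for illustration; NEReorderKernel::validate is the authority on what is actually accepted, which is why the sketch gates on it.

#include <iostream>

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEReorderLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(64U, 4U), 1, DataType::F32)); // assumed blocked layout

    const Status st = NEReorderLayer::validate(src.info(), dst.info(), WeightFormat::OHWI, WeightFormat::OHWIo4);
    if (!bool(st))
    {
        std::cout << "reorder not supported: " << st.error_description() << "\n";
        return 0;
    }

    NEReorderLayer reorder;
    reorder.configure(&src, &dst, WeightFormat::OHWI, WeightFormat::OHWIo4);
    src.allocator()->allocate();
    dst.allocator()->allocate();
    reorder.run();
    return 0;
}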
diff --git a/src/runtime/NEON/functions/NEReorgLayer.cpp b/src/runtime/NEON/functions/NEReorgLayer.cpp
index 23ca3a4eea..14e41d6df4 100644
--- a/src/runtime/NEON/functions/NEReorgLayer.cpp
+++ b/src/runtime/NEON/functions/NEReorgLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,12 +23,15 @@
*/
#include "arm_compute/runtime/NEON/functions/NEReorgLayer.h"
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEReorgLayerKernel.h"
namespace arm_compute
{
void NEReorgLayer::configure(const ITensor *input, ITensor *output, int32_t stride)
{
+ ARM_COMPUTE_LOG_PARAMS(input, output, stride);
+
auto k = std::make_unique<NEReorgLayerKernel>();
k->configure(input, output, stride);
_kernel = std::move(k);
diff --git a/src/runtime/NEON/functions/NEReshapeLayer.cpp b/src/runtime/NEON/functions/NEReshapeLayer.cpp
index c0c78ea652..bed70ff66c 100644
--- a/src/runtime/NEON/functions/NEReshapeLayer.cpp
+++ b/src/runtime/NEON/functions/NEReshapeLayer.cpp
@@ -24,7 +24,8 @@
#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
#include "arm_compute/core/Validate.h"
-#include "src/runtime/cpu/operators/CpuReshape.h"
+
+#include "src/cpu/operators/CpuReshape.h"
#include <utility>
@@ -32,16 +33,15 @@ namespace arm_compute
{
struct NEReshapeLayer::Impl
{
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuReshape> op{ nullptr };
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuReshape> op{nullptr};
};
-NEReshapeLayer::NEReshapeLayer()
- : _impl(std::make_unique<Impl>())
+NEReshapeLayer::NEReshapeLayer() : _impl(std::make_unique<Impl>())
{
}
-NEReshapeLayer::NEReshapeLayer(NEReshapeLayer &&) = default;
+NEReshapeLayer::NEReshapeLayer(NEReshapeLayer &&) = default;
NEReshapeLayer &NEReshapeLayer::operator=(NEReshapeLayer &&) = default;
NEReshapeLayer::~NEReshapeLayer() = default;
diff --git a/src/runtime/NEON/functions/NEReverse.cpp b/src/runtime/NEON/functions/NEReverse.cpp
index 36127ef83c..a90f8d2e76 100644
--- a/src/runtime/NEON/functions/NEReverse.cpp
+++ b/src/runtime/NEON/functions/NEReverse.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,19 +23,25 @@
*/
#include "arm_compute/runtime/NEON/functions/NEReverse.h"
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEReverseKernel.h"
namespace arm_compute
{
-void NEReverse::configure(const ITensor *input, ITensor *output, const ITensor *axis)
+void NEReverse::configure(const ITensor *input, ITensor *output, const ITensor *axis, bool use_inverted_axis)
{
+ ARM_COMPUTE_LOG_PARAMS(input, output, axis);
+
auto k = std::make_unique<NEReverseKernel>();
- k->configure(input, output, axis);
+ k->configure(input, output, axis, use_inverted_axis);
_kernel = std::move(k);
}
-Status NEReverse::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis)
+Status NEReverse::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *axis,
+ bool use_inverted_axis)
{
- return NEReverseKernel::validate(input, output, axis);
+ return NEReverseKernel::validate(input, output, axis, use_inverted_axis);
}
} // namespace arm_compute
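
A usage sketch of the widened NEReverse interface above. The semantics noted in the comment (the flag flips how indices in the axis tensor are interpreted, for frontends that count axes from the opposite end) are summarized from the kernel change and should be treated as an assumption; passing false preserves the previous behaviour.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEReverse.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor src, dst, axis;
    src.allocator()->init(TensorInfo(TensorShape(4U, 2U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(4U, 2U), 1, DataType::F32));
    axis.allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::U32));

    NEReverse reverse;
    reverse.configure(&src, &dst, &axis, /*use_inverted_axis=*/false);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    axis.allocator()->allocate();
    reinterpret_cast<uint32_t *>(axis.buffer())[0] = 0; // reverse along dimension 0

    reverse.run();
    return 0;
}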
diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp
index 0fbad07d0f..0d011064f6 100644
--- a/src/runtime/NEON/functions/NEScale.cpp
+++ b/src/runtime/NEON/functions/NEScale.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2021 Arm Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,32 +23,34 @@
*/
#include "arm_compute/runtime/NEON/functions/NEScale.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/Tensor.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/utils/ScaleUtils.h"
-#include "src/runtime/cpu/operators/CpuScale.h"
-#include "support/Rounding.h"
+#include "src/cpu/operators/CpuScale.h"
namespace arm_compute
{
struct NEScale::Impl
{
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- Tensor dx{ nullptr }; /**< Element's distance between the X real coordinate and the smallest X following integer */
- Tensor dy{ nullptr }; /**< Element's distance between the Y real coordinate and the smallest Y following integer */
- Tensor offsets{ nullptr }; /**< Offset to access the element with NEAREST interpolation or the top-left element with BILINEAR interpolation in the input tensor */
- std::unique_ptr<cpu::CpuScale> op{ nullptr };
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ Tensor dx{nullptr}; /**< Element's distance between the X real coordinate and the smallest X following integer */
+ Tensor dy{nullptr}; /**< Element's distance between the Y real coordinate and the smallest Y following integer */
+ Tensor offsets{
+ nullptr}; /**< Offset to access the element with NEAREST interpolation or the top-left element with BILINEAR interpolation in the input tensor */
+ std::unique_ptr<cpu::CpuScale> op{nullptr};
};
-NEScale::NEScale()
- : _impl(std::make_unique<Impl>())
+NEScale::NEScale() : _impl(std::make_unique<Impl>())
{
}
NEScale::~NEScale() = default;
void NEScale::configure(ITensor *input, ITensor *output, const ScaleKernelInfo &info)
{
+ ARM_COMPUTE_LOG_PARAMS(input, output, info);
+
_impl->src = input;
_impl->dst = output;
_impl->op = std::make_unique<cpu::CpuScale>();
@@ -56,50 +58,71 @@ void NEScale::configure(ITensor *input, ITensor *output, const ScaleKernelInfo &
// Configure for size of allocation of internal tensors
// Get data layout and width/height indices
- const DataLayout data_layout = info.data_layout == DataLayout::UNKNOWN ? input->info()->data_layout() : info.data_layout;
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const DataLayout data_layout =
+ info.data_layout == DataLayout::UNKNOWN ? input->info()->data_layout() : info.data_layout;
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
// Compute the ratio between source width/height and destination width/height
- const bool is_align_corners_used = info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy);
- const auto wr = arm_compute::scale_utils::calculate_resize_ratio(input->info()->dimension(idx_width), output->info()->dimension(idx_width), is_align_corners_used);
- const auto hr = arm_compute::scale_utils::calculate_resize_ratio(input->info()->dimension(idx_height), output->info()->dimension(idx_height), is_align_corners_used);
+ const bool is_align_corners_used =
+ info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy);
+ const auto wr = arm_compute::scale_utils::calculate_resize_ratio(
+ input->info()->dimension(idx_width), output->info()->dimension(idx_width), is_align_corners_used);
+ const auto hr = arm_compute::scale_utils::calculate_resize_ratio(
+ input->info()->dimension(idx_height), output->info()->dimension(idx_height), is_align_corners_used);
// Area interpolation behaves as Nearest Neighbour in case of up-sampling
- InterpolationPolicy policy_to_use = (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ? InterpolationPolicy::NEAREST_NEIGHBOR : info.interpolation_policy;
+ InterpolationPolicy policy_to_use =
+ (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f)
+ ? InterpolationPolicy::NEAREST_NEIGHBOR
+ : info.interpolation_policy;
// Get the tensor shape
TensorShape shape(output->info()->dimension(idx_width));
shape.set(1, output->info()->dimension(idx_height), false);
- const TensorInfo tensor_info_dxdy(shape, Format::F32);
- const TensorInfo tensor_info_offsets(shape, Format::S32);
+ bool precompute_indices_weights = arm_compute::scale_utils::is_precomputation_required(
+ data_layout, input->info()->data_type(), policy_to_use, info.border_mode);
- _impl->dx.allocator()->init(tensor_info_dxdy);
- _impl->dy.allocator()->init(tensor_info_dxdy);
- _impl->offsets.allocator()->init(tensor_info_offsets);
- switch(policy_to_use)
+ if (precompute_indices_weights)
{
- case InterpolationPolicy::NEAREST_NEIGHBOR:
- {
- // Allocate once the configure methods have been called
- _impl->offsets.allocator()->allocate();
- break;
- }
- case InterpolationPolicy::BILINEAR:
+ const TensorInfo tensor_info_dxdy(shape, Format::F32);
+ const TensorInfo tensor_info_offsets(shape, Format::S32);
+
+ _impl->dx.allocator()->init(tensor_info_dxdy);
+ _impl->dy.allocator()->init(tensor_info_dxdy);
+ _impl->offsets.allocator()->init(tensor_info_offsets);
+ switch (policy_to_use)
{
- // Allocate once the configure methods have been called
- _impl->dx.allocator()->allocate();
- _impl->dy.allocator()->allocate();
- _impl->offsets.allocator()->allocate();
- break;
+ case InterpolationPolicy::NEAREST_NEIGHBOR:
+ {
+ // Allocate once the configure methods have been called
+ _impl->offsets.allocator()->allocate();
+ break;
+ }
+ case InterpolationPolicy::BILINEAR:
+ {
+ // Allocate once the configure methods have been called
+ _impl->dx.allocator()->allocate();
+ _impl->dy.allocator()->allocate();
+ _impl->offsets.allocator()->allocate();
+ break;
+ }
+ case InterpolationPolicy::AREA:
+ {
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Unsupported interpolation mode");
}
- case InterpolationPolicy::AREA:
+ }
+ else
+ {
+ if (policy_to_use != InterpolationPolicy::NEAREST_NEIGHBOR && policy_to_use != InterpolationPolicy::BILINEAR &&
+ policy_to_use != InterpolationPolicy::AREA)
{
- break;
- }
- default:
ARM_COMPUTE_ERROR("Unsupported interpolation mode");
+ }
}
}
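
Callers are unaffected by the precompute gating above: is_precomputation_required decides internally whether the dx/dy/offsets lookup tensors are built for the chosen policy, data type, layout and border mode, and otherwise configure() only sanity-checks the policy. A minimal usage sketch; the two ScaleKernelInfo constructor arguments shown are the interpolation policy and border mode, with everything else defaulted, and BILINEAR plus REPLICATE is an arbitrary example choice.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEScale.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(32U, 32U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::F32)); // 2x upscale

    NEScale scale;
    scale.configure(&src, &dst, ScaleKernelInfo{InterpolationPolicy::BILINEAR, BorderMode::REPLICATE});

    src.allocator()->allocate();
    dst.allocator()->allocate();
    scale.run();
    return 0;
}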
diff --git a/src/runtime/NEON/functions/NESelect.cpp b/src/runtime/NEON/functions/NESelect.cpp
index f8ba9f03ed..55cad2202b 100644
--- a/src/runtime/NEON/functions/NESelect.cpp
+++ b/src/runtime/NEON/functions/NESelect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,12 +24,16 @@
#include "arm_compute/runtime/NEON/functions/NESelect.h"
#include "arm_compute/core/Types.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NESelectKernel.h"
namespace arm_compute
{
void NESelect::configure(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output)
{
+ ARM_COMPUTE_LOG_PARAMS(c, x, y, output);
+
auto k = std::make_unique<NESelectKernel>();
k->configure(c, x, y, output);
_kernel = std::move(k);
diff --git a/src/runtime/NEON/functions/NESlice.cpp b/src/runtime/NEON/functions/NESlice.cpp
index 9b08bca38a..12d43adc84 100644
--- a/src/runtime/NEON/functions/NESlice.cpp
+++ b/src/runtime/NEON/functions/NESlice.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,17 +25,23 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/helpers/tensor_transform.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEStridedSliceKernel.h"
namespace arm_compute
{
namespace experimental
{
-void NESlice::configure(const ITensorInfo *input, ITensorInfo *output, const Coordinates &starts, const Coordinates &ends)
+void NESlice::configure(const ITensorInfo *input,
+ ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_LOG_PARAMS(input, output, starts, ends);
// Get absolute end coordinates
const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends);
@@ -45,15 +51,16 @@ void NESlice::configure(const ITensorInfo *input, ITensorInfo *output, const Coo
_kernel = std::move(k);
}
-Status NESlice::validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends)
+Status NESlice::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
// Check start dimensions for being non-negative
- ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(starts.cbegin(), starts.cbegin() + starts.num_dimensions(), [](int i)
- {
- return i < 0;
- }));
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ std::any_of(starts.cbegin(), starts.cbegin() + starts.num_dimensions(), [](int i) { return i < 0; }));
// Get absolute end coordinates
const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends);
@@ -64,20 +71,22 @@ Status NESlice::validate(const ITensorInfo *input, const ITensorInfo *output, co
struct NESlice::Impl
{
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<experimental::NESlice> op{ nullptr };
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<experimental::NESlice> op{nullptr};
};
-NESlice::NESlice()
- : _impl(std::make_unique<Impl>())
+NESlice::NESlice() : _impl(std::make_unique<Impl>())
{
}
-NESlice::NESlice(NESlice &&) = default;
+NESlice::NESlice(NESlice &&) = default;
NESlice &NESlice::operator=(NESlice &&) = default;
NESlice::~NESlice() = default;
-Status NESlice::validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends)
+Status NESlice::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends)
{
return experimental::NESlice::validate(input, output, starts, ends);
}
diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
index bee692c08b..be588c5b52 100644
--- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp
+++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,26 +22,26 @@
* SOFTWARE.
*/
#include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
+
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/Tensor.h"
-#include "src/core/cpu/kernels/CpuSoftmaxKernel.h"
+
#include "src/core/helpers/MemoryHelpers.h"
#include "src/core/helpers/SoftmaxHelpers.h"
-#include "src/runtime/cpu/operators/CpuSoftmax.h"
+#include "src/cpu/operators/CpuSoftmax.h"
namespace arm_compute
{
template <bool IS_LOG>
struct NESoftmaxLayerGeneric<IS_LOG>::Impl
{
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- Tensor max{ nullptr };
- std::unique_ptr<cpu::CpuSoftmaxGeneric<IS_LOG>> op{ nullptr };
- MemoryGroup memory_group{};
- ITensorPack run_pack{};
- WorkspaceData<Tensor> workspace_tensors{};
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuSoftmaxGeneric> op{nullptr};
+ MemoryGroup memory_group{};
+ ITensorPack run_pack{};
+ WorkspaceData<Tensor> workspace_tensors{};
};
template <bool IS_LOG>
@@ -53,9 +53,9 @@ NESoftmaxLayerGeneric<IS_LOG>::NESoftmaxLayerGeneric(std::shared_ptr<IMemoryMana
template <bool IS_LOG>
NESoftmaxLayerGeneric<IS_LOG>::NESoftmaxLayerGeneric(NESoftmaxLayerGeneric &&) = default;
-template <bool IS_LOG>
+template <bool IS_LOG>
NESoftmaxLayerGeneric<IS_LOG> &NESoftmaxLayerGeneric<IS_LOG>::operator=(NESoftmaxLayerGeneric &&) = default;
-template <bool IS_LOG>
+template <bool IS_LOG>
NESoftmaxLayerGeneric<IS_LOG>::~NESoftmaxLayerGeneric() = default;
template <bool IS_LOG>
@@ -65,23 +65,24 @@ void NESoftmaxLayerGeneric<IS_LOG>::configure(ITensor *input, ITensor *output, f
_impl->src = input;
_impl->dst = output;
- _impl->op = std::make_unique<cpu::CpuSoftmaxGeneric<IS_LOG>>();
- _impl->op->configure(input->info(), output->info(), beta, axis);
+ _impl->op = std::make_unique<cpu::CpuSoftmaxGeneric>();
+ _impl->op->configure(input->info(), output->info(), beta, axis, IS_LOG);
- _impl->run_pack = { { TensorType::ACL_SRC, _impl->src }, { TensorType::ACL_DST, _impl->dst } };
+ _impl->run_pack = {{TensorType::ACL_SRC, _impl->src}, {TensorType::ACL_DST, _impl->dst}};
_impl->workspace_tensors = manage_workspace<Tensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack);
}
template <bool IS_LOG>
-Status NESoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis)
+Status
+NESoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuSoftmaxGeneric<IS_LOG>::validate(input, output, beta, axis));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuSoftmaxGeneric::validate(input, output, beta, axis, IS_LOG));
return Status{};
}
template <bool IS_LOG>
-void NESoftmaxLayerGeneric<IS_LOG>::run()
+void NESoftmaxLayerGeneric<IS_LOG>::run()
{
// Acquire all the temporaries
MemoryGroupResourceScope scope_mg(_impl->memory_group);
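
The structural change above is a de-templating: cpu::CpuSoftmaxGeneric<IS_LOG> becomes cpu::CpuSoftmaxGeneric, with IS_LOG passed as a runtime argument to configure() and validate(). In miniature (illustrative names, not the library's classes), the refactor trades one binary instantiation per flag value for a single runtime branch:

#include <algorithm>
#include <cmath>
#include <vector>

// One operator class serving both softmax and log-softmax via a runtime
// flag, instead of two template instantiations.
class SoftmaxOp
{
public:
    void configure(bool is_log) { _is_log = is_log; }

    std::vector<float> run(const std::vector<float> &x) const
    {
        const float m = *std::max_element(x.begin(), x.end()); // for numerical stability
        std::vector<float> out(x.size());
        float sum = 0.f;
        for (std::size_t i = 0; i < x.size(); ++i)
        {
            out[i] = std::exp(x[i] - m);
            sum += out[i];
        }
        for (auto &v : out)
        {
            v = _is_log ? std::log(v) - std::log(sum) : v / sum;
        }
        return out;
    }

private:
    bool _is_log{false};
};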
diff --git a/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp b/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp
index e8a84246fe..556ebdd800 100644
--- a/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp
+++ b/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp
@@ -28,24 +28,29 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/NEON/functions/NEFill.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NESpaceToBatchLayerKernel.h"
namespace arm_compute
{
NESpaceToBatchLayer::~NESpaceToBatchLayer() = default;
-NESpaceToBatchLayer::NESpaceToBatchLayer()
- : _space_to_batch_kernel(), _fill_f(), _has_padding(false)
+NESpaceToBatchLayer::NESpaceToBatchLayer() : _space_to_batch_kernel(), _fill_f(), _has_padding(false)
{
}
-void NESpaceToBatchLayer::configure(const ITensor *input, const ITensor *block_shape, const ITensor *paddings, ITensor *output)
+void NESpaceToBatchLayer::configure(const ITensor *input,
+ const ITensor *block_shape,
+ const ITensor *paddings,
+ ITensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output);
+ ARM_COMPUTE_LOG_PARAMS(input, block_shape, paddings, output);
- if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
+ if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
{
_has_padding = true;
_fill_f = std::make_unique<NEFill>();
@@ -55,11 +60,16 @@ void NESpaceToBatchLayer::configure(const ITensor *input, const ITensor *block_s
_space_to_batch_kernel->configure(input, block_shape, paddings, output);
}
-void NESpaceToBatchLayer::configure(const ITensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ITensor *output)
+void NESpaceToBatchLayer::configure(const ITensor *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ ITensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
+ if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
{
_has_padding = true;
_fill_f = std::make_unique<NEFill>();
@@ -69,17 +79,25 @@ void NESpaceToBatchLayer::configure(const ITensor *input, const int block_shape_
_space_to_batch_kernel->configure(input, block_shape_x, block_shape_y, padding_left, padding_right, output);
}
-Status NESpaceToBatchLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output)
+Status NESpaceToBatchLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *block_shape,
+ const ITensorInfo *paddings,
+ const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToBatchLayerKernel::validate(input, block_shape, paddings, output));
return Status{};
}
-Status NESpaceToBatchLayer::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+Status NESpaceToBatchLayer::validate(const ITensorInfo *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ NESpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
return Status{};
}
@@ -87,7 +105,7 @@ Status NESpaceToBatchLayer::validate(const ITensorInfo *input, const int block_s
void NESpaceToBatchLayer::run()
{
// Zero out output only if we have paddings
- if(_has_padding)
+ if (_has_padding)
{
_fill_f->run();
}
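
The _has_padding flag set in configure() drives the conditional fill in run() above: when the output holds more elements than the input, the padded regions must be given a well-defined value before the space-to-batch scatter writes the real data. The pattern in isolation, as a sketch with zero as the fill value (the library fills with the appropriate zero point in the quantized case):

#include <algorithm>
#include <vector>

// Conditional pre-fill: only pay the cost of clearing the output when the
// destination is strictly larger than the source, i.e. padding exists.
void scatter_with_padding(const std::vector<float> &src, std::vector<float> &dst)
{
    const bool has_padding = dst.size() > src.size();
    if (has_padding)
    {
        std::fill(dst.begin(), dst.end(), 0.f); // padded slots become well-defined
    }
    // Identity placement stands in for the real block reshuffle here.
    std::copy(src.begin(), src.end(), dst.begin());
}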
diff --git a/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp b/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp
index 1e3776c448..846b619429 100644
--- a/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp
+++ b/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,20 +29,23 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NESpaceToDepthLayerKernel.h"
namespace arm_compute
{
NESpaceToDepthLayer::~NESpaceToDepthLayer() = default;
-NESpaceToDepthLayer::NESpaceToDepthLayer()
- : _space_to_depth_kernel()
+NESpaceToDepthLayer::NESpaceToDepthLayer() : _space_to_depth_kernel()
{
}
void NESpaceToDepthLayer::configure(const ITensor *input, ITensor *output, int32_t block_shape)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_LOG_PARAMS(input, output, block_shape);
+
_space_to_depth_kernel = std::make_unique<NESpaceToDepthLayerKernel>();
_space_to_depth_kernel->configure(input, output, block_shape);
}
diff --git a/src/runtime/NEON/functions/NESplit.cpp b/src/runtime/NEON/functions/NESplit.cpp
index db19bbb824..53b09e9ae5 100644
--- a/src/runtime/NEON/functions/NESplit.cpp
+++ b/src/runtime/NEON/functions/NESplit.cpp
@@ -34,7 +34,7 @@ namespace arm_compute
{
void NESplit::run()
{
- for(unsigned i = 0; i < _num_outputs; ++i)
+ for (unsigned i = 0; i < _num_outputs; ++i)
{
_slice_functions[i].run();
}
diff --git a/src/runtime/NEON/functions/NEStackLayer.cpp b/src/runtime/NEON/functions/NEStackLayer.cpp
index af5c80d036..2f88ffca2a 100644
--- a/src/runtime/NEON/functions/NEStackLayer.cpp
+++ b/src/runtime/NEON/functions/NEStackLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,6 +30,8 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEStackLayerKernel.h"
namespace arm_compute
@@ -37,25 +39,18 @@ namespace arm_compute
NEStackLayer::~NEStackLayer() = default;
NEStackLayer::NEStackLayer() // NOLINT
- : _input(),
- _stack_kernels(),
- _num_inputs(0)
+ : _stack_kernel(std::make_unique<NEStackLayerKernel>()), _is_prepared(false)
{
}
void NEStackLayer::configure(const std::vector<ITensor *> &input, int axis, ITensor *output)
{
- _num_inputs = input.size();
- _stack_kernels.resize(_num_inputs);
+ ARM_COMPUTE_LOG_PARAMS(input, axis, output);
// Wrap around negative values
const unsigned int axis_u = wrap_around(axis, static_cast<int>(input[0]->info()->num_dimensions() + 1));
- for(unsigned int i = 0; i < _num_inputs; i++)
- {
- _stack_kernels[i] = std::make_unique<NEStackLayerKernel>();
- _stack_kernels[i]->configure(input[i], axis_u, i, _num_inputs, output);
- }
+ _stack_kernel->configure(input, axis_u, output);
}
Status NEStackLayer::validate(const std::vector<ITensorInfo *> &input, int axis, const ITensorInfo *output)
@@ -67,24 +62,20 @@ Status NEStackLayer::validate(const std::vector<ITensorInfo *> &input, int axis,
const size_t rank = input[0]->num_dimensions();
const unsigned int axis_u = wrap_around(axis, static_cast<int>(rank + 1));
- const unsigned int num_inputs = input.size();
-
- for(unsigned int i = 0; i < num_inputs; i++)
- {
- // All the tensors must have the same rank
- ARM_COMPUTE_RETURN_ERROR_ON(input[i]->num_dimensions() != rank);
- // Validate Kernel
- ARM_COMPUTE_RETURN_ON_ERROR(NEStackLayerKernel::validate(input[i], axis_u, i, num_inputs, output));
- }
+ // Validate Kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(NEStackLayerKernel::validate(input, axis_u, output));
return Status{};
}
void NEStackLayer::run()
{
- for(unsigned i = 0; i < _num_inputs; i++)
+ if (!_is_prepared)
{
- NEScheduler::get().schedule(_stack_kernels[i].get(), Window::DimY);
+ _stack_kernel->prepare();
+ _is_prepared = true;
}
+
+ NEScheduler::get().schedule(_stack_kernel.get(), _stack_kernel->get_split_dimension());
}
} // namespace arm_compute
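
Two things change in NEStackLayer above: configure() now hands the whole input vector to a single NEStackLayerKernel instead of one kernel per tensor, and run() lazily calls prepare() exactly once before scheduling along the kernel's preferred split dimension. The lazy one-shot prepare idiom in isolation (illustrative names):

#include <iostream>

class LazyKernel
{
public:
    void run()
    {
        if (!_is_prepared)
        {
            prepare(); // one-time setup, e.g. computing a schedule
            _is_prepared = true;
        }
        std::cout << "execute\n";
    }

private:
    void prepare() { std::cout << "prepare\n"; }
    bool _is_prepared{false};
};

int main()
{
    LazyKernel k;
    k.run(); // prints "prepare" then "execute"
    k.run(); // prints only "execute"
    return 0;
}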
diff --git a/src/runtime/NEON/functions/NEStridedSlice.cpp b/src/runtime/NEON/functions/NEStridedSlice.cpp
index fffb38c3ca..6a3ac8be05 100644
--- a/src/runtime/NEON/functions/NEStridedSlice.cpp
+++ b/src/runtime/NEON/functions/NEStridedSlice.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,24 +25,38 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
+
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NEStridedSliceKernel.h"
namespace arm_compute
{
namespace experimental
{
-void NEStridedSlice::configure(const ITensorInfo *input, ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+void NEStridedSlice::configure(const ITensorInfo *input,
+ ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
+ ARM_COMPUTE_LOG_PARAMS(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+
auto k = std::make_unique<NEStridedSliceKernel>();
k->configure(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
_kernel = std::move(k);
}
-Status NEStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+Status NEStridedSlice::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
return NEStridedSliceKernel::validate(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
}
@@ -50,22 +64,26 @@ Status NEStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *out
struct NEStridedSlice::Impl
{
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<experimental::NEStridedSlice> op{ nullptr };
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<experimental::NEStridedSlice> op{nullptr};
};
-NEStridedSlice::NEStridedSlice()
- : _impl(std::make_unique<Impl>())
+NEStridedSlice::NEStridedSlice() : _impl(std::make_unique<Impl>())
{
}
-NEStridedSlice::NEStridedSlice(NEStridedSlice &&) = default;
+NEStridedSlice::NEStridedSlice(NEStridedSlice &&) = default;
NEStridedSlice &NEStridedSlice::operator=(NEStridedSlice &&) = default;
NEStridedSlice::~NEStridedSlice() = default;
-void NEStridedSlice::configure(const ITensor *input, ITensor *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+void NEStridedSlice::configure(const ITensor *input,
+ ITensor *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
_impl->src = input;
_impl->dst = output;
@@ -81,10 +99,16 @@ void NEStridedSlice::run()
_impl->op->run(pack);
}
-Status NEStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+Status NEStridedSlice::validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
- return experimental::NEStridedSlice::validate(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+ return experimental::NEStridedSlice::validate(input, output, starts, ends, strides, begin_mask, end_mask,
+ shrink_axis_mask);
}
} // namespace arm_compute
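Reviewer note: NEStridedSlice now follows the function-as-operator-wrapper pattern used throughout this patch: the public class stores src/dst in a pImpl, configures an experimental operator on ITensorInfo objects, and dispatches the tensors through an ITensorPack at run time. Illustrative usage, assuming allocated tensors (the coordinate and mask values are examples only):

    Tensor src, dst;                             // assume initialized and allocated
    NEStridedSlice slice;
    slice.configure(&src, &dst, Coordinates(0, 0), Coordinates(2, 2), BiStrides(1, 1),
                    /* begin_mask */ 0, /* end_mask */ 0, /* shrink_axis_mask */ 0);
    slice.run();                                 // internally packs src/dst and runs the operator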
diff --git a/src/runtime/NEON/functions/NETile.cpp b/src/runtime/NEON/functions/NETile.cpp
index 088816eb95..d10b1c8e95 100644
--- a/src/runtime/NEON/functions/NETile.cpp
+++ b/src/runtime/NEON/functions/NETile.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,12 +23,15 @@
*/
#include "arm_compute/runtime/NEON/functions/NETile.h"
+#include "src/common/utils/Log.h"
#include "src/core/NEON/kernels/NETileKernel.h"
namespace arm_compute
{
void NETile::configure(const ITensor *input, ITensor *output, const Multiples &multiples)
{
+ ARM_COMPUTE_LOG_PARAMS(input, output, multiples);
+
auto k = std::make_unique<NETileKernel>();
k->configure(input, output, multiples);
_kernel = std::move(k);
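Reviewer note: the only functional change to NETile is the added parameter logging; configure() still builds an NETileKernel and hands it to the simple-function base class, whose run() schedules it. Illustrative usage, assuming allocated tensors:

    Tensor src, dst;                             // assume initialized and allocated
    NETile tile;
    tile.configure(&src, &dst, Multiples{2, 1}); // replicate the tensor twice along dimension 0
    tile.run();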
diff --git a/src/runtime/NEON/functions/NETranspose.cpp b/src/runtime/NEON/functions/NETranspose.cpp
index 3b3023f3b3..0144a85e8c 100644
--- a/src/runtime/NEON/functions/NETranspose.cpp
+++ b/src/runtime/NEON/functions/NETranspose.cpp
@@ -24,19 +24,20 @@
#include "arm_compute/runtime/NEON/functions/NETranspose.h"
#include "arm_compute/core/Validate.h"
-#include "src/runtime/cpu/operators/CpuTranspose.h"
+
+#include "src/common/utils/Log.h"
+#include "src/cpu/operators/CpuTranspose.h"
namespace arm_compute
{
struct NETranspose::Impl
{
- const ITensor *src{ nullptr };
- ITensor *dst{ nullptr };
- std::unique_ptr<cpu::CpuTranspose> op{ nullptr };
+ const ITensor *src{nullptr};
+ ITensor *dst{nullptr};
+ std::unique_ptr<cpu::CpuTranspose> op{nullptr};
};
-NETranspose::NETranspose()
- : _impl(std::make_unique<Impl>())
+NETranspose::NETranspose() : _impl(std::make_unique<Impl>())
{
}
@@ -45,6 +46,7 @@ NETranspose::~NETranspose() = default;
void NETranspose::configure(const ITensor *input, ITensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_LOG_PARAMS(input, output);
_impl->src = input;
_impl->dst = output;
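Reviewer note: the hunk ends before the remainder of configure() and run(). Based on the Impl members above, the elided body presumably configures cpu::CpuTranspose on the tensors' infos and dispatches through an ITensorPack; a hedged reconstruction (the real source may differ in detail):

    _impl->op = std::make_unique<cpu::CpuTranspose>();
    _impl->op->configure(input->info(), output->info());

    // ... and in run():
    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC, _impl->src);
    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
    _impl->op->run(pack);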
diff --git a/src/runtime/NEON/functions/NEUnstack.cpp b/src/runtime/NEON/functions/NEUnstack.cpp
index 50596dbc0a..2f7ed2bb1f 100644
--- a/src/runtime/NEON/functions/NEUnstack.cpp
+++ b/src/runtime/NEON/functions/NEUnstack.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,6 +29,8 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/common/utils/Log.h"
+
namespace arm_compute
{
namespace
@@ -38,13 +40,15 @@ inline unsigned int wrap_axis(int axis, const ITensorInfo *const tensor)
return wrap_around(axis, static_cast<int>(tensor->num_dimensions()));
}
-inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, int32_t &slice_end_mask, const unsigned int input_num_dimensions)
+inline void setup_slice_coordinates_and_mask(Coordinates &slice_start,
+ int32_t &slice_end_mask,
+ const unsigned int input_num_dimensions)
{
    // Sets up coordinates to slice the input tensor: start coordinates are all 0s, and the unstacking axis of both start/end is set so that just one 2D tensor is sliced at a time.
Coordinates slice_end;
slice_start.set_num_dimensions(input_num_dimensions);
slice_end.set_num_dimensions(input_num_dimensions);
- for(size_t k = 0; k < input_num_dimensions; ++k)
+ for (size_t k = 0; k < input_num_dimensions; ++k)
{
slice_start.set(k, 0);
slice_end.set(k, -1);
@@ -54,22 +58,23 @@ inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, int32_t &
} // namespace
NEUnstack::NEUnstack() // NOLINT
- : _num_slices(0),
- _strided_slice_vector()
+ : _num_slices(0), _strided_slice_vector()
{
}
void NEUnstack::configure(const ITensor *input, const std::vector<ITensor *> &output_vector, int axis)
{
std::vector<ITensorInfo *> outputs_vector_info(output_vector.size());
- std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(), [](ITensor * t)
- {
- ARM_COMPUTE_ERROR_ON_NULLPTR(t);
- return t->info();
- });
+ std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(),
+ [](ITensor *t)
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR(t);
+ return t->info();
+ });
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
ARM_COMPUTE_ERROR_THROW_ON(NEUnstack::validate(input->info(), outputs_vector_info, axis));
+ ARM_COMPUTE_LOG_PARAMS(input, output_vector, axis);
// Wrap around negative values
const unsigned int axis_u = wrap_axis(axis, input->info());
@@ -79,11 +84,12 @@ void NEUnstack::configure(const ITensor *input, const std::vector<ITensor *> &ou
Coordinates slice_start;
int32_t slice_end_mask;
setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->info()->tensor_shape().num_dimensions());
- for(unsigned int slice = 0; slice < _num_slices; ++slice)
+ for (unsigned int slice = 0; slice < _num_slices; ++slice)
{
        // Adjust the start coordinate so each iteration takes one 2D slice at a time
slice_start.set(axis_u, slice);
- _strided_slice_vector[slice].configure(input, output_vector[slice], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << axis_u));
+ _strided_slice_vector[slice].configure(input, output_vector[slice], slice_start, Coordinates(), BiStrides(), 0,
+ slice_end_mask, (1 << axis_u));
}
}
@@ -100,18 +106,20 @@ Status NEUnstack::validate(const ITensorInfo *input, const std::vector<ITensorIn
Coordinates slice_start;
int32_t slice_end_mask;
- for(size_t k = 0; k < num_slices; ++k)
+ for (size_t k = 0; k < num_slices; ++k)
{
slice_start.set(wrap_axis(axis, input), k);
setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->tensor_shape().num_dimensions());
- ARM_COMPUTE_RETURN_ON_ERROR(NEStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << wrap_axis(axis, input))));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(),
+ BiStrides(), 0, slice_end_mask,
+ (1 << wrap_axis(axis, input))));
}
return Status{};
}
void NEUnstack::run()
{
- for(unsigned i = 0; i < _num_slices; ++i)
+ for (unsigned i = 0; i < _num_slices; ++i)
{
_strided_slice_vector[i].run();
}
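Reviewer note: a worked example of the slicing configured above. For an input of shape (W, H, N) unstacked along axis 2, wrap_axis() yields axis_u = 2 and _num_slices = N. Slice k then runs a strided slice with slice_start = (0, 0, k), default unit strides, slice_end_mask keeping the full range on dimensions 0 and 1, and shrink_axis_mask = 1 << 2 = 4, which drops the unstacked dimension so each output is a 2D (W, H) tensor.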
diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
index 0bf1738bec..7334be8456 100644
--- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2021 Arm Limited.
+ * Copyright (c) 2017-2022, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,759 +24,93 @@
#include "arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h"
#include "arm_compute/core/Error.h"
+#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
-#include "src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
-#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
-#include "src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h"
-#include "src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h"
+#include "arm_compute/core/Validate.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/MemoryHelpers.h"
#include "src/core/NEON/kernels/convolution/common/utils.hpp"
-#include "src/core/NEON/kernels/convolution/winograd/winograd.hpp"
+#include "src/cpu/kernels/CpuWinogradConv2dKernel.h"
+#include "src/cpu/operators/CpuWinogradConv2d.h"
namespace arm_compute
{
-namespace
-{
-inline Status validate_kernel_3x3(const Size2D input_dims, const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
- const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
-
- if(input->data_type() == DataType::F32)
- {
- if(input_dims.width > 4 && input_dims.height > 4)
- {
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 4, 4, 3, 3>::validate(input, input0, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 4, 4, 3, 3>::validate(weights, input1, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 4, 4, 3, 3>::validate(batched_mm_output, biases, output, winograd_info)));
- }
- else
- {
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 2, 2, 3, 3>::validate(input, input0, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 2, 2, 3, 3>::validate(weights, input1, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 2, 2, 3, 3>::validate(batched_mm_output, biases, output, winograd_info)));
- }
- }
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- else if(input->data_type() == DataType::F16)
- {
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<__fp16, 4, 4, 3, 3>::validate(input, input0, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<__fp16, 4, 4, 3, 3>::validate(weights, input1, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<__fp16, 4, 4, 3, 3>::validate(batched_mm_output, biases, output, winograd_info)));
- }
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
- if(act_info.enabled())
- {
- NEActivationLayer::validate(output, nullptr, act_info);
- }
- return Status{};
-}
-
-inline Status validate_kernel_5x5(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
- const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 2, 2, 5, 5>::validate(input, input0, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 2, 2, 5, 5>::validate(weights, input1, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 2, 2, 5, 5>::validate(batched_mm_output, biases, output, winograd_info)));
- if(act_info.enabled())
- {
- NEActivationLayer::validate(output, nullptr, act_info);
- }
- return Status{};
-}
+using namespace arm_compute::experimental;
-inline Status validate_kernel_3x1(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
- const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
+struct NEWinogradConvolutionLayer::Impl
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 1, 6, 1, 3>::validate(input, input0, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 1, 6, 1, 3>::validate(weights, input1, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 1, 6, 1, 3>::validate(batched_mm_output, biases, output, winograd_info)));
- if(act_info.enabled())
- {
- NEActivationLayer::validate(output, nullptr, act_info);
- }
- return Status{};
-}
-
-inline Status validate_kernel_1x3(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
- const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 6, 1, 3, 1>::validate(input, input0, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 6, 1, 3, 1>::validate(weights, input1, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 6, 1, 3, 1>::validate(batched_mm_output, biases, output, winograd_info)));
-
- if(act_info.enabled())
- {
- NEActivationLayer::validate(output, nullptr, act_info);
- }
- return Status{};
-}
-
-inline Status validate_kernel_5x1(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
- const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 1, 4, 1, 5>::validate(input, input0, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 1, 4, 1, 5>::validate(weights, input1, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 1, 4, 1, 5>::validate(batched_mm_output, biases, output, winograd_info)));
- if(act_info.enabled())
- {
- NEActivationLayer::validate(output, nullptr, act_info);
- }
- return Status{};
-}
-inline Status validate_kernel_1x5(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
- const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 4, 1, 5, 1>::validate(input, input0, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 4, 1, 5, 1>::validate(weights, input1, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 4, 1, 5, 1>::validate(batched_mm_output, biases, output, winograd_info)));
- if(act_info.enabled())
- {
- NEActivationLayer::validate(output, nullptr, act_info);
- }
- return Status{};
-}
-
-inline Status validate_kernel_7x1(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
- const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 1, 2, 1, 7>::validate(input, input0, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 1, 2, 1, 7>::validate(weights, input1, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 1, 2, 1, 7>::validate(batched_mm_output, biases, output, winograd_info)));
- if(act_info.enabled())
- {
- NEActivationLayer::validate(output, nullptr, act_info);
- }
- return Status{};
-}
-
-inline Status validate_kernel_1x7(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
- const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 2, 1, 7, 1>::validate(input, input0, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 2, 1, 7, 1>::validate(weights, input1, winograd_info)));
- ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 2, 1, 7, 1>::validate(batched_mm_output, biases, output, winograd_info)));
-
- if(act_info.enabled())
- {
- NEActivationLayer::validate(output, nullptr, act_info);
- }
- return Status{};
-}
-
-inline Tensor4DShape internal_get_input_shape(const arm_compute::ITensor *input)
-{
- const DataLayout data_layout = input->info()->data_layout();
- const int in_width = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH));
- const int in_height = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT));
- const int in_channels = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL));
- const int in_batches = input->info()->dimension(3);
-
- return Tensor4DShape{ in_batches, in_height, in_width, in_channels };
-}
-
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
-{
- ARM_COMPUTE_UNUSED(output);
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd layer only supports unit strides.");
- if(biases != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
- ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
- }
- return INEWinogradLayerTransformWeightsKernel::validate(input, weights);
-}
-
-Size2D winograd_output_tile(const Size2D &input_dims, const Size2D &kernel_dims, DataType data_type)
-{
- Size2D output_tile = Size2D{};
- if(kernel_dims == Size2D(3U, 3U))
- {
- output_tile = (input_dims.width <= 4 || input_dims.height <= 4) ? Size2D(2U, 2U) : Size2D(4U, 4U);
- if(data_type == DataType::F16)
- {
- output_tile = Size2D(4U, 4U);
- }
- }
- else if(kernel_dims == Size2D(5U, 5U))
- {
- output_tile = Size2D(2U, 2U);
- }
- else if(kernel_dims == Size2D(1U, 3U))
- {
- output_tile = Size2D(1U, 6U);
- }
- else if(kernel_dims == Size2D(3U, 1U))
- {
- output_tile = Size2D(6U, 1U);
- }
- else if(kernel_dims == Size2D(1U, 5U))
- {
- output_tile = Size2D(1U, 4U);
- }
- else if(kernel_dims == Size2D(5U, 1U))
- {
- output_tile = Size2D(4U, 1U);
- }
- else if(kernel_dims == Size2D(7U, 1U))
- {
- output_tile = Size2D(2U, 1U);
- }
- else if(kernel_dims == Size2D(1U, 7U))
- {
- output_tile = Size2D(1U, 2U);
- }
- return output_tile;
-}
-
-bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_size, DataType data_type)
-{
- // Check if we want to configure a Winograd configuration which requires fast math
- using WinogradConfiguration = std::pair<std::pair<int, int>, std::pair<int, int>>;
-
- const std::vector<WinogradConfiguration> fast_math_winograd_f16 =
- {
- WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(3, 3))
- };
-
- const std::vector<WinogradConfiguration> fast_math_winograd_f32 =
- {
- WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(5, 5)),
- WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5))
- };
-
- auto p = std::make_pair(std::pair<int, int>(output_tile.width, output_tile.height),
- std::pair<int, int>(kernel_size.width, kernel_size.height));
-
- switch(data_type)
- {
- case DataType::F16:
- return std::find(fast_math_winograd_f16.begin(), fast_math_winograd_f16.end(), p) != fast_math_winograd_f16.end();
- case DataType::F32:
- return std::find(fast_math_winograd_f32.begin(), fast_math_winograd_f32.end(), p) != fast_math_winograd_f32.end();
- default:
- return false;
- }
-}
-
-inline bool fuse_function_supported(const ActivationLayerInfo &act_info)
-{
- return act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU || act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU;
-}
-
-arm_gemm::Activation arm_gemm_activation_from_acl_activation(const ActivationLayerInfo &act_info)
-{
- switch(act_info.activation())
- {
- case ActivationLayerInfo::ActivationFunction::RELU:
- {
- return arm_gemm::Activation(arm_gemm::Activation::Type::ReLU, act_info.a(), act_info.b());
- }
- case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
- {
- return arm_gemm::Activation(arm_gemm::Activation::Type::BoundedReLU, act_info.a(), act_info.b());
- }
- default:
- {
- return arm_gemm::Activation(arm_gemm::Activation::Type::None);
- }
- }
-}
-} //namespace
+ MemoryGroup memory_group{};
+ std::unique_ptr<cpu::CpuWinogradConv2d> op{nullptr};
+ ITensorPack run_pack{};
+ ITensorPack prep_pack{};
+ WorkspaceData<Tensor> workspace{};
+ experimental::MemoryRequirements aux_mem_req{};
+ const ITensor *original_weights{nullptr};
+ bool is_prepared{false};
+ bool is_activationlayer_enabled{false};
+ DataLayout data_layout{};
+};
NEWinogradConvolutionLayer::NEWinogradConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager)
- : _memory_group(memory_manager), _gemm_function(memory_manager), _transform_input_kernel(nullptr), _transform_output_kernel(nullptr), _transform_weights_kernel(nullptr), _activationlayer_function(),
- _permute_input(), _permute_weights(), _permute_output(), _input_transformed(), _output_transformed(), _input_workspace(), _output_workspace(), _kernel_storage(), _input_nhwc(), _output_nhwc(),
- _weights_hwio(), _input(), _weights(), _output(), _is_prepared(false), _is_activationlayer_enabled(false), _data_layout()
+ : _impl(std::make_unique<Impl>())
{
+ _impl->memory_group = MemoryGroup(memory_manager);
}
-void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info,
- bool enable_fast_math)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info));
-
- // Get indices for the width and height
- _data_layout = input->info()->data_layout();
- const unsigned int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
- const unsigned int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
- const unsigned int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
-
- const Size2D input_dims = Size2D(input->info()->dimension(width_idx), input->info()->dimension(height_idx));
- const Size2D kernel_size = Size2D(weights->info()->dimension(width_idx), weights->info()->dimension(height_idx));
- const DataType data_type = input->info()->data_type();
- const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, data_type);
-
- // Check if the Winograd configuration requires fast math
- if(!enable_fast_math)
- {
- ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size, data_type),
- "This Winograd configuration requires enable_fast_math=true");
- }
-
- _weights = weights;
- _input = input;
- _output = output;
- _is_prepared = false;
-
- int n_gemms = 1;
- int N_BLOCK = 1; // Size of block used by GEMM.
-
- std::unique_ptr<INEWinogradLayerTransformInputKernel> transform_input_kernel;
- std::unique_ptr<INEWinogradLayerTransformWeightsKernel> transform_weights_kernel;
- std::unique_ptr<INEWinogradLayerTransformOutputKernel> transform_output_kernel;
-
- if(data_type == DataType::F32)
- {
- if(kernel_size == Size2D(3, 3))
- {
- if(input->info()->dimension(width_idx) > 4 && input->info()->dimension(height_idx) > 4)
- {
- using config = NEWinogradLayerConfiguration<float, float, 4, 4, 3, 3>;
- transform_input_kernel = std::make_unique<config::TransformInputKernel>();
- transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
- transform_output_kernel = std::make_unique<config::TransformOutputKernel>();
- n_gemms = config::WinogradBase::N_GEMMS;
- N_BLOCK = config::WinogradConv::N_BLOCK;
- }
- else
- {
- using config = NEWinogradLayerConfiguration<float, float, 2, 2, 3, 3>;
- transform_input_kernel = std::make_unique<config::TransformInputKernel>();
- transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
- transform_output_kernel = std::make_unique<config::TransformOutputKernel>();
- n_gemms = config::WinogradBase::N_GEMMS;
- N_BLOCK = config::WinogradConv::N_BLOCK;
- }
- }
- else if(kernel_size == Size2D(5, 5))
- {
- using config = NEWinogradLayerConfiguration<float, float, 2, 2, 5, 5>;
- transform_input_kernel = std::make_unique<config::TransformInputKernel>();
- transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
- transform_output_kernel = std::make_unique<config::TransformOutputKernel>();
- n_gemms = config::WinogradBase::N_GEMMS;
- N_BLOCK = config::WinogradConv::N_BLOCK;
- }
- else if(kernel_size == Size2D(1, 3))
- {
- using config = NEWinogradLayerConfiguration<float, float, 6, 1, 3, 1>;
- transform_input_kernel = std::make_unique<config::TransformInputKernel>();
- transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
- transform_output_kernel = std::make_unique<config::TransformOutputKernel>();
- n_gemms = config::WinogradBase::N_GEMMS;
- N_BLOCK = config::WinogradConv::N_BLOCK;
- }
- else if(kernel_size == Size2D(3, 1))
- {
- using config = NEWinogradLayerConfiguration<float, float, 1, 6, 1, 3>;
- transform_input_kernel = std::make_unique<config::TransformInputKernel>();
- transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
- transform_output_kernel = std::make_unique<config::TransformOutputKernel>();
- n_gemms = config::WinogradBase::N_GEMMS;
- N_BLOCK = config::WinogradConv::N_BLOCK;
- }
- else if(kernel_size == Size2D(1, 5))
- {
- using config = NEWinogradLayerConfiguration<float, float, 4, 1, 5, 1>;
- transform_input_kernel = std::make_unique<config::TransformInputKernel>();
- transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
- transform_output_kernel = std::make_unique<config::TransformOutputKernel>();
- n_gemms = config::WinogradBase::N_GEMMS;
- N_BLOCK = config::WinogradConv::N_BLOCK;
- }
- else if(kernel_size == Size2D(5, 1))
- {
- using config = NEWinogradLayerConfiguration<float, float, 1, 4, 1, 5>;
- transform_input_kernel = std::make_unique<config::TransformInputKernel>();
- transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
- transform_output_kernel = std::make_unique<config::TransformOutputKernel>();
- n_gemms = config::WinogradBase::N_GEMMS;
- N_BLOCK = config::WinogradConv::N_BLOCK;
- }
- else if(kernel_size == Size2D(1, 7))
- {
- using config = NEWinogradLayerConfiguration<float, float, 2, 1, 7, 1>;
- transform_input_kernel = std::make_unique<config::TransformInputKernel>();
- transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
- transform_output_kernel = std::make_unique<config::TransformOutputKernel>();
- n_gemms = config::WinogradBase::N_GEMMS;
- N_BLOCK = config::WinogradConv::N_BLOCK;
- }
- else if(kernel_size == Size2D(7, 1))
- {
- using config = NEWinogradLayerConfiguration<float, float, 1, 2, 1, 7>;
- transform_input_kernel = std::make_unique<config::TransformInputKernel>();
- transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
- transform_output_kernel = std::make_unique<config::TransformOutputKernel>();
- n_gemms = config::WinogradBase::N_GEMMS;
- N_BLOCK = config::WinogradConv::N_BLOCK;
- }
- else
- {
- ARM_COMPUTE_ERROR("Not supported.");
- }
- }
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- else if(data_type == DataType::F16)
- {
- if(kernel_size == Size2D(3, 3))
- {
- using config = NEWinogradLayerConfiguration<__fp16, __fp16, 4, 4, 3, 3>;
- transform_input_kernel = std::make_unique<config::TransformInputKernel>();
- transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
- transform_output_kernel = std::make_unique<config::TransformOutputKernel>();
- n_gemms = config::WinogradBase::N_GEMMS;
- N_BLOCK = config::WinogradConv::N_BLOCK;
- }
- else
- {
- ARM_COMPUTE_ERROR("Not supported.");
- }
- }
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- else
- {
- ARM_COMPUTE_ERROR("Not supported.");
- }
-
- const PaddingType use_padding_type = (conv_info.pad_top() != 0u || conv_info.pad_left() != 0) ? PADDING_SAME : PADDING_VALID;
- const bool use_same_padding = use_padding_type == PADDING_SAME;
-
- // Get convolved dimensions
- const int in_channels = input->info()->dimension(channel_idx);
- const int out_channels = output->info()->dimension(channel_idx);
-
- const Tensor4DShape in_shape(internal_get_input_shape(input));
- const size_t data_type_size = input->info()->element_size();
- // Get the memory required to instantiate a new Winograd operator.
- constexpr size_t storage_alignment = 64;
-
- // Kernel Storage
- const size_t kernel_storage_size = transform_weights_kernel->get_weight_storage_size(out_channels,
- in_channels)
- * data_type_size;
-
- // Input storage
- const size_t input_storage_size = transform_input_kernel->get_input_storage_size(in_shape.n_batches, in_shape.n_channels, in_shape.n_rows, in_shape.n_cols,
- use_same_padding)
- * data_type_size;
-
- // Output storage
- const size_t output_storage_size = transform_output_kernel->get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels) * data_type_size;
- const int kernel_matrix_stride = transform_weights_kernel->get_matrix_stride(out_channels, in_channels);
- const int output_matrix_stride = transform_output_kernel->get_matrix_stride(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels);
- const auto output_shape = transform_output_kernel->get_output_shape(in_shape.n_rows, in_shape.n_cols, use_padding_type == PADDING_SAME);
- const int input_matrix_stride = transform_input_kernel->get_matrix_stride(in_shape.n_batches, in_channels, in_shape.n_rows, in_shape.n_cols, use_padding_type == PADDING_SAME);
-
- // Configure GEMM
- const int tile_rows = iceildiv(output_shape.first, output_tile.height);
- const int tile_cols = iceildiv(output_shape.second, output_tile.width);
- const int m = in_shape.n_batches * tile_rows * tile_cols;
- const int k = in_shape.n_channels;
- const int n = out_channels;
- const int kernel_matrix_row_stride = roundup(out_channels, N_BLOCK);
- const int output_matrix_row_stride = kernel_matrix_row_stride;
-
- TensorShape a_shape(k, m, 1, n_gemms);
- Strides a_strides(data_type_size);
- a_strides.set(1, a_strides[0] * k);
- //a_strides.set(2, data_type_size * input_matrix_stride / n_gemms); FIXME: This is the real batch size, but RSH's code crashes if it's not 0.
- a_strides.set(2, 0);
- a_strides.set(3, data_type_size * input_matrix_stride);
-
- TensorShape b_shape(n, k, n_gemms);
- Strides b_strides(data_type_size);
- b_strides.set(1, data_type_size * kernel_matrix_row_stride);
- b_strides.set(2, data_type_size * kernel_matrix_stride);
-
- TensorShape d_shape(n, m, 1, n_gemms);
- Strides d_strides(data_type_size);
- d_strides.set(1, data_type_size * output_matrix_row_stride);
- //d_strides.set(2, data_type_size * output_matrix_stride / n_gemms); FIXME: This is the real batch size, but RSH's code crashes if it's not 0.
- d_strides.set(2, 0);
- d_strides.set(3, data_type_size * output_matrix_stride);
-
- TensorInfo a_info{};
- TensorInfo b_info{};
- TensorInfo d_info{};
- a_info.init(a_shape, 1, data_type, a_strides, 0, input_storage_size);
- b_info.init(b_shape, 1, data_type, b_strides, 0, kernel_storage_size);
- d_info.init(d_shape, 1, data_type, d_strides, 0, output_storage_size);
-
- _input_transformed.allocator()->init(a_info, storage_alignment);
- _kernel_storage.allocator()->init(b_info, storage_alignment);
- _output_transformed.allocator()->init(d_info, storage_alignment);
-
- // configure and allocate dst tensor to be used to convert from winograd domain to spatial domain when calling to reshape_output()
- TensorInfo info(TensorShape(_output->info()->dimension(2), _output->info()->dimension(0),
- _output->info()->dimension(1), _output->info()->dimension(3)),
- 1, _output->info()->data_type());
- _output_nhwc.allocator()->init(info);
-
- const ITensor *input_to_use = _input;
- ITensor *output_to_use = _output;
- PermutationVector weights_permutation_vector(3U, 0U, 1U, 2U);
- const unsigned int max_num_threads = NEScheduler::get().num_threads();
-
- // Configure the kernel to transform the input tensor from NCHW -> NHWC
- if(_data_layout == DataLayout::NCHW)
- {
- _memory_group.manage(&_input_nhwc);
- _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U));
- input_to_use = &_input_nhwc;
- weights_permutation_vector = PermutationVector(3U, 2U, 0U, 1U);
- }
-
- // Configure input transform kernel
- _memory_group.manage(&_input_transformed);
- _memory_group.manage(&_input_workspace);
- transform_input_kernel->configure(input_to_use, in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type,
- &_input_transformed, input_matrix_stride, &_input_workspace);
- const size_t input_workspace_size = transform_input_kernel->get_working_space_size(max_num_threads);
- TensorInfo input_workspace_info(TensorShape(input_workspace_size), 1, _input->info()->data_type());
- _input_workspace.allocator()->init(input_workspace_info);
- _input_workspace.allocator()->allocate();
- if(_data_layout == DataLayout::NCHW)
- {
- _input_nhwc.allocator()->allocate();
- }
-
- // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
- _permute_weights.configure(weights, &_weights_hwio, weights_permutation_vector);
- transform_weights_kernel->configure(&_weights_hwio, &_kernel_storage, kernel_matrix_stride, out_channels, in_channels);
-
- // Configure GEMM function
- _memory_group.manage(&_output_transformed);
- _gemm_function.configure(&_input_transformed, &_kernel_storage, nullptr, &_output_transformed, 1.0f, 0.f);
- _input_transformed.allocator()->allocate();
-
- // Configure output transform function
- // The biases tensor has not been allocated at this point in time, the output transform will add the biases to the final result in the run() method
- if(_data_layout == DataLayout::NCHW)
- {
- _memory_group.manage(&_output_nhwc);
- output_to_use = &_output_nhwc;
- }
- const arm_gemm::Activation activation = arm_gemm_activation_from_acl_activation(act_info);
-
- transform_output_kernel->configure(biases,
- &_output_transformed,
- output_matrix_stride,
- output_to_use,
- in_shape.n_batches,
- output_shape.first,
- output_shape.second,
- out_channels,
- &_output_workspace,
- activation);
+NEWinogradConvolutionLayer::~NEWinogradConvolutionLayer() = default;
- const size_t output_workspace_size = transform_output_kernel->get_working_space_size(max_num_threads);
- TensorInfo output_workspace_info(TensorShape(output_workspace_size), 1, _output->info()->data_type());
- _output_workspace.allocator()->init(output_workspace_info);
- _output_workspace.allocator()->allocate();
- _output_transformed.allocator()->allocate();
-
- // Reorder the convoluted output to ACL's ordering NCHW
- if(_data_layout == DataLayout::NCHW)
- {
- _permute_output.configure(&_output_nhwc, _output, PermutationVector(1U, 2U, 0U));
- _output_nhwc.allocator()->allocate();
- }
-
- _transform_input_kernel = std::move(transform_input_kernel);
- _transform_weights_kernel = std::move(transform_weights_kernel);
- _transform_output_kernel = std::move(transform_output_kernel);
+void NEWinogradConvolutionLayer::configure(const ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
+{
+ _impl->original_weights = weights;
+ _impl->op = std::make_unique<cpu::CpuWinogradConv2d>();
+ _impl->op->configure(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(),
+ conv_info, act_info, enable_fast_math);
- //Configure Activation Layer
- _is_activationlayer_enabled = act_info.enabled() && !fuse_function_supported(act_info);
- if(_is_activationlayer_enabled)
- {
- _activationlayer_function.configure(_output, nullptr, act_info);
- }
+ _impl->aux_mem_req = _impl->op->workspace();
+ _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}};
+ _impl->prep_pack = {{ACL_SRC_1, weights}, {ACL_SRC_2, biases}};
+ _impl->workspace =
+ manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
}
void NEWinogradConvolutionLayer::run()
{
prepare();
- MemoryGroupResourceScope scope_mg(_memory_group);
-
- if(_data_layout == DataLayout::NCHW)
- {
- //Bring channels to the front as Winograd code expects the tensor to be in the format NHWC
- _permute_input.run();
- }
-
- // Transform input tensor to the winograd domain
- NEScheduler::get().schedule(_transform_input_kernel.get(), Window::DimX);
-
- //Run 16 GEMMs in multiple threads, each kernel runs one or more GEMMs
- _gemm_function.run();
-
- // Transform output tensor to the spatial domain
- NEScheduler::get().schedule(_transform_output_kernel.get(), Window::DimX);
-
- if(_data_layout == DataLayout::NCHW)
- {
- // Reorder the convoluted output to ACL's ordering NCHW
- _permute_output.run();
- }
-
- if(_is_activationlayer_enabled)
- {
- _activationlayer_function.run();
- }
+ MemoryGroupResourceScope scope_mg(_impl->memory_group);
+ _impl->op->run(_impl->run_pack);
}
-Status NEWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info, bool enable_fast_math)
+Status NEWinogradConvolutionLayer::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info));
-
- // Get indices for the width and height
- const size_t idx_width = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
- const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
-
- // Input shape, kernel size and output tile
- const Size2D input_dims = Size2D(input->dimension(idx_width), input->dimension(idx_height));
- const Size2D kernel_size = Size2D(weights->dimension(idx_width), weights->dimension(idx_height));
- const DataType data_type = input->data_type();
- const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, data_type);
-
- // Check if the Winograd configuration requires fast math
- if(!enable_fast_math)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size, data_type),
- "This Winograd configuration requires enable_fast_math=true");
- }
-
- const WinogradInfo winograd_info = WinogradInfo(output_tile,
- kernel_size,
- input_dims,
- conv_info,
- input->data_layout());
-
- // Validate input transform
- const TensorShape input0_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info);
- const TensorInfo input0 = input->clone()->set_tensor_shape(input0_shape);
- // Validate filter transform
- const TensorShape input1_shape = misc::shape_calculator::compute_winograd_filter_transform_shape(*weights, winograd_info);
- const TensorInfo input1 = weights->clone()->set_tensor_shape(input1_shape);
- // Validate batched matrix multiply
- TensorShape batched_mm_output_shape = input0.tensor_shape();
- batched_mm_output_shape[0] = input1.tensor_shape()[0];
- const TensorInfo batched_mm_output = input0.clone()->set_tensor_shape(batched_mm_output_shape);
-
- if(kernel_size == Size2D(3, 3))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 1, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 1, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 1, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 1, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != conv_info.pad_left(), "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != conv_info.pad_bottom(), "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != conv_info.pad_left(), "Only SAME or VALID padding supported");
- return validate_kernel_3x3(input_dims, input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
- }
- else if(kernel_size == Size2D(5, 5))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 2, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 2, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 2, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 2, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != conv_info.pad_left(), "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != conv_info.pad_bottom(), "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != conv_info.pad_left(), "Only SAME or VALID padding supported");
- return validate_kernel_5x5(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
- }
- if(kernel_size == Size2D(3, 1))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 1, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 1, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_bottom() != 0, "Only SAME or VALID padding supported");
- return validate_kernel_3x1(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
- }
- else if(kernel_size == Size2D(1, 3))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 1, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 1, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_right() != 0, "Only SAME or VALID padding supported");
- return validate_kernel_1x3(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
- }
- else if(kernel_size == Size2D(5, 1))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 2, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 2, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_bottom() != 0, "Only SAME or VALID padding supported");
- return validate_kernel_5x1(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
- }
- else if(kernel_size == Size2D(1, 5))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 2, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 2, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_right() != 0, "Only SAME or VALID padding supported");
- return validate_kernel_1x5(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
- }
- else if(kernel_size == Size2D(7, 1))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 3, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 3, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_bottom() != 0, "Only SAME or VALID padding supported");
- return validate_kernel_7x1(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
- }
- else if(kernel_size == Size2D(1, 7))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 3, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 3, "Only SAME or VALID padding supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_right() != 0, "Only SAME or VALID padding supported");
- return validate_kernel_1x7(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_MSG("Kernel shape not supported");
- }
+ return cpu::CpuWinogradConv2d::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math);
}
void NEWinogradConvolutionLayer::prepare()
{
- if(!_is_prepared)
+ if (!_impl->is_prepared)
{
- // Permute weights
- _weights_hwio.allocator()->allocate();
- _permute_weights.run();
- _weights->mark_as_unused();
-
- // Transform weights
- _kernel_storage.allocator()->allocate();
- NEScheduler::get().schedule(_transform_weights_kernel.get(), Window::DimX);
- _weights_hwio.allocator()->free();
+ _impl->op->prepare(_impl->prep_pack);
+ _impl->original_weights->mark_as_unused();
- _gemm_function.prepare();
- if(!_kernel_storage.is_used())
- {
- _kernel_storage.allocator()->free();
- }
+        // Release temporary tensors that are only used in the prepare stage
+ release_temporaries<Tensor>(_impl->aux_mem_req, _impl->workspace);
- _is_prepared = true;
+ _impl->is_prepared = true;
}
}
} // namespace arm_compute
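Reviewer note: this refactor moves the entire Winograd pipeline (layout permutes, input/weight/output transforms, the batched GEMM and the fused activation) into cpu::CpuWinogradConv2d; the NEON layer now only owns the tensor packs and the auxiliary workspace derived from the operator's memory requirements, and prepare() transforms the weights once before releasing prepare-only temporaries. The public API is unchanged; an illustrative usage sketch, assuming allocated tensors:

    Tensor src, weights, biases, dst;            // assume initialized and allocated
    NEWinogradConvolutionLayer conv;
    conv.configure(&src, &weights, &biases, &dst,
                   PadStrideInfo(1, 1, 1, 1),    // Winograd supports unit strides only
                   ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
                   /* enable_fast_math */ false);
    conv.run();                                  // first call triggers prepare(); weights are then marked unused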