Diffstat (limited to 'arm_compute/runtime/NEON'): 93 files changed, 3084 insertions, 1949 deletions
diff --git a/arm_compute/runtime/NEON/INEOperator.h b/arm_compute/runtime/NEON/INEOperator.h index b21dc49b20..7971168d24 100644 --- a/arm_compute/runtime/NEON/INEOperator.h +++ b/arm_compute/runtime/NEON/INEOperator.h @@ -24,20 +24,22 @@ #ifndef ARM_COMPUTE_INEOPERATOR_H #define ARM_COMPUTE_INEOPERATOR_H -#include "../../core/ITensor.h" #include "arm_compute/runtime/IOperator.h" #include "arm_compute/runtime/IRuntimeContext.h" #include "arm_compute/runtime/Types.h" +#include "../../core/ITensor.h" #include <memory> namespace arm_compute { class ICPPKernel; +class Window; + using INEKernel = ICPPKernel; namespace experimental { -/** Basic interface for functions which have a single async Neon kernel */ +/** Basic interface for functions which have a single async CPU kernel */ class INEOperator : public IOperator { public: @@ -58,11 +60,13 @@ public: ~INEOperator(); // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &constants) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &constants) override; MemoryRequirements workspace() const override; protected: + void run(ITensorPack &tensors, const Window &window); + std::unique_ptr<INEKernel> _kernel; IRuntimeContext *_ctx; MemoryRequirements _workspace; diff --git a/arm_compute/runtime/NEON/INESimpleFunction.h b/arm_compute/runtime/NEON/INESimpleFunction.h index 8c7cf6512c..f783a836ee 100644 --- a/arm_compute/runtime/NEON/INESimpleFunction.h +++ b/arm_compute/runtime/NEON/INESimpleFunction.h @@ -33,7 +33,7 @@ namespace arm_compute class ICPPKernel; class NEFillBorderKernel; using INEKernel = ICPPKernel; -/** Basic interface for functions which have a single Neon kernel */ +/** Basic interface for functions which have a single CPU kernel */ class INESimpleFunction : public IFunction { public: @@ -57,5 +57,5 @@ protected: std::unique_ptr<INEKernel> _kernel; /**< Kernel to run */ std::unique_ptr<NEFillBorderKernel> _border_handler; /**< Kernel to handle image borders */ }; -} +} // namespace arm_compute #endif /*ARM_COMPUTE_INESIMPLEFUNCTION_H */ diff --git a/arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h b/arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h index 52bd5f333b..dc4bac17e4 100644 --- a/arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h +++ b/arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h @@ -33,7 +33,7 @@ namespace arm_compute { class ICPPKernel; using INEKernel = ICPPKernel; -/** Basic interface for functions which have a single Neon kernel and no border */ +/** Basic interface for functions which have a single CPU kernel and no border */ class INESimpleFunctionNoBorder : public IFunction { public: diff --git a/arm_compute/runtime/NEON/NEFunctions.h b/arm_compute/runtime/NEON/NEFunctions.h index 863a8a6412..cc4d303202 100644 --- a/arm_compute/runtime/NEON/NEFunctions.h +++ b/arm_compute/runtime/NEON/NEFunctions.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2021 Arm Limited. + * Copyright (c) 2016-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,10 +21,11 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
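Note on the INEOperator hunk above: experimental operators are stateless and receive their tensors through an ITensorPack at run time; the patch adds a protected run(ITensorPack &, const Window &) helper on top of that. A minimal sketch of driving such an operator, using experimental::NESlice as a representative example (helper name and tensor setup are illustrative; pack slot IDs follow core/experimental/Types.h):

    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/runtime/NEON/functions/NESlice.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    // src and dst are pre-allocated tensors; starts/ends describe the slice region.
    void run_slice(Tensor &src, Tensor &dst, const Coordinates &starts, const Coordinates &ends)
    {
        experimental::NESlice op;
        op.configure(src.info(), dst.info(), starts, ends); // operators configure on ITensorInfo only
        ITensorPack pack;
        pack.add_tensor(TensorType::ACL_SRC, &src); // concrete tensors are bound at run time
        pack.add_tensor(TensorType::ACL_DST, &dst);
        op.run(pack); // dispatches the single async CPU kernel via INEOperator::run
    }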
*/ -#ifndef ARM_COMPUTE_NEFUNCTIONS_H -#define ARM_COMPUTE_NEFUNCTIONS_H +#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_NEFUNCTIONS_H +#define ACL_ARM_COMPUTE_RUNTIME_NEON_NEFUNCTIONS_H #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" +#include "arm_compute/runtime/NEON/functions/NEAddMulAdd.h" #include "arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h" #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h" #include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h" @@ -38,6 +39,7 @@ #include "arm_compute/runtime/NEON/functions/NECast.h" #include "arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h" #include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h" +#include "arm_compute/runtime/NEON/functions/NEConv3D.h" #include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h" #include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h" #include "arm_compute/runtime/NEON/functions/NECopy.h" @@ -60,39 +62,41 @@ #include "arm_compute/runtime/NEON/functions/NEFloor.h" #include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h" #include "arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h" +#include "arm_compute/runtime/NEON/functions/NEGather.h" #include "arm_compute/runtime/NEON/functions/NEGEMM.h" #include "arm_compute/runtime/NEON/functions/NEGEMMConv2d.h" #include "arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h" #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" #include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h" -#include "arm_compute/runtime/NEON/functions/NEGather.h" #include "arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h" #include "arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h" #include "arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h" +#include "arm_compute/runtime/NEON/functions/NELogical.h" #include "arm_compute/runtime/NEON/functions/NELSTMLayer.h" #include "arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h" -#include "arm_compute/runtime/NEON/functions/NELogical.h" +#include "arm_compute/runtime/NEON/functions/NEMatMul.h" #include "arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h" #include "arm_compute/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.h" #include "arm_compute/runtime/NEON/functions/NENormalizationLayer.h" -#include "arm_compute/runtime/NEON/functions/NEPReluLayer.h" #include "arm_compute/runtime/NEON/functions/NEPadLayer.h" #include "arm_compute/runtime/NEON/functions/NEPermute.h" #include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h" +#include "arm_compute/runtime/NEON/functions/NEPooling3dLayer.h" #include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h" +#include "arm_compute/runtime/NEON/functions/NEPReluLayer.h" #include "arm_compute/runtime/NEON/functions/NEPriorBoxLayer.h" #include "arm_compute/runtime/NEON/functions/NEQLSTMLayer.h" #include "arm_compute/runtime/NEON/functions/NEQuantizationLayer.h" -#include "arm_compute/runtime/NEON/functions/NERNNLayer.h" -#include "arm_compute/runtime/NEON/functions/NEROIAlignLayer.h" -#include "arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h" #include "arm_compute/runtime/NEON/functions/NERange.h" #include "arm_compute/runtime/NEON/functions/NEReduceMean.h" #include "arm_compute/runtime/NEON/functions/NEReductionOperation.h" -#include "arm_compute/runtime/NEON/functions/NERemap.h" +#include "arm_compute/runtime/NEON/functions/NEReorderLayer.h" #include 
"arm_compute/runtime/NEON/functions/NEReorgLayer.h" #include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h" #include "arm_compute/runtime/NEON/functions/NEReverse.h" +#include "arm_compute/runtime/NEON/functions/NERNNLayer.h" +#include "arm_compute/runtime/NEON/functions/NEROIAlignLayer.h" +#include "arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h" #include "arm_compute/runtime/NEON/functions/NEScale.h" #include "arm_compute/runtime/NEON/functions/NESelect.h" #include "arm_compute/runtime/NEON/functions/NESlice.h" @@ -107,4 +111,4 @@ #include "arm_compute/runtime/NEON/functions/NEUnstack.h" #include "arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h" -#endif /* ARM_COMPUTE_NEFUNCTIONS_H */ +#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_NEFUNCTIONS_H diff --git a/arm_compute/runtime/NEON/NEScheduler.h b/arm_compute/runtime/NEON/NEScheduler.h index 542142a30a..613f44cc52 100644 --- a/arm_compute/runtime/NEON/NEScheduler.h +++ b/arm_compute/runtime/NEON/NEScheduler.h @@ -28,7 +28,7 @@ namespace arm_compute { -/** Neon Scheduler */ +/** CPU Scheduler */ using NEScheduler = Scheduler; -} +} // namespace arm_compute #endif /*ARM_COMPUTE_NESCHEDULER_H */ diff --git a/arm_compute/runtime/NEON/functions/NEActivationLayer.h b/arm_compute/runtime/NEON/functions/NEActivationLayer.h index ffda8406aa..5584fdc783 100644 --- a/arm_compute/runtime/NEON/functions/NEActivationLayer.h +++ b/arm_compute/runtime/NEON/functions/NEActivationLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,9 +24,9 @@ #ifndef ARM_COMPUTE_NEACTIVATIONLAYER_H #define ARM_COMPUTE_NEACTIVATIONLAYER_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/function_info/ActivationLayerInfo.h" +#include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IRuntimeContext.h" #include <memory> @@ -62,6 +62,18 @@ public: /** [NEActivationLayer snippet] **/ /** Set the input and output tensor. * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |QASYMM8 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED | + * |QSYMM16 |QSYMM16 | + * |F16 |F16 | + * |F32 |F32 | + * * @note If the output tensor is a nullptr or is equal to the input, the activation function will be performed in-place * * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result @@ -89,5 +101,5 @@ private: struct Impl; std::unique_ptr<Impl> _impl; }; -} // namespace arm_computes +} // namespace arm_compute #endif /* ARM_COMPUTE_NEACTIVATIONLAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NEAddMulAdd.h b/arm_compute/runtime/NEON/functions/NEAddMulAdd.h new file mode 100644 index 0000000000..6c65c055dd --- /dev/null +++ b/arm_compute/runtime/NEON/functions/NEAddMulAdd.h @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEADDMULADD +#define ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEADDMULADD + +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/IMemoryManager.h" + +#include <memory> + +namespace arm_compute +{ +class ITensor; +class ITensorInfo; +class ActivationLayerInfo; + +/** Function to compute Add+Mul+Add fused operation */ +class NEAddMulAdd : public IFunction +{ +public: + /** Constructor */ + NEAddMulAdd(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEAddMulAdd(const NEAddMulAdd &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEAddMulAdd(NEAddMulAdd &&) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEAddMulAdd &operator=(const NEAddMulAdd &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEAddMulAdd &operator=(NEAddMulAdd &&) = delete; + /** Destructor */ + ~NEAddMulAdd(); + /** Initialize the function's inputs and outputs. + * + * Valid data layouts: + * - Any + * + * Valid data type configurations: + * |input1 |input2 |bn_mul |bn_add |add_output |final_output | + * |:--------------|:--------------|:--------------|:--------------|:--------------|:--------------| + * |QASYMM8 |QASYMM8 |QASYMM8 |QASYMM8 |QASYMM8 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED | + * |F16 |F16 |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 |F32 |F32 | + * + * This is what this composite function (tailored for add followed by a batch norm operation) does: + * add_output <- input1 + input2 (add) + * final_output <- add_output * bn_mul + bn_add (batch norm = mul+add) + * + * @param[in] input1 First tensor input. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] input2 Second tensor input. Data types supported: Same as @p input. + * @param[in] bn_mul The multiplication coefficient on the feature dimension. Data types supported: Same as @p input. + * It's one dimensional tensor with size equal to the feature maps [FM] + * @param[in] bn_add The addition coefficient on the feature dimension. Data types supported: Same as @p input. 
+ * It's one dimensional tensor with size equal to the feature maps [FM] + * @param[out] add_output Output of the first add. Data type supported: Same as @p input. + * @param[out] final_output Output of the add+mul+add+act composite operation. Data type supported: Same as @p input. + * @param[in] policy Policy to handle overflow + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + * + */ + void configure(ITensor *input1, + ITensor *input2, + ITensor *bn_mul, + ITensor *bn_add, + ITensor *add_output, + ITensor *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info); + /** Static function to check if given info will lead to a valid configuration of @ref NEAddMulAdd + * + * Similar to @ref NEAddMulAdd::configure() except the arguments are @ref ITensorInfo * instead of @ref ITensor * + * + * @return a status + */ + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *bn_mul, + const ITensorInfo *bn_add, + const ITensorInfo *add_output, + const ITensorInfo *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info); + + // Inherited methods overridden: + void run() override; + +private: + struct Impl; + std::unique_ptr<Impl> _impl; +}; +} // namespace arm_compute +#endif /* ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEADDMULADD */ diff --git a/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h b/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h index 8235185a8e..3bb50a0f90 100644 --- a/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h +++ b/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,8 +24,6 @@ #ifndef ARM_COMPUTE_NEARGMINMAXLAYER_H #define ARM_COMPUTE_NEARGMINMAXLAYER_H -#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h" - #include "arm_compute/core/Types.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/NEON/INESimpleFunction.h" @@ -33,11 +31,10 @@ namespace arm_compute { class ITensor; - /** Function to calculate the index of the minimum or maximum values in a * tensor based on an axis. * - * This function calls the following Neon kernels: + * This function calls the following kernels: * * -# @ref NEReductionOperationKernel * -# @ref NEFillBorderKernel @@ -64,6 +61,18 @@ public: ~NEArgMinMaxLayer(); /** Set the input and output tensors. * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:-------------| + * |QASYMM8 |U32, S32 | + * |QASYMM8_SIGNED |U32, S32 | + * |S32 |U32, S32, S64 | + * |F16 |U32, S32 | + * |F32 |U32, S32 | + * * @param[in] input Input source tensor. Data types supported: QASYMM8_SIGNED/QASYMM8/S32/F16/F32. * @param[in] axis Axis to find max/min index. * @param[out] output Output source tensor. Data types supported: U32/S32. @@ -74,7 +83,7 @@ public: * * @param[in] input Input source tensor info. Data types supported: QASYMM8_SIGNED/QASYMM8/S32/F16/F32. * @param[in] axis Axis to find max/min index. - * @param[in] output Output source tensor info. Data types supported: U32/S32. + * @param[in] output Output source tensor info. Data types supported: U32/S32/S64. 
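A minimal usage sketch for the new NEAddMulAdd function above, which fuses an elementwise add with a batch-norm style mul+add (all tensor and helper names are illustrative; tensors are assumed pre-allocated F32, with bn_mul/bn_add holding one value per feature map):

    #include "arm_compute/function_info/ActivationLayerInfo.h"
    #include "arm_compute/runtime/NEON/functions/NEAddMulAdd.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void fused_add_bn(Tensor &input1, Tensor &input2, Tensor &bn_mul, Tensor &bn_add,
                      Tensor &add_output, Tensor &final_output)
    {
        NEAddMulAdd fused;
        // add_output = input1 + input2; final_output = add_output * bn_mul + bn_add
        fused.configure(&input1, &input2, &bn_mul, &bn_add, &add_output, &final_output,
                        ConvertPolicy::SATURATE, ActivationLayerInfo());
        fused.run();
    }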
* @param[in] op Operation to perform: min or max * * @return a status @@ -85,7 +94,8 @@ public: void run() override; private: - std::unique_ptr<NEReductionOperation> _reduction_function; + struct Impl; + std::unique_ptr<Impl> _impl; }; } // namespace arm_compute #endif /* ARM_COMPUTE_NEARGMINMAXLAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h b/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h index 8f9fd27906..73a43dbc44 100644 --- a/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h +++ b/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2021 Arm Limited. + * Copyright (c) 2016-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,7 +25,9 @@ #define ARM_COMPUTE_NEARITHMETICADDITION_H #include "arm_compute/core/Types.h" +#include "arm_compute/function_info/ActivationLayerInfo.h" #include "arm_compute/runtime/IFunction.h" + #include <memory> namespace arm_compute @@ -51,19 +53,21 @@ public: NEArithmeticAddition &operator=(NEArithmeticAddition &&); /** Initialise the kernel's inputs, output and conversion policy. * - * Valid configurations (Input1,Input2) -> Output : + * Valid data layouts: + * - All * - * - (U8,U8) -> U8 - * - (U8,U8) -> S16 - * - (S16,U8) -> S16 - * - (U8,S16) -> S16 - * - (S16,S16) -> S16 - * - (S32,S32) -> S32 - * - (F16,F16) -> F16 - * - (F32,F32) -> F32 - * - (QASYMM8,QASYMM8) -> QASYMM8 - * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED - * - (QSYMM16,QSYMM16) -> QSYMM16 + * Valid data type configurations: + * |src0 |src1 |dst | + * |:--------------|:--------------|:--------------| + * |QASYMM8 |QASYMM8 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED | + * |QSYMM16 |QSYMM16 |QASYMM16 | + * |QSYMM16 |QSYMM16 |S32 | + * |U8 |U8 |U8 | + * |S16 |S16 |S16 | + * |S32 |S32 |S32 | + * |F16 |F16 |F16 | + * |F32 |F32 |F32 | * * @param[in] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32 * @param[in] input2 Second tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32 @@ -71,7 +75,11 @@ public: * @param[in] policy Policy to use to handle overflow. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. */ - void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const ITensor *input1, + const ITensor *input2, + ITensor *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticAddition * * @param[in] input1 First tensor input info. 
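A minimal sketch for NEArgMinMaxLayer above (helper name illustrative; tensors assumed pre-allocated, with the index tensor typed per the data-type table in the hunk):

    #include "arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    // input is e.g. F32; indices must be U32 or S32 (S64 is only valid for S32 input).
    void argmax_axis0(Tensor &input, Tensor &indices)
    {
        NEArgMinMaxLayer argmax;
        argmax.configure(&input, /* axis */ 0, &indices, ReductionOperation::ARG_IDX_MAX);
        argmax.run();
    }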
Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32 @@ -82,7 +90,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h b/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h index c741db3223..3e4f6356c5 100644 --- a/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h +++ b/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2021 Arm Limited. + * Copyright (c) 2016-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,6 +25,7 @@ #define ARM_COMPUTE_NEARITHMETICSUBTRACTION_H #include "arm_compute/core/Types.h" +#include "arm_compute/function_info/ActivationLayerInfo.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/NEON/INEOperator.h" @@ -57,13 +58,33 @@ public: NEArithmeticSubtraction &operator=(NEArithmeticSubtraction &&); /** Initialise the kernel's inputs, output and conversion policy. * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |dst | + * |:--------------|:--------------|:--------------| + * |QASYMM8 |QASYMM8 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED | + * |QSYMM16 |QSYMM16 |QASYMM16 | + * |QSYMM16 |QSYMM16 |S32 | + * |U8 |U8 |U8 | + * |S16 |S16 |S16 | + * |S32 |S32 |S32 | + * |F16 |F16 |F16 | + * |F32 |F32 |F32 | + * * @param[in] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 * @param[in] input2 Second tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 * @param[out] output Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 * @param[in] policy Policy to use to handle overflow. Convert policy cannot be WRAP if datatype is quantized. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. */ - void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const ITensor *input1, + const ITensor *input2, + ITensor *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticSubtraction * * @param[in] input1 First tensor input. 
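The NEArithmeticAddition hunk above documents the validate-then-configure pattern; a minimal sketch (helper name illustrative; tensors assumed pre-allocated with one of the type combinations from the table):

    #include "arm_compute/core/Error.h"
    #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void saturating_add(Tensor &a, Tensor &b, Tensor &out)
    {
        // Check the configuration before committing to it.
        ARM_COMPUTE_ERROR_THROW_ON(
            NEArithmeticAddition::validate(a.info(), b.info(), out.info(), ConvertPolicy::SATURATE));
        NEArithmeticAddition add;
        add.configure(&a, &b, &out, ConvertPolicy::SATURATE);
        add.run();
    }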
Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/F16/F32 @@ -74,7 +95,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h b/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h index 6d56a267a7..99e2dcadbb 100644 --- a/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h +++ b/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -58,6 +58,16 @@ public: ~NEBatchNormalizationLayer(); /** Set the input and output tensors. * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |F32 |F32 | + * |F16 |F16 | + * * @note If the output tensor is a nullptr or is equal to the input, the batch normalization function will be performed in-place * * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result. @@ -71,7 +81,13 @@ public: * @param[in] epsilon (Optional) Small value to avoid division with zero. Default value is 0.001f. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. 
*/ - void configure(ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta = nullptr, const ITensor *gamma = nullptr, float epsilon = 0.001f, + void configure(ITensor *input, + ITensor *output, + const ITensor *mean, + const ITensor *var, + const ITensor *beta = nullptr, + const ITensor *gamma = nullptr, + float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEBatchNormalizationLayer * @@ -88,10 +104,14 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - const ITensorInfo *mean, const ITensorInfo *var, - const ITensorInfo *beta = nullptr, const ITensorInfo *gamma = nullptr, - float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *mean, + const ITensorInfo *var, + const ITensorInfo *beta = nullptr, + const ITensorInfo *gamma = nullptr, + float epsilon = 0.001f, + ActivationLayerInfo act_info = ActivationLayerInfo()); // Inherited methods overridden: void run() override; @@ -99,5 +119,5 @@ public: private: std::unique_ptr<NEBatchNormalizationLayerKernel> _norm_kernel; /**< Batch normalization layer kernel */ }; -} +} // namespace arm_compute #endif /* ARM_COMPUTE_NEBATCHNORMALIZATIONLAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NEBatchToSpaceLayer.h b/arm_compute/runtime/NEON/functions/NEBatchToSpaceLayer.h index c2fd26d34c..ebed0bea29 100644 --- a/arm_compute/runtime/NEON/functions/NEBatchToSpaceLayer.h +++ b/arm_compute/runtime/NEON/functions/NEBatchToSpaceLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_NEBATCHTOSPACELAYER_H #define ARM_COMPUTE_NEBATCHTOSPACELAYER_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" namespace arm_compute @@ -52,10 +51,22 @@ public: ~NEBatchToSpaceLayer() = default; /** Set the input and output tensors. * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |dst | + * |:---------|:---------|:----------| + * |All |s32 |All | + * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. * @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32 * @param[out] output Tensor output. Data types supported: same as @p input + * + * @deprecated This method for dynamic block shape is not fully mature and will be removed in 23.08 release */ + ARM_COMPUTE_DEPRECATED_REL(23.05) void configure(const ITensor *input, const ITensor *block_shape, ITensor *output); /** Set the input and output tensors. (Static block shape). * @@ -63,8 +74,13 @@ public: * @param[in] block_shape_x Block shape x value. * @param[in] block_shape_y Block shape y value. * @param[out] output Tensor output. 
Data types supported: same as @p input + * @param[in] crop_info Specifies how the output shape is cropped after batch to space is performed */ - void configure(const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output); + void configure(const ITensor *input, + int32_t block_shape_x, + int32_t block_shape_y, + ITensor *output, + const CropInfo &crop_info = CropInfo{}); /** Static function to check if given info will lead to a valid configuration of @ref CLBatchToSpaceLayer * * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All. @@ -72,7 +88,9 @@ public: * @param[out] output Tensor output info. Data types supported: same as @p input * * @return a status + * @deprecated This method for dynamic block shape is not fully mature and will be removed in 23.08 release */ + ARM_COMPUTE_DEPRECATED_REL(23.05) static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output); /** Static function to check if given info will lead to a valid configuration of @ref CLBatchToSpaceLayer (Static block shape). * @@ -80,10 +98,15 @@ public: * @param[in] block_shape_x Block shape x value. * @param[in] block_shape_y Block shape y value. * @param[out] output Tensor output info. Data types supported: same as @p input + * @param[in] crop_info Specifies how the output shape is cropped after batch to space is performed * * @return a status */ - static Status validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output); + static Status validate(const ITensorInfo *input, + int32_t block_shape_x, + int32_t block_shape_y, + const ITensorInfo *output, + const CropInfo &crop_info = CropInfo{}); }; } // namespace arm_compute #endif /* ARM_COMPUTE_NEBATCHTOSPACELAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NEBitwiseAnd.h b/arm_compute/runtime/NEON/functions/NEBitwiseAnd.h index 3203d2b9a7..1f95f193d3 100644 --- a/arm_compute/runtime/NEON/functions/NEBitwiseAnd.h +++ b/arm_compute/runtime/NEON/functions/NEBitwiseAnd.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 Arm Limited. + * Copyright (c) 2016-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -48,6 +48,14 @@ public: ~NEBitwiseAnd() = default; /** Initialise the kernel's inputs and output * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |U8 |U8 | + * * @param[in] input1 First tensor input. Data type supported: U8. * @param[in] input2 Second tensor input. Data type supported: U8. * @param[out] output Output tensor. Data type supported: U8. diff --git a/arm_compute/runtime/NEON/functions/NEBitwiseNot.h b/arm_compute/runtime/NEON/functions/NEBitwiseNot.h index 9fa0d38caf..c66bebf7cc 100644 --- a/arm_compute/runtime/NEON/functions/NEBitwiseNot.h +++ b/arm_compute/runtime/NEON/functions/NEBitwiseNot.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -36,6 +36,14 @@ class NEBitwiseNot : public INESimpleFunctionNoBorder public: /** Initialise the kernel's input and output * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |U8 |U8 | + * * @param[in] input Input tensor. Data type supported: U8. * @param[out] output Output tensor. Data type supported: U8. 
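Since the NEBatchToSpaceLayer hunk above deprecates the dynamic block-shape overload (removal announced for 23.08), new code should use the static overload with the added CropInfo parameter. A minimal sketch (helper name illustrative; tensors assumed pre-allocated):

    #include "arm_compute/runtime/NEON/functions/NEBatchToSpaceLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void batch_to_space_2x2(Tensor &input, Tensor &output)
    {
        NEBatchToSpaceLayer b2s;
        // CropInfo{} means no cropping of the output shape, matching the defaulted argument.
        b2s.configure(&input, /* block_shape_x */ 2, /* block_shape_y */ 2, &output, CropInfo{});
        b2s.run();
    }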
*/ diff --git a/arm_compute/runtime/NEON/functions/NEBitwiseOr.h b/arm_compute/runtime/NEON/functions/NEBitwiseOr.h index fba6b784de..183df212e4 100644 --- a/arm_compute/runtime/NEON/functions/NEBitwiseOr.h +++ b/arm_compute/runtime/NEON/functions/NEBitwiseOr.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -36,6 +36,14 @@ class NEBitwiseOr : public INESimpleFunctionNoBorder public: /** Initialise the kernel's inputs and output * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |U8 |U8 | + * * @param[in] input1 First tensor input. Data type supported: U8. * @param[in] input2 Second tensor input. Data type supported: U8. * @param[out] output Output tensor. Data type supported: U8. diff --git a/arm_compute/runtime/NEON/functions/NEBitwiseXor.h b/arm_compute/runtime/NEON/functions/NEBitwiseXor.h index c6cb584284..126aaa6ddd 100644 --- a/arm_compute/runtime/NEON/functions/NEBitwiseXor.h +++ b/arm_compute/runtime/NEON/functions/NEBitwiseXor.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -36,6 +36,14 @@ class NEBitwiseXor : public INESimpleFunctionNoBorder public: /** Initialise the kernel's inputs and output * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |U8 |U8 | + * * @param[in] input1 First tensor input. Data type supported: U8. * @param[in] input2 Second tensor input. Data type supported: U8. * @param[out] output Output tensor. Data type supported: U8. diff --git a/arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h b/arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h index de8dfef4ed..aa41fc0df2 100644 --- a/arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h +++ b/arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -32,16 +32,23 @@ namespace arm_compute class ITensor; class ITensorInfo; -/** Basic function to run @ref NEBoundingBoxTransformKernel. - * - * This function calls the following Neon kernels: - * -# @ref NEBoundingBoxTransformKernel - */ +/** Basic function to run @ref NEBoundingBoxTransformKernel. */ class NEBoundingBoxTransform : public INESimpleFunctionNoBorder { public: /** Set the input and output tensors. * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |dst | + * |:--------------|:--------------|:--------------| + * |QASYMM16 |QASYMM8 |QASYMM16 | + * |F16 |F16 |F16 | + * |F32 |F32 |F32 | + * * @param[in] boxes Source tensor. Bounding box proposals in pixel coordinates. Size(M, 4), format [x1, y1, x2, y2]. Data types supported: QASYMM16/F16/F32. * @param[out] pred_boxes Destination tensor. Pixel coordinates of the transformed bounding boxes. Size (M, 4*K), format [x1, y1, x2, y2]. Data types supported: Same as @p input * @param[in] deltas Bounding box translations and scales. Size (M, 4*K), format [dx, dy, dw, dh], K is the number of classes. @@ -50,7 +57,8 @@ public: * * @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct. 
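The bitwise functions documented above (NEBitwiseAnd/Not/Or/Xor) all share the same U8-only shape; a minimal sketch for NEBitwiseAnd stands in for the others (helper and tensor names illustrative; all tensors assumed pre-allocated U8 of identical shape):

    #include "arm_compute/runtime/NEON/functions/NEBitwiseAnd.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void and_masks(Tensor &mask_a, Tensor &mask_b, Tensor &mask_out)
    {
        NEBitwiseAnd band;
        band.configure(&mask_a, &mask_b, &mask_out); // elementwise a & b
        band.run();
    }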
*/ - void configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info); + void + configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info); /** Static function to check if given info will lead to a valid configuration of @ref NEBoundingBoxTransform * @@ -64,7 +72,10 @@ public: * * @return a Status */ - static Status validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info); + static Status validate(const ITensorInfo *boxes, + const ITensorInfo *pred_boxes, + const ITensorInfo *deltas, + const BoundingBoxTransformInfo &info); }; } // namespace arm_compute #endif /* ARM_COMPUTE_NEBOUNDINGBOXTRANSFORM_H */ diff --git a/arm_compute/runtime/NEON/functions/NECast.h b/arm_compute/runtime/NEON/functions/NECast.h index e536317660..43cae777f6 100644 --- a/arm_compute/runtime/NEON/functions/NECast.h +++ b/arm_compute/runtime/NEON/functions/NECast.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,33 +25,51 @@ #define ARM_COMPUTE_NECAST_H #include "arm_compute/core/Types.h" -#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" +#include "arm_compute/runtime/IFunction.h" + +#include <memory> namespace arm_compute { class ITensor; class ITensorInfo; -/** Basic function to run @ref NEDepthConvertLayerKernel. +/** Basic function to run @ref cpu::kernels::CpuCastKernel. * This function ignores the scale and zeroPoint of quanized tensors,so QASYMM8 input is treated as uint8 values. */ -class NECast : public INESimpleFunctionNoBorder +class NECast : public IFunction { public: + /** Constructor */ + NECast(); + /** Destructor */ + ~NECast(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NECast(const NECast &) = delete; + /** Default move constructor */ + NECast(NECast &&); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NECast &operator=(const NECast &) = delete; + /** Default move assignment operator */ + NECast &operator=(NECast &&); /** Initialize the function's source, destination * - * Input data type must be different than output data type. + * Valid data layouts: + * - All * - * Valid conversions Input -> Output : + * Valid data type configurations: + * |src |dst | + * |:--------------|:-----------------------------------------------| + * |QASYMM8_SIGNED | S16, S32, F32, F16 | + * |QASYMM8 | U16, S16, S32, F32, F16 | + * |U8 | U16, S16, S32, F32, F16 | + * |U16 | U8, U32 | + * |S16 | QASYMM8_SIGNED, U8, S32 | + * |F16 | QASYMM8_SIGNED, QASYMM8, F32, S32, U8 | + * |S32 | QASYMM8_SIGNED, QASYMM8, F16, F32, U8 | + * |F32 | QASYMM8_SIGNED, QASYMM8, BFLOAT16, F16, S32, U8| * - * - QASYMM8_SIGNED -> S16, S32, F32, F16 - * - QASYMM8 -> U16, S16, S32, F32, F16 - * - U8 -> U16, S16, S32, F32, F16 - * - U16 -> U8, U32 - * - S16 -> QASYMM8_SIGNED, U8, S32 - * - F16 -> QASYMM8_SIGNED, QASYMM8, F32, S32, U8 - * - S32 -> QASYMM8_SIGNED, QASYMM8, F16, F32, U8 - * - F32 -> QASYMM8_SIGNED, QASYMM8, BFLOAT16, F16, S32, U8 + * Input data type must be different than output data type. * * @param[in] input The input tensor to convert. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/F16/S32/F32. * @param[out] output The output tensor. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/S8/U16/S16/U32/S32/BFLOAT16/F16/F32. 
@@ -66,7 +84,14 @@ public: * * @return a status */ - static Status validate(ITensorInfo *input, ITensorInfo *output, ConvertPolicy policy); + static Status validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy); + + // Inherited methods overridden + void run() override; + +private: + struct Impl; + std::unique_ptr<Impl> _impl; }; } // namespace arm_compute #endif /*ARM_COMPUTE_NECAST_H*/ diff --git a/arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h b/arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h index aa11396c20..bc19e1a4af 100644 --- a/arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h +++ b/arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -44,6 +44,15 @@ class NEChannelShuffleLayer : public INESimpleFunctionNoBorder public: /** Initialize the function * + * Valid data layouts: + * - NCHW + * - NHWC + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |All |All | + * * @param[in] input Input tensor. Data types supported: All * @param[out] output Output tensor. Data type supported: Same as @p input * @param[in] num_groups Number of groups. Must be greater than 1 and the number of channels of the tensors must be a multiple of the number of groups. diff --git a/arm_compute/runtime/NEON/functions/NEConcatenateLayer.h b/arm_compute/runtime/NEON/functions/NEConcatenateLayer.h index 6aa724ab0c..1600f85488 100644 --- a/arm_compute/runtime/NEON/functions/NEConcatenateLayer.h +++ b/arm_compute/runtime/NEON/functions/NEConcatenateLayer.h @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_NECONCATENATELAYER_H #define ARM_COMPUTE_NECONCATENATELAYER_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" #include <memory> @@ -55,6 +54,17 @@ public: NEConcatenateLayer &operator=(NEConcatenateLayer &&); /** Initialise the kernel's inputs vector and output. * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |QASYMM8 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED | + * |F16 |F16 | + * |F32 |F32 | + * * @note Input and output tensor dimensions preconditions defer depending on the concatenation axis. * @note Preconditions can be found respectively at @ref cpu::kernels::CpuConcatenateWidthKernel, @ref cpu::kernels::CpuConcatenateHeightKernel, * @ref cpu::kernels::CpuConcatenateDepthKernel and @ref cpu::kernels::CpuConcatenateBatchKernel. @@ -76,7 +86,8 @@ public: * * @return a status */ - static Status validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis); + static Status + validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEConv3D.h b/arm_compute/runtime/NEON/functions/NEConv3D.h new file mode 100644 index 0000000000..525f37f3e7 --- /dev/null +++ b/arm_compute/runtime/NEON/functions/NEConv3D.h @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2021 Arm Limited. 
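A minimal sketch for the reworked NECast above, exercising one of the conversions from its table (helper name illustrative; src assumed a pre-allocated S32 tensor, dst F32):

    #include "arm_compute/runtime/NEON/functions/NECast.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void cast_s32_to_f32(Tensor &src, Tensor &dst)
    {
        NECast cast;
        cast.configure(&src, &dst, ConvertPolicy::SATURATE); // input and output types must differ
        cast.run();
    }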
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_NECONV3D_H +#define ARM_COMPUTE_NECONV3D_H + +#include "arm_compute/core/ITensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/FunctionDescriptors.h" +#include "arm_compute/runtime/IFunction.h" + +#include <memory> + +namespace arm_compute +{ +// Forward declarations +class ITensor; + +/** Basic function to simulate a 3d convolution. This function calls one of the following functions: + * -# @ref cpu::CpuDirectConv3d + * + */ +class NEConv3D : public IFunction +{ +public: + /** Constructor */ + NEConv3D(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEConv3D(const NEConv3D &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEConv3D &operator=(const NEConv3D &) = delete; + /** Default move constructor */ + NEConv3D(NEConv3D &&) = default; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEConv3D &operator=(NEConv3D &&) = default; + /** Default destructor */ + ~NEConv3D(); + /** Set the input and output tensors. + * + * Valid data layouts: + * - NDHWC + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:------------------|:------|:--------------| + * |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 | + * |QASYMM8 |QASYMM8 |S32 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | + * + * @param[in] input Source tensor. 4 lower dimensions represent a single input [IFM, width, height, depth], + * while every optional dimension from 5 and above represent a batch of inputs. + * @param[in] weights Weights tensor. Weights are 5D tensor with dimensions [OFM, IFM, kernel_x, kernel_y, kernel_z]. + * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. + * @param[out] output Destination tensor. 4 lower dimensions represent a single output [OFM, width, height, depth], while the rest represent batch of outputs. + * @param[in] conv_info Contains padding, stride, acitvation information described in @ref Conv3dInfo. 
+ */ + void configure( + ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv3dInfo &conv_info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to NEConv3D::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const Conv3dInfo &conv_info); + + // Inherited methods overridden: + void run() override; + +private: + struct Impl; + std::unique_ptr<Impl> _impl; +}; +} // namespace arm_compute +#endif /* ARM_COMPUTE_NECONV3D_H */ diff --git a/arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h b/arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h index 984e8d68c0..dc6b22d717 100644 --- a/arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h +++ b/arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,19 +24,16 @@ #ifndef ARM_COMPUTE_NECONVERTFULLYCONNECTEDWEIGHTS_H #define ARM_COMPUTE_NECONVERTFULLYCONNECTEDWEIGHTS_H +#include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/runtime/ITransformWeights.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "arm_compute/runtime/Tensor.h" -#include <memory> namespace arm_compute { // Forward declarations class ITensor; -class NEConvertFullyConnectedWeightsKernel; +class ITensorInfo; -/** Basic function to run @ref NEConvertFullyConnectedWeightsKernel. */ +/** Basic function to run @ref cpu::kernels::CpuConvertFullyConnectedWeightsKernel. */ class NEConvertFullyConnectedWeights : public IFunction { public: @@ -54,12 +51,22 @@ public: ~NEConvertFullyConnectedWeights(); /** Initialize the function. * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |All |All | + * * @param[in] input Source weights tensor to convert. Must be 2 dimensional. Data types supported: All. * @param[out] output The converted weights tensor. Shape and Data Type: Same as @p input. * @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer). * @param[in] data_layout The data layout the weights have been trained in. */ - void configure(const ITensor *input, ITensor *output, const TensorShape &original_input_shape, DataLayout data_layout); + void + configure(const ITensor *input, ITensor *output, const TensorShape &original_input_shape, DataLayout data_layout); /** Static function to check if given info will lead to a valid configuration of @ref NEConvertFullyConnectedWeights * * @param[in] input Source weights tensor info to convert. Must be 2 dimensional. Data types supported: All. @@ -69,53 +76,17 @@ public: * * @return A Status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape, DataLayout data_layout); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const TensorShape &original_input_shape, + DataLayout data_layout); // Inherited methods overriden: void run() override; private: - std::unique_ptr<NEConvertFullyConnectedWeightsKernel> _kernel; -}; - -namespace weights_transformations -{ -/** Basic function to run @ref NEConvertFullyConnectedWeightsKernel. 
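A minimal sketch for the new NEConv3D above, assuming Conv3dInfo from FunctionDescriptors.h is default-constructible with unit strides/dilation and no padding (helper name illustrative; tensors assumed pre-allocated NDHWC F32):

    #include "arm_compute/core/Error.h"
    #include "arm_compute/runtime/FunctionDescriptors.h"
    #include "arm_compute/runtime/NEON/functions/NEConv3D.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void direct_conv3d(Tensor &src, Tensor &weights, Tensor &biases, Tensor &dst)
    {
        Conv3dInfo conv_info{}; // assumed defaults: unit strides/dilation, no padding
        ARM_COMPUTE_ERROR_THROW_ON(
            NEConv3D::validate(src.info(), weights.info(), biases.info(), dst.info(), conv_info));
        NEConv3D conv;
        conv.configure(&src, &weights, &biases, &dst, conv_info);
        conv.run();
    }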
*/ -class NEConvertFullyConnectedWeightsManaged : public ITransformWeights -{ -public: - void run() override - { - _output.allocator()->allocate(); - _func.run(); - _reshape_run = true; - } - - void release() override - { - _output.allocator()->free(); - } - - ITensor *get_weights() override - { - return &_output; - } - - uint32_t uid() override - { - return _uid; - } - - void configure(const ITensor *input, const TensorShape &original_input_shape, DataLayout data_layout) - { - _func.configure(input, &_output, original_input_shape, data_layout); - } - -private: - static constexpr uint32_t _uid = 0x4; - Tensor _output{}; - NEConvertFullyConnectedWeights _func{}; + struct Impl; + std::unique_ptr<Impl> _impl; }; -} // namespace weights_transformations } // namespace arm_compute #endif /* ARM_COMPUTE_NECONVERTFULLYCONNECTEDWEIGHTS_H */ diff --git a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h index a387255b6c..2d07980ade 100644 --- a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,13 +21,13 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_NECONVOLUTIONLAYER_H -#define ARM_COMPUTE_NECONVOLUTIONLAYER_H - -#include "arm_compute/runtime/IFunction.h" +#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NECONVOLUTIONLAYER_H +#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NECONVOLUTIONLAYER_H #include "arm_compute/core/ITensorInfo.h" #include "arm_compute/core/Types.h" +#include "arm_compute/function_info/ActivationLayerInfo.h" +#include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/MemoryGroup.h" #include <memory> @@ -37,10 +37,10 @@ namespace arm_compute // Forward declarations class ITensor; -/** Basic function to simulate a convolution layer. This function calls one of the following Neon functions: - * -# @ref NEGEMMConvolutionLayer (executed only in case GEMM is required for the operation) - * -# @ref NEWinogradConvolutionLayer (executed only in case Winograd is required for the operation) - * -# @ref NEDirectConvolutionLayer (executed only in case Direct Convolution is required for the operation) +/** Basic function to simulate a convolution layer. 
This function calls one of the following functions: + * -# @ref cpu::CpuGemmConv2d (executed only in case GEMM is required for the operation) + * -# @ref cpu::CpuWinogradConv2d (executed only in case Winograd is required for the operation) + * -# @ref cpu::CpuDirectConv2d (executed only in case Direct Convolution is required for the operation) * -# @ref NEFFTConvolutionLayer (executed only in case FFT is required for the operation) * * @@ -78,46 +78,70 @@ public: NEConvolutionLayer(const NEConvolutionLayer &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ NEConvolutionLayer &operator=(const NEConvolutionLayer &) = delete; + /** Default move constructor */ + NEConvolutionLayer(NEConvolutionLayer &&) = default; /** Prevent instances of this class from being moved (As this class contains non movable objects) */ - NEConvolutionLayer(NEConvolutionLayer &&) = delete; - /** Prevent instances of this class from being moved (As this class contains non movable objects) */ - NEConvolutionLayer &operator=(NEConvolutionLayer &&) = delete; + NEConvolutionLayer &operator=(NEConvolutionLayer &&) = default; /** Default destructor */ - ~NEConvolutionLayer() = default; + ~NEConvolutionLayer(); /** Set the input and output tensors. * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:------------------|:------|:--------------| + * |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 | + * |QASYMM8 |QASYMM8 |S32 |QASYMM8 | + * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | + * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED | + * * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], * while every optional dimension from 4 and above represent a batch of inputs. * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input. + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. + * Data type supported: Same as @p input, also could be QSYMM8_PER_CHANNEL if input is QASYMM8/QASYMM8_SIGNED. * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. - * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type. + * Data type supported: Same as @p input, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type. * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. * Data types supported: Same as @p input. * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. * @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights - * tensor has also been transposed with NEGEMMTranspose1xWKernel. Data type supported: Same as @p input. + * tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p input. * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). 
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation * available which may introduce a drop of accuracy as well. Default is false * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported */ - void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(), - const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, unsigned int num_groups = 1); + void configure(ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false, + unsigned int num_groups = 1); /** Static function to check if given info will lead to a valid configuration of @ref NEConvolutionLayer * * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], * while every optional dimension from 4 and above represent a batch of inputs. * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input. + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. + * Data type supported:Same as @p input, also could be QSYMM8_PER_CHANNEL if input is QASYMM8/QASYMM8_SIGNED. * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. - * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type. + * Data type supported: Same as @p input, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type. * @param[in] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. * Data types supported: Same as @p input. * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. * @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights - * tensor has also been transposed with NEGEMMTranspose1xWKernel. Data type supported: Same as @p input. + * tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p input. * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). * @param[in] act_info (Optional) Activation layer information in case of a fused activation. * @param[in] enable_fast_math (Optional) Enable fast math computation. 
In case this flag were set, the function could dispatch the fastest implementation
@@ -126,20 +150,28 @@ public:
 *
 * @return a status
 */
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false,
- unsigned int num_groups = 1);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false,
+ unsigned int num_groups = 1);
 /** Static function to check if given info will return the convolution called by @ref NEConvolutionLayer
 *
 * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
 * while every optional dimension from 4 and above represent a batch of inputs.
 * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input.
+ * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+ * Data type supported: Same as @p input; can also be QSYMM8_PER_CHANNEL if input is QASYMM8/QASYMM8_SIGNED.
 * @param[in] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
 * Data types supported: Same as @p input.
 * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
 * @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights
- * tensor has also been transposed with NEGEMMTranspose1xWKernel. Data type supported: Same as @p input.
+ * tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p input.
 * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
 * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
 * @param[in] enable_fast_math (Optional) Enable fast math computation.
In case this flag were set, the function could dispatch the fastest implementation @@ -147,15 +179,21 @@ public: * * @return the Convolution Method Hint */ - static ConvolutionMethod get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); + static ConvolutionMethod get_convolution_method(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false); // Inherited methods overridden: void run() override; void prepare() override; private: - std::shared_ptr<IMemoryManager> _memory_manager; - std::unique_ptr<IFunction> _function; /**< Function to run */ + struct Impl; + std::unique_ptr<Impl> _impl; }; } // namespace arm_compute -#endif /* ARM_COMPUTE_NECONVOLUTIONLAYER_H */ +#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NECONVOLUTIONLAYER_H diff --git a/arm_compute/runtime/NEON/functions/NECopy.h b/arm_compute/runtime/NEON/functions/NECopy.h index 56f8bd9803..840c03e968 100644 --- a/arm_compute/runtime/NEON/functions/NECopy.h +++ b/arm_compute/runtime/NEON/functions/NECopy.h @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_NECOPY_H #define ARM_COMPUTE_NECOPY_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" #include <memory> @@ -53,6 +52,14 @@ public: NECopy &operator=(NECopy &&); /** Initialise the function's source and destination. * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |All |All | + * * @param[in] input Source tensor. Data types supported: All * @param[out] output Output tensor. Data types supported: Same as @p input. * diff --git a/arm_compute/runtime/NEON/functions/NECropResize.h b/arm_compute/runtime/NEON/functions/NECropResize.h index 5c3733f8ee..f806762158 100644 --- a/arm_compute/runtime/NEON/functions/NECropResize.h +++ b/arm_compute/runtime/NEON/functions/NECropResize.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -31,6 +31,7 @@ namespace arm_compute { // Forward Declarations +class Tensor; class ITensor; class NECropKernel; @@ -53,6 +54,14 @@ public: /** Configure kernel * + * Valid data layouts: + * - NHWC + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------|:--------|:------|:--------| + * |All |F32 |F32 |F32 | + * * @note Supported tensor rank: up to 4 * @note Box indices may be outside of the bounds, in which case @p extrapolation_value is used. * @note Start and end indices of boxes are inclusive. @@ -66,8 +75,13 @@ public: * @param[in] method The policy to be used when resizing image. Default is bilinear. * @param[in] extrapolation_value Value to be used for values outside of the image for cropping and resizing. Default is 0. 
*/ - void configure(const ITensor *input, const ITensor *boxes, const ITensor *box_ind, ITensor *output, Coordinates2D crop_size, - InterpolationPolicy method = InterpolationPolicy::BILINEAR, float extrapolation_value = 0); + void configure(const ITensor *input, + const ITensor *boxes, + const ITensor *box_ind, + ITensor *output, + Coordinates2D crop_size, + InterpolationPolicy method = InterpolationPolicy::BILINEAR, + float extrapolation_value = 0); /** Static function to check if given info will lead to a valid configuration of @ref NESlice * @@ -87,8 +101,13 @@ public: * * @return A status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *boxes, const ITensorInfo *box_ind, const ITensorInfo *output, - Coordinates2D crop_size, InterpolationPolicy method, float extrapolation_value); + static Status validate(const ITensorInfo *input, + const ITensorInfo *boxes, + const ITensorInfo *box_ind, + const ITensorInfo *output, + Coordinates2D crop_size, + InterpolationPolicy method, + float extrapolation_value); void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h index 02a0f784ec..aabe42f928 100644 --- a/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,15 +24,14 @@ #ifndef ARM_COMPUTE_NEDECONVOLUTIONLAYER_H #define ARM_COMPUTE_NEDECONVOLUTIONLAYER_H -#include "arm_compute/runtime/CPP/functions/CPPUpsample.h" -#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h" -#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h" -#include "arm_compute/runtime/NEON/functions/NEReverse.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CPP/functions/CPPUpsample.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h" +#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h" +#include "arm_compute/runtime/NEON/functions/NEReverse.h" #include "arm_compute/runtime/Tensor.h" #include <memory> @@ -64,11 +63,10 @@ namespace arm_compute * The weights used by Deconvolution are supposed to be the same as the ones used for Convolution. Therefore, it will be necessary to use the weights in the * reverse order to perform an actual convolution. This is achieved by using @ref NEReverse. 
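Both the convolution and deconvolution interfaces in this patch keep the same configure()/validate()/run() pattern; only the signatures have been reflowed and the quantized weight types extended. For readers new to the API, a minimal NEConvolutionLayer usage sketch follows; every shape, stride and padding value in it is an illustrative assumption, not something taken from this patch:

// Hedged sketch: configure and run a 3x3 F32 convolution with the reworked
// NEConvolutionLayer interface shown above. Shapes/parameters are illustrative.
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor src, weights, biases, dst;
    // NCHW shapes: 32x32 input with 16 IFM, 3x3 kernel producing 8 OFM.
    src.allocator()->init(TensorInfo(TensorShape(32U, 32U, 16U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 16U, 8U), 1, DataType::F32));
    biases.allocator()->init(TensorInfo(TensorShape(8U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(32U, 32U, 8U), 1, DataType::F32));

    // Stride 1, pad 1 keeps the spatial dimensions.
    const PadStrideInfo conv_info(1, 1, 1, 1);

    // The static check mirrors configure() and reports why a setup is rejected.
    const Status status =
        NEConvolutionLayer::validate(src.info(), weights.info(), biases.info(), dst.info(), conv_info);
    if(!bool(status))
    {
        return 1;
    }

    NEConvolutionLayer conv; // dispatches GEMM / Winograd / direct / FFT internally
    conv.configure(&src, &weights, &biases, &dst, conv_info);

    src.allocator()->allocate();
    weights.allocator()->allocate();
    biases.allocator()->allocate();
    dst.allocator()->allocate();

    conv.run();
    return 0;
}
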
 *
- * This function calls the following Neon kernels/functions:
+ * This function calls the following kernels/functions:
 *
 * -# @ref CPPUpsample
 * -# @ref NEConvolutionLayer
- * -# @ref NEPermute
 * -# @ref NEReverse
 *
 */
@@ -77,39 +75,77 @@ class NEDeconvolutionLayer : public IFunction
 public:
 /** Constructor */
 NEDeconvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
 NEDeconvolutionLayer(const NEDeconvolutionLayer &) = delete;
+ /** Default move constructor */
+ NEDeconvolutionLayer(NEDeconvolutionLayer &&) = default;
 /** Prevent instances of this class from being copied (As this class contains pointers) */
 NEDeconvolutionLayer &operator=(const NEDeconvolutionLayer &) = delete;
- /** Prevent instances of this class from being moved (As this class contains pointers) */
- NEDeconvolutionLayer(NEDeconvolutionLayer &&) = delete;
- /** Prevent instances of this class from being moved (As this class contains pointers) */
- NEDeconvolutionLayer &operator=(NEDeconvolutionLayer &&) = delete;
+ /** Default move assignment operator */
+ NEDeconvolutionLayer &operator=(NEDeconvolutionLayer &&) = default;
 /** Default destructor */
- virtual ~NEDeconvolutionLayer() = default;
+ ~NEDeconvolutionLayer() = default;
 /** Set the input, weights, biases and output tensors.
 *
- * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED.
- * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type supported: Same as @p input.
- * @param[in] bias Optional, ignored if NULL. The biases have one dimension. Data type supported: Data types supported: S32 for QASYMM8 and QASYMM8_SIGNED input, F32 for F32 input, F16 for F16 input.
- * @param[out] output Output tensor. The output has the same number of dimensions as the @p input.
- * @param[in] info Contains padding and policies to be used in the deconvolution, this is decribed in @ref PadStrideInfo.
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:------------------|:------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
+ * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
+ * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED |
+ *
+ * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs.
+ * Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED.
+ * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM].
+ * Data type supported: Same as @p input; can also be QSYMM8_PER_CHANNEL if input is QASYMM8/QASYMM8_SIGNED.
+ * @param[in] bias Optional, ignored if NULL. The biases have one dimension.
+ * Data type supported: S32 for QASYMM8/QASYMM8_SIGNED input, F32 for F32 input, F16 for F16 input.
+ * @param[out] output Output tensor. The output has the same number of dimensions as the @p input.
+ * @param[in] info Contains padding and policies to be used in the deconvolution; this is described in @ref PadStrideInfo.
+ * @param[in] enable_fast_math (Optional) Enable fast math computation.
If this flag is set, the function may dispatch the fastest implementation
+ * available, which may also introduce a drop in accuracy. Default is false
+ * @param[in] weights_info (Optional) Specifies the weight format. Default is unspecified. This parameter can be used to specify the weight format that is optimal for
+ * the GEMM convolution.
 *
 */
- void configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &info);
+ void configure(ITensor *input,
+ const ITensor *weights,
+ const ITensor *bias,
+ ITensor *output,
+ const PadStrideInfo &info,
+ bool enable_fast_math = false,
+ const WeightsInfo &weights_info = WeightsInfo());
 /** Static function to check if given info will lead to a valid configuration of @ref NEDeconvolutionLayer
 *
- * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED.
- * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data type supported: Same as @p input.
- * @param[in] bias (Optional) The biases have one dimension. Data type supported: Data types supported: S32 for QASYMM8 and QASYMM8_SIGNED input, F32 for F32 input, F16 for F16 input.
- * @param[in] output Output tensor info. The output has the same number of dimensions as the @p input.
- * @param[in] info Contains padding and policies to be used in the deconvolution, this is decribed in @ref PadStrideInfo.
+ * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs.
+ * Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED.
+ * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM].
+ * Data type supported: Same as @p input; can also be QSYMM8_PER_CHANNEL if input is QASYMM8/QASYMM8_SIGNED.
+ * @param[in] bias (Optional) The biases have one dimension. Data type supported: S32 for QASYMM8/QASYMM8_SIGNED input, F32 for F32 input, F16 for F16 input.
+ * @param[in] output Output tensor info. The output has the same number of dimensions as the @p input.
+ * @param[in] info Contains padding and policies to be used in the deconvolution; this is described in @ref PadStrideInfo.
+ * @param[in] enable_fast_math (Optional) Enable fast math computation. If this flag is set, the function may dispatch the fastest implementation
+ * available, which may also introduce a drop in accuracy. Default is false
+ * @param[in] weights_info (Optional) Specifies the weight format. Default is unspecified. This parameter can be used to specify the weight format that is optimal for
+ * the GEMM convolution.
 *
 * @return a status
 */
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ const PadStrideInfo &info,
+ bool enable_fast_math = false,
+ const WeightsInfo &weights_info = WeightsInfo());

 // Inherited methods overridden:
 void run() override;
@@ -127,6 +163,7 @@ private:
 ITensor *_input;
 PadStrideInfo _info;
 bool _is_prepared;
+ bool _do_upsampling;
 };
-} // arm_compute
+} // namespace arm_compute
 #endif /* ARM_COMPUTE_NEDECONVOLUTIONLAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h b/arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h
index c9817a63c1..7bfdfbd13d 100644
--- a/arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -25,38 +25,48 @@
 #define ARM_COMPUTE_NEDEPTHCONVERT_H
 #include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+#include "arm_compute/runtime/IFunction.h"
-#include <cstdint>
+#include <memory>
 namespace arm_compute
 {
 class ITensor;
 class ITensorInfo;
-/**Basic function to run @ref NEDepthConvertLayerKernel */
-class NEDepthConvertLayer : public INESimpleFunctionNoBorder
+/** Basic function to run @ref cpu::kernels::CpuCastKernel */
+class NEDepthConvertLayer : public IFunction
 {
 public:
- /* Contructor */
- NEDepthConvertLayer() = default;
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
+ /** Constructor */
+ NEDepthConvertLayer();
+ /** Destructor */
+ ~NEDepthConvertLayer();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
 NEDepthConvertLayer(const NEDepthConvertLayer &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- const NEDepthConvertLayer &operator=(const NEDepthConvertLayer &) = delete;
- /** Default destructor */
- ~NEDepthConvertLayer() = default;
+ /** Default move constructor */
+ NEDepthConvertLayer(NEDepthConvertLayer &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEDepthConvertLayer &operator=(const NEDepthConvertLayer &) = delete;
+ /** Default move assignment operator */
+ NEDepthConvertLayer &operator=(NEDepthConvertLayer &&);
 /** Initialize the function's source, destination
 *
- * Valid conversions Input -> Output :
+ * Valid data layouts:
+ * - All
 *
- * - QASYMM8 -> F16, F32
- * - U8 -> U16, S16, S32
- * - U16 -> U8, U32
- * - S16 -> U8, S32
- * - BFLOAT16 -> F32
- * - F16 -> QASYMM8, F32
- * - F32 -> QASYMM8, F16, BFLOAT16
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------------------|
+ * |QASYMM8 | F16, F32 |
+ * |U8 | U16, S16, S32 |
+ * |U16 | U8, U32 |
+ * |S16 | U8, S32 |
+ * |BFLOAT16 | F32 |
+ * |F16 | QASYMM8, F32 |
+ * |F32 | QASYMM8, F16, BFLOAT16 |
+ *
+ * Input data type must be different from output data type.
 *
 * @param[in] input The input tensor to convert. Data types supported: QASYMM8/U8/U16/S16/BFLOAT16/F16/F32.
 * @param[out] output The output tensor. Data types supported: QASYMM8/U8/U16/S16/U32/S32/BFLOAT16/F16/F32.
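The rework above routes NEDepthConvertLayer through the shared cast kernel behind an Impl pointer, but the public entry points stay the same. A minimal sketch of the configure()/run() flow; the U8-to-S32 pair (one of the valid combinations in the table above) and the 16x16 shape are illustrative assumptions:

// Hedged sketch: cast a U8 tensor to S32 via NEDepthConvertLayer.
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::U8));
    dst.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::S32));

    NEDepthConvertLayer convert;
    convert.configure(&src, &dst, ConvertPolicy::SATURATE); // shift defaults to 0

    src.allocator()->allocate();
    dst.allocator()->allocate();
    convert.run();
    return 0;
}
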
@@ -73,7 +83,15 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift = 0); + static Status + validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift = 0); + + // Inherited methods overridden + void run() override; + +private: + struct Impl; + std::unique_ptr<Impl> _impl; }; } // namespace arm_compute #endif /*ARM_COMPUTE_NEDEPTHCONVERT_H*/ diff --git a/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h b/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h index 51f7ff7770..d27369670e 100644 --- a/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h +++ b/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,26 +21,27 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_NEDEPTHTOSPACELAYER_H -#define ARM_COMPUTE_NEDEPTHTOSPACELAYER_H +#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEDEPTHTOSPACELAYER_H +#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEDEPTHTOSPACELAYER_H +#include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" +#include <memory> namespace arm_compute { // Forward declarations class ITensor; class ITensorInfo; +class NEDepthToSpaceLayerKernel; /** Basic function to run @ref NEDepthToSpaceLayerKernel. */ -class NEDepthToSpaceLayer : public INESimpleFunctionNoBorder +class NEDepthToSpaceLayer : public IFunction { public: /** Constructor */ - NEDepthToSpaceLayer() = default; + NEDepthToSpaceLayer(); /** Prevent instances of this class from being copied (As this class contains pointers) */ NEDepthToSpaceLayer(const NEDepthToSpaceLayer &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ @@ -50,9 +51,18 @@ public: /** Prevent instances of this class from being moved (As this class contains non movable objects) */ NEDepthToSpaceLayer &operator=(NEDepthToSpaceLayer &&) = delete; /** Default destructor */ - ~NEDepthToSpaceLayer() = default; + ~NEDepthToSpaceLayer(); /** Set the input and output tensors. * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |All |All | + * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All * @param[out] output Tensor output. Data types supported: same as @p input * @param[in] block_shape Block shape value. 
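NEDepthToSpaceLayer now owns its kernel through a unique_ptr and overrides run(), while configure()/validate() keep their shapes. A short sketch; block shape 2 and the [4, 4, 8, 1] input (which becomes [8, 8, 2, 1]) are illustrative assumptions:

// Hedged sketch: depth-to-space with block_shape = 2 on an NCHW F32 tensor.
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor src, dst;
    // 8 channels / (2 * 2) = 2 channels; spatial dimensions double.
    src.allocator()->init(TensorInfo(TensorShape(4U, 4U, 8U, 1U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(8U, 8U, 2U, 1U), 1, DataType::F32));

    NEDepthToSpaceLayer d2s;
    d2s.configure(&src, &dst, 2 /* block_shape */);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    d2s.run();
    return 0;
}
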
@@ -67,6 +77,11 @@ public: * @return a status */ static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape); + + void run() override; + +private: + std::unique_ptr<NEDepthToSpaceLayerKernel> _kernel; }; } // namespace arm_compute -#endif /* ARM_COMPUTE_NEDEPTHTOSPACELAYER_H */ +#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEDEPTHTOSPACELAYER_H diff --git a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h index 98fffe0b33..6ad5aa7bfa 100644 --- a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h @@ -24,9 +24,11 @@ #ifndef ARM_COMPUTE_NEDEPTHWISECONVOLUTION_H #define ARM_COMPUTE_NEDEPTHWISECONVOLUTION_H +#include "arm_compute/runtime/IMemoryManager.h" +#include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" #include "arm_compute/runtime/NEON/functions/NEPermute.h" -#include "arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h" + #include <memory> namespace arm_compute @@ -54,6 +56,20 @@ public: ~NEDepthwiseConvolutionLayer(); /** Initialize the function's source, destination, weights and convolution information. * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:------------------|:------|:--------------| + * |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 | + * |QASYMM8 |QASYMM8 |S32 |QASYMM8 | + * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | + * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED | + * * @param[in, out] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32 * @param[out] output Destination tensor. Data type supported: same as @p input. * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. @@ -65,8 +81,14 @@ public: * @param[in] act_info (Optional) Activation layer information in case of a fused activation. * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). 
*/ - void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U)); + void configure(ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier = 1, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + const Size2D &dilation = Size2D(1U, 1U)); /** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseConvolutionLayer * @@ -83,40 +105,27 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U)); + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier = 1, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + const Size2D &dilation = Size2D(1U, 1U)); // Inherited methods overriden: void run() override; void prepare() override; private: - /** Static function to choose the best depthwise convolution function for @ref NEDepthwiseConvolutionLayer - * - * @param[in] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32 - * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. - * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED. - * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED. - * @param[in] output Destination tensor. Data type supported: same as @p input. - * @param[in] conv_info Padding and stride information to use for the convolution. - * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU for 3x3 quantized are supported. - * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). - * - * @return a Depthwise Convolution Function - */ - static DepthwiseConvolutionFunction get_depthwiseconvolution_function(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, - const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, - ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U)); - - /** Basic function to execute optimized depthwise convolution routines. This function calls the following Neon kernels: + /** Basic function to execute optimized depthwise convolution routines. 
This function calls the following kernels: * * @note At the moment 3x3 and 5x5 convolution of stride 1, 2 are supported * * -# @ref NEFillBorderKernel (if pad_x or pad_y > 0) and no assembly kernel implementation is present * -# @ref NEDepthwiseConvolutionLayer3x3Kernel if 3x3 and no assembly kernel implementation is present - * -# @ref NEDepthwiseConvolutionAssemblyDispatch if assembly kernel implementation is present + * -# @ref cpu::CpuDepthwiseConvolutionAssemblyDispatch if assembly kernel implementation is present * -# @ref NEDirectConvolutionLayerOutputStageKernel if re-quantization of output is required * -# @ref NEActivationLayer if fused activation is required * @@ -131,9 +140,11 @@ private: /** Default move constructor */ NEDepthwiseConvolutionLayerOptimizedInternal(NEDepthwiseConvolutionLayerOptimizedInternal &&) = default; /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEDepthwiseConvolutionLayerOptimizedInternal &operator=(const NEDepthwiseConvolutionLayerOptimizedInternal &) = delete; + NEDepthwiseConvolutionLayerOptimizedInternal & + operator=(const NEDepthwiseConvolutionLayerOptimizedInternal &) = delete; /** Default move assignment operator */ - NEDepthwiseConvolutionLayerOptimizedInternal &operator=(NEDepthwiseConvolutionLayerOptimizedInternal &&) = default; + NEDepthwiseConvolutionLayerOptimizedInternal & + operator=(NEDepthwiseConvolutionLayerOptimizedInternal &&) = default; /** Default destructor */ ~NEDepthwiseConvolutionLayerOptimizedInternal() = default; /** Initialize the function's source, destination, kernels and border_size. @@ -148,8 +159,14 @@ private: * @param[in] act_info (Optional) Activation layer information in case of a fused activation. * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). 
*/ - void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U)); + void configure(ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier = 1, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + const Size2D &dilation = Size2D(1U, 1U)); /** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseConvolutionLayer3x3 * @@ -165,34 +182,26 @@ private: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U)); + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier = 1, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + const Size2D &dilation = Size2D(1U, 1U)); // Inherited methods overriden: void run() override; void prepare() override; private: - MemoryGroup _memory_group; - NEDepthwiseConvolutionAssemblyDispatch _dwc_optimized_func; - NEPermute _permute_input; - NEPermute _permute_weights; - NEPermute _permute_output; - NEActivationLayer _activationlayer_function; - Tensor _accumulator; - Tensor _permuted_input; - Tensor _permuted_weights; - Tensor _permuted_output; - const ITensor *_original_weights; - bool _has_bias; - bool _is_quantized; - bool _is_nchw; - bool _permute; - bool _is_activationlayer_enabled; - bool _is_prepared; + MemoryGroup _memory_group; + struct Impl; + std::unique_ptr<Impl> _impl; }; - /** Basic function to execute a generic depthwise convolution. This function calls the following Neon kernel: + /** Basic function to execute a generic depthwise convolution. This function calls the following kernel: * * -# @ref NEDepthwiseConvolutionLayerNativeKernel * @@ -225,8 +234,14 @@ private: * @param[in] act_info (Optional) Activation layer information in case of a fused activation. * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). 
*/ - void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U)); + void configure(ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier = 1, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + const Size2D &dilation = Size2D(1U, 1U)); /** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseConvolutionLayerGeneric * @@ -243,31 +258,25 @@ private: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U)); + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier = 1, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + const Size2D &dilation = Size2D(1U, 1U)); // Inherited methods overriden: void run() override; - void prepare() override; private: - std::unique_ptr<NEDepthwiseConvolutionLayerNativeKernel> _depthwise_conv_kernel; - NEPermute _permute_input; - NEPermute _permute_weights; - NEPermute _permute_output; - NEActivationLayer _activationlayer_function; - Tensor _permuted_input; - Tensor _permuted_weights; - Tensor _permuted_output; - bool _is_prepared; - bool _is_nchw; - bool _is_activationlayer_enabled; - const ITensor *_original_weights; + struct Impl; + std::unique_ptr<Impl> _impl; }; - - DepthwiseConvolutionFunction _depth_conv_func; - NEDepthwiseConvolutionLayerOptimizedInternal _func_optimized; - NEDepthwiseConvolutionLayerGeneric _func_generic; + MemoryGroup _memory_group; + struct Impl; + std::unique_ptr<Impl> _impl; }; } // namespace arm_compute #endif /* ARM_COMPUTE_NEDEPTHWISECONVOLUTION_H */ diff --git a/arm_compute/runtime/NEON/functions/NEDequantizationLayer.h b/arm_compute/runtime/NEON/functions/NEDequantizationLayer.h index f52d709c74..8b49930ef5 100644 --- a/arm_compute/runtime/NEON/functions/NEDequantizationLayer.h +++ b/arm_compute/runtime/NEON/functions/NEDequantizationLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -24,9 +24,10 @@ #ifndef ARM_COMPUTE_NEDEQUANTIZATIONLAYER_H #define ARM_COMPUTE_NEDEQUANTIZATIONLAYER_H -#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" + +#include <memory> namespace arm_compute { @@ -34,12 +35,36 @@ namespace arm_compute class ITensor; class ITensorInfo; -/** Basic function to run @ref NEDequantizationLayerKernel that dequantizes an input tensor */ -class NEDequantizationLayer : public INESimpleFunctionNoBorder +/** Basic function to run @ref cpu::CpuDequantize that dequantizes an input tensor */ +class NEDequantizationLayer : public IFunction { public: + /** Default Constructor */ + NEDequantizationLayer(); + /** Default Destructor */ + ~NEDequantizationLayer(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEDequantizationLayer(const NEDequantizationLayer &) = delete; + /** Default move constructor */ + NEDequantizationLayer(NEDequantizationLayer &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEDequantizationLayer &operator=(const NEDequantizationLayer &) = delete; + /** Default move assignment operator */ + NEDequantizationLayer &operator=(NEDequantizationLayer &&) = default; /** Configure the kernel. * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:------------------|:-----------| + * |QASYMM8 |F16, F32 | + * |QASYMM8_SIGNED |F16, F32 | + * |QSYMM8_PER_CHANNEL |F16, F32 | + * |QSYMM8 |F16, F32 | + * |QSYMM16 |F16, F32 | + * * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16. * @param[out] output Destination tensor with the same dimensions of input. Data type supported: F16/F32. */ @@ -52,6 +77,13 @@ public: * @return a status */ static Status validate(const ITensorInfo *input, const ITensorInfo *output); + + // Inherited methods overridden: + void run() override; + +private: + struct Impl; + std::unique_ptr<Impl> _impl; }; } // namespace arm_compute #endif /* ARM_COMPUTE_NEDEQUANTIZATIONLAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h b/arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h index d5c1f0ab6f..7a94833d10 100644 --- a/arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h +++ b/arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -24,13 +24,12 @@ #ifndef ARM_COMPUTE_NE_DETECTION_POSTPROCESS_H #define ARM_COMPUTE_NE_DETECTION_POSTPROCESS_H -#include "arm_compute/runtime/NEON/INESimpleFunction.h" - #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CPP/functions/CPPDetectionPostProcessLayer.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h" +#include "arm_compute/runtime/NEON/INESimpleFunction.h" #include "arm_compute/runtime/Tensor.h" #include <map> @@ -57,6 +56,16 @@ public: ~NEDetectionPostProcessLayer() = default; /** Configure the detection output layer NE function * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 - src2 |dst0 - dst3 | + * |:--------------|:--------------| + * |QASYMM8 |F32 | + * |QASYMM8_SIGNED |F32 | + * |F32 |F32 | + * * @param[in] input_box_encoding The bounding box input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F32. * @param[in] input_score The class prediction input tensor. Data types supported: same as @p input_box_encoding. * @param[in] input_anchors The anchors input tensor. Data types supported: same as @p input_box_encoding. @@ -68,8 +77,14 @@ public: * * @note Output contains all the detections. Of those, only the ones selected by the valid region are valid. */ - void configure(const ITensor *input_box_encoding, const ITensor *input_score, const ITensor *input_anchors, - ITensor *output_boxes, ITensor *output_classes, ITensor *output_scores, ITensor *num_detection, DetectionPostProcessLayerInfo info = DetectionPostProcessLayerInfo()); + void configure(const ITensor *input_box_encoding, + const ITensor *input_score, + const ITensor *input_anchors, + ITensor *output_boxes, + ITensor *output_classes, + ITensor *output_scores, + ITensor *num_detection, + DetectionPostProcessLayerInfo info = DetectionPostProcessLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEDetectionPostProcessLayer * * @param[in] input_box_encoding The bounding box input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F32. @@ -83,8 +98,13 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input_box_encoding, const ITensorInfo *input_class_score, const ITensorInfo *input_anchors, - ITensorInfo *output_boxes, ITensorInfo *output_classes, ITensorInfo *output_scores, ITensorInfo *num_detection, + static Status validate(const ITensorInfo *input_box_encoding, + const ITensorInfo *input_class_score, + const ITensorInfo *input_anchors, + ITensorInfo *output_boxes, + ITensorInfo *output_classes, + ITensorInfo *output_scores, + ITensorInfo *num_detection, DetectionPostProcessLayerInfo info = DetectionPostProcessLayerInfo()); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h index ff0c3054d8..3ae3b2a15c 100644 --- a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -25,27 +25,22 @@ #define ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYER_H #include "arm_compute/core/Types.h" +#include "arm_compute/function_info/ActivationLayerInfo.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/MemoryGroup.h" -#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" -#include "arm_compute/runtime/Tensor.h" #include <memory> namespace arm_compute { -class NEDirectConvolutionLayerOutputStageKernel; -class NEDirectConvolutionLayerKernel; -class NEFillBorderKernel; - +class ITensor; +class ITensorInfo; /** Function to run the direct convolution. * - * This function calls the following Neon kernels: + * This function calls the following: * - * -# @ref NEFillBorderKernel for the input - * -# @ref NEDirectConvolutionLayerOutputStageKernel - * -# @ref NEDirectConvolutionLayerKernel + * -# @ref cpu::CpuDirectConv2d */ class NEDirectConvolutionLayer : public IFunction { @@ -64,6 +59,16 @@ public: ~NEDirectConvolutionLayer(); /** Set the input, weights, biases and output tensors. * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:------|:------|:------|:------| + * |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 | + * * @note: DirectConvolution only works in the following configurations: * 1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F16/F32 * 3x3 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F16/F32 @@ -80,7 +85,12 @@ public: * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(ITensor *input, + const ITensor *weights, + const ITensor *bias, + ITensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEDirectConvolutionLayer * * @note: DirectConvolution only works in the following configurations: @@ -101,23 +111,20 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &conv_info, + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *output, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run() override; private: - MemoryGroup _memory_group; - std::unique_ptr<NEDirectConvolutionLayerOutputStageKernel> _output_stage_kernel; - std::unique_ptr<NEDirectConvolutionLayerKernel> _conv_kernel; - std::unique_ptr<NEFillBorderKernel> _input_border_handler; - NEActivationLayer _activationlayer_function; - Tensor _accumulator; - bool _has_bias; - bool _is_activationlayer_enabled; - unsigned int _dim_split; - bool _is_padding_required; + struct Impl; + std::shared_ptr<IMemoryManager> _memory_manager; + std::unique_ptr<Impl> _impl; }; -} +} // namespace arm_compute #endif /* ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NEElementwiseOperations.h 
b/arm_compute/runtime/NEON/functions/NEElementwiseOperations.h index 44b70bbe85..ebf2277d1f 100644 --- a/arm_compute/runtime/NEON/functions/NEElementwiseOperations.h +++ b/arm_compute/runtime/NEON/functions/NEElementwiseOperations.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,6 +25,7 @@ #define ARM_COMPUTE_NEELEMENTWISEOPERATIONS_H #include "arm_compute/core/Types.h" +#include "arm_compute/function_info/ActivationLayerInfo.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/NEON/INEOperator.h" @@ -54,12 +55,28 @@ public: NEElementwiseMax &operator=(NEElementwiseMax &&); /** Initialise the kernel's inputs, output and conversion policy. * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |dst | + * |:--------------|:--------------|:--------------| + * |QASYMM8 |QASYMM8 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED | + * |S32 |S32 |S32 | + * |S16 |S16 |S16 | + * |F16 |F16 |F16 | + * |F32 |F32 |F32 | + * * @param[in, out] input1 First tensor input. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. * @param[in, out] input2 Second tensor input. Data types supported: Same as @p input1. * @param[out] output Output tensor. Data types supported: Same as @p input1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. */ - void configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(ITensor *input1, + ITensor *input2, + ITensor *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for max * * @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. @@ -69,7 +86,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run() override; @@ -101,12 +121,28 @@ public: NEElementwiseMin &operator=(NEElementwiseMin &&); /** Initialise the kernel's inputs, output and conversion policy. * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |dst | + * |:--------------|:--------------|:--------------| + * |QASYMM8 |QASYMM8 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED | + * |S32 |S32 |S32 | + * |S16 |S16 |S16 | + * |F16 |F16 |F16 | + * |F32 |F32 |F32 | + * * @param[in, out] input1 First tensor input. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. * @param[in, out] input2 Second tensor input. Data types supported: Same as @p input1. * @param[out] output Output tensor. Data types supported: Same as @p input1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. 
*/ - void configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(ITensor *input1, + ITensor *input2, + ITensor *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for min * * @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. @@ -116,7 +152,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run() override; @@ -148,12 +187,28 @@ public: NEElementwiseSquaredDiff &operator=(NEElementwiseSquaredDiff &&); /** Initialise the kernel's inputs, output and conversion policy. * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |dst | + * |:--------------|:--------------|:--------------| + * |QASYMM8 |QASYMM8 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED | + * |S32 |S32 |S32 | + * |S16 |S16 |S16 | + * |F16 |F16 |F16 | + * |F32 |F32 |F32 | + * * @param[in, out] input1 First tensor input. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. * @param[in, out] input2 Second tensor input. Data types supported: Same as @p input1. * @param[out] output Output tensor. Data types supported: Same as @p input1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. */ - void configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(ITensor *input1, + ITensor *input2, + ITensor *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for squared difference * * @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. @@ -163,7 +218,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run() override; @@ -195,12 +253,24 @@ public: NEElementwiseDivision &operator=(NEElementwiseDivision &&); /** Initialise the kernel's inputs, output and conversion policy. * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |dst | + * |:--------------|:--------------|:--------------| + * |F16 |F16 |F16 | + * |F32 |F32 |F32 | + * * @param[in, out] input1 First tensor input. Data types supported: F16/F32. * @param[in, out] input2 Second tensor input. Data types supported: Same as @p input1. * @param[out] output Output tensor. Data types supported: Same as @p input1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. 
*/ - void configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(ITensor *input1, + ITensor *input2, + ITensor *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for division * * @param[in] input1 First tensor input info. Data types supported: F16/F32. @@ -210,7 +280,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run() override; @@ -243,12 +316,24 @@ public: NEElementwisePower &operator=(NEElementwisePower &&); /** Initialise the kernel's inputs, output and conversion policy. * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |dst | + * |:--------------|:--------------|:--------------| + * |F16 |F16 |F16 | + * |F32 |F32 |F32 | + * * @param[in, out] input1 First tensor input. Data types supported: F16/F32. * @param[in, out] input2 Second tensor input. Data types supported: Same as @p input1. * @param[out] output Output tensor. Data types supported: Same as @p input1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. */ - void configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(ITensor *input1, + ITensor *input2, + ITensor *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for power * * @param[in] input1 First tensor input info. Data types supported: F16/F32. @@ -258,7 +343,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run() override; @@ -290,6 +378,20 @@ public: NEElementwiseComparison &operator=(NEElementwiseComparison &&); /** Initialise the kernel's inputs, output and conversion policy. * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |dst | + * |:--------------|:--------------|:-----| + * |QASYMM8 |QASYMM8 |U8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |U8 | + * |S32 |S32 |U8 | + * |U8 |U8 |U8 | + * |S16 |S16 |U8 | + * |F16 |F16 |U8 | + * |F32 |F32 |U8 | + * * @param[in, out] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. * @param[in, out] input2 Second tensor input. Data types supported: Same as @p input1. * @param[out] output Output tensor. Data types supported: U8. 
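The element-wise functions in this header all share one pattern: configure(input1, input2, output) plus an optional (currently unsupported) fused activation, and a matching static validate(). A minimal sketch using NEElementwiseMax; the F32 type and 32x32 shape are illustrative assumptions. The same pattern applies to Min, SquaredDiff, Division and Power, while Comparison additionally takes a ComparisonOperation and writes a U8 output:

// Hedged sketch: element-wise max of two F32 tensors.
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEElementwiseOperations.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor a, b, out;
    const TensorInfo info(TensorShape(32U, 32U), 1, DataType::F32);
    a.allocator()->init(info);
    b.allocator()->init(info);
    out.allocator()->init(info);

    NEElementwiseMax max_op;
    max_op.configure(&a, &b, &out); // act_info left at its disabled default

    a.allocator()->allocate();
    b.allocator()->allocate();
    out.allocator()->allocate();
    max_op.run();
    return 0;
}
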
@@ -305,7 +407,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation op); + static Status + validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation op); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayer.h b/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayer.h index 4786f71cf8..63e47b8377 100644 --- a/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayer.h +++ b/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayer.h @@ -54,6 +54,16 @@ public: /** Initialize the function * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |F16 |F16 | + * |F32 |F32 | + * |S32 |S32 | + * * @param[in] input Input tensor. Data types supported: F16/F32, F16/F32/S32 for NEG/ABS operations. * @param[out] output Output tensor. Data types supported: Same as @p input. */ diff --git a/arm_compute/runtime/NEON/functions/NEFFT1D.h b/arm_compute/runtime/NEON/functions/NEFFT1D.h index 04e8f81b69..99c6fd4eb4 100644 --- a/arm_compute/runtime/NEON/functions/NEFFT1D.h +++ b/arm_compute/runtime/NEON/functions/NEFFT1D.h @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_NEFFT1D_H #define ARM_COMPUTE_NEFFT1D_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/runtime/FunctionDescriptors.h" +#include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/Tensor.h" @@ -40,7 +39,7 @@ class NEFFTDigitReverseKernel; class NEFFTRadixStageKernel; class NEFFTScaleKernel; -/** Basic function to execute one dimensional FFT. This function calls the following Neon kernels: +/** Basic function to execute one dimensional FFT. This function calls the following kernels: * * -# @ref NEFFTDigitReverseKernel Performs digit reverse * -# @ref NEFFTRadixStageKernel A list of FFT kernels depending on the radix decomposition @@ -63,6 +62,14 @@ public: ~NEFFT1D(); /** Initialise the function's source and destinations. * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:------|:------| + * |F32 |F32 | + * * @param[in] input Source tensor. Data types supported: F32. Number of channels supported: 1 (real tensor) or 2 (complex tensor). * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input. * Number of channels supported: 1 (real tensor) or 2 (complex tensor).If @p input is real, @p output must be complex. diff --git a/arm_compute/runtime/NEON/functions/NEFFT2D.h b/arm_compute/runtime/NEON/functions/NEFFT2D.h index 218401b429..cefd3df17a 100644 --- a/arm_compute/runtime/NEON/functions/NEFFT2D.h +++ b/arm_compute/runtime/NEON/functions/NEFFT2D.h @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_NEFFT2D_H #define ARM_COMPUTE_NEFFT2D_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/runtime/FunctionDescriptors.h" +#include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/NEON/functions/NEFFT1D.h" #include "arm_compute/runtime/Tensor.h" @@ -36,7 +35,7 @@ namespace arm_compute // Forward declaration class ITensor; -/** Basic function to execute two dimensional FFT. This function calls the following Neon kernels: +/** Basic function to execute two dimensional FFT. 
This function calls the following kernels: * * -# @ref NEFFT1D 1D FFT is performed on the first given axis * -# @ref NEFFT1D 1D FFT is performed on the second given axis @@ -58,6 +57,14 @@ public: ~NEFFT2D(); /** Initialise the function's source and destinations * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:------|:------| + * |F32 |F32 | + * * @param[in] input Source tensor. Data types supported: F32. * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input. * @param[in] config FFT related configuration diff --git a/arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h index 8967363e7f..84bfe6b02f 100644 --- a/arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_NEFFTCONVOLUTIONLAYER_H #define ARM_COMPUTE_NEFFTCONVOLUTIONLAYER_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h" #include "arm_compute/runtime/NEON/functions/NEFFT2D.h" @@ -43,7 +42,7 @@ namespace arm_compute // Forward declarations class ITensor; -/** Basic function to execute FFT-based convolution on Neon. This function calls the following Neon functions/kernels: +/** Basic function to execute FFT-based convolution on CPU. This function calls the following functions/kernels: * * -# @ref NEPermute Permute input if NHWC(only NCHW is supported). * -# @ref NEPadLayer Pad input. @@ -73,6 +72,14 @@ public: ~NEFFTConvolutionLayer(); /** Set the input and output tensors. * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:------|:------| + * |F32 |F32 | + * * @note: This function only works with any square kernel size and unit strides for both NCHW and NHWC data layout * * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], @@ -84,10 +91,15 @@ public: * Data types supported: Same as @p input. * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. - * @param[in] enable_fast_math (Optional) Enable fast math computation. Unused for Neon backend. + * @param[in] enable_fast_math (Optional) Enable fast math computation. Unused for CPU backend. */ - void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); + void configure(ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false); /** Static function to check if given info will lead to a valid configuration of @ref NEFFTConvolutionLayer * * @note: This function only works with any square kernel size and unit strides for both NCHW and NHWC data layout @@ -101,12 +113,17 @@ public: * Data types supported: Same as @p input. * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. 
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. - * @param[in] enable_fast_math (Optional) Enable fast math computation. Unused for Neon backend. + * @param[in] enable_fast_math (Optional) Enable fast math computation. Unused for CPU backend. * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEFill.h b/arm_compute/runtime/NEON/functions/NEFill.h index ba5d020496..1829c71fef 100644 --- a/arm_compute/runtime/NEON/functions/NEFill.h +++ b/arm_compute/runtime/NEON/functions/NEFill.h @@ -24,10 +24,9 @@ #ifndef ARM_COMPUTE_NEFILL_H #define ARM_COMPUTE_NEFILL_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" #include <memory> @@ -53,6 +52,14 @@ public: NEFill &operator=(NEFill &&); /** Initialize the function * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:------|:------| + * |All |All | + * * @param[in,out] tensor Source tensor. Data types supported: All * @param[in] constant_value Constant value to use to fill tensor. */ diff --git a/arm_compute/runtime/NEON/functions/NEFillBorder.h b/arm_compute/runtime/NEON/functions/NEFillBorder.h index 8a8a0c7dc2..44b1d4a62b 100644 --- a/arm_compute/runtime/NEON/functions/NEFillBorder.h +++ b/arm_compute/runtime/NEON/functions/NEFillBorder.h @@ -27,6 +27,7 @@ #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" + #include <memory> namespace arm_compute @@ -42,6 +43,14 @@ public: NEFillBorder(); /** Initialize the function's source, destination and border_mode. * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |All |All | + * * @note This function fills the borders within the XY-planes. * * @param[in, out] input Source tensor. Data type supported: All @@ -49,7 +58,10 @@ public: * @param[in] border_mode Strategy to use for borders. * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. */ - void configure(ITensor *input, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue()); + void configure(ITensor *input, + unsigned int border_width, + BorderMode border_mode, + const PixelValue &constant_border_value = PixelValue()); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEFlattenLayer.h b/arm_compute/runtime/NEON/functions/NEFlattenLayer.h index 1104aac77f..3e92143824 100644 --- a/arm_compute/runtime/NEON/functions/NEFlattenLayer.h +++ b/arm_compute/runtime/NEON/functions/NEFlattenLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. 
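NEFill and NEFillBorder above are the simplest functions in this batch, so a compact usage sketch may help; the shape and fill value below are arbitrary assumptions:

```cpp
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/runtime/NEON/functions/NEFill.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void fill_example()
{
    Tensor t;
    t.allocator()->init(TensorInfo(TensorShape(8U, 8U), 1, DataType::F32));

    NEFill fill;
    fill.configure(&t, PixelValue(1.0f)); // every element becomes 1.0f

    t.allocator()->allocate();
    fill.run();
}
```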
* * SPDX-License-Identifier: MIT * @@ -26,7 +26,8 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h" + +#include <memory> namespace arm_compute { @@ -37,8 +38,27 @@ class ITensorInfo; class NEFlattenLayer : public IFunction { public: + NEFlattenLayer(); + /** Destructor */ + ~NEFlattenLayer(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEFlattenLayer(const NEFlattenLayer &) = delete; + /** Default move constructor */ + NEFlattenLayer(NEFlattenLayer &&); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEFlattenLayer &operator=(const NEFlattenLayer &) = delete; + /** Default move assignment operator */ + NEFlattenLayer &operator=(NEFlattenLayer &&); /** Initialise the kernel's input and output. * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |All |All | + * * @param[in] input First input tensor to flatten with at least 3 dimensions. The dimensions over the third will be interpreted as batches. Data types supported: All * @param[out] output Output tensor with shape [w*h*d, input_batches] where: * w = width input tensor, h = height input tensor and d = depth input tensor. Data type supported: same as @p input @@ -60,7 +80,8 @@ public: void run() override; private: - NEReshapeLayer _reshape{}; + struct Impl; + std::unique_ptr<Impl> _impl; }; } // namespace arm_compute diff --git a/arm_compute/runtime/NEON/functions/NEFloor.h b/arm_compute/runtime/NEON/functions/NEFloor.h index 9560eb9169..77ac484bab 100644 --- a/arm_compute/runtime/NEON/functions/NEFloor.h +++ b/arm_compute/runtime/NEON/functions/NEFloor.h @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_NEFLOOR_H #define ARM_COMPUTE_NEFLOOR_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" #include <memory> @@ -54,6 +53,15 @@ public: ~NEFloor(); /** Set the source, destination of the kernel * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:------|:------| + * |F32 |F32 | + * |F16 |F16 | + * * @param[in] input Source tensor. Data type supported: F16/F32. * @param[out] output Destination tensor. Same as @p input */ diff --git a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h index ffea02670f..885f8430cf 100644 --- a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h +++ b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2023 Arm Limited. 
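The NEFlattenLayer diff above shows the refactor that repeats throughout this patch: concrete member objects give way to a forward-declared Impl held through std::unique_ptr, with run() forwarding to a stateless operator. A hedged sketch of the typical shape of that Impl, assuming the internal operator is cpu::CpuFlatten (the operator name, member layout, and include path are assumptions; none of this appears in the header itself):

```cpp
// Illustrative sketch only; in practice this lives in the .cpp file, not the header.
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/experimental/Types.h" // TensorType::ACL_SRC / ACL_DST
#include "arm_compute/runtime/NEON/functions/NEFlattenLayer.h"
#include "src/cpu/operators/CpuFlatten.h" // internal operator; path is an assumption

namespace arm_compute
{
struct NEFlattenLayer::Impl
{
    const ITensor                   *src{nullptr};
    ITensor                         *dst{nullptr};
    std::unique_ptr<cpu::CpuFlatten> op{nullptr};
};

void NEFlattenLayer::run()
{
    // Repackage the stored tensors and hand them to the stateless CPU operator.
    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC, _impl->src);
    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
    _impl->op->run(pack);
}
} // namespace arm_compute
```

Hiding the members this way keeps internal kernels and operators out of the public headers, which is why so many includes and forward declarations disappear in this patch.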
* * SPDX-License-Identifier: MIT * @@ -24,56 +24,20 @@ #ifndef ARM_COMPUTE_NEFULLYCONNECTEDLAYER_H #define ARM_COMPUTE_NEFULLYCONNECTEDLAYER_H +#include "arm_compute/function_info/FullyConnectedLayerInfo.h" #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" - -#include "arm_compute/runtime/MemoryGroup.h" -#include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h" -#include "arm_compute/runtime/NEON/functions/NEFlattenLayer.h" -#include "arm_compute/runtime/NEON/functions/NEGEMM.h" -#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" +#include "arm_compute/runtime/IMemoryManager.h" +#include "arm_compute/runtime/IWeightsManager.h" +#include "arm_compute/runtime/NEON/functions/NETranspose.h" #include "arm_compute/runtime/Tensor.h" +#include <memory> + namespace arm_compute { -/** Basic function to reshape the weights of Fully Connected layer with Neon. This function calls the following kernels: - * - * @note The fully connected layer accepts "weights" tensors only with 2 dimensions. - */ -class NEFullyConnectedLayerReshapeWeights : public INESimpleFunctionNoBorder -{ -public: - /** Constructor */ - NEFullyConnectedLayerReshapeWeights() = default; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEFullyConnectedLayerReshapeWeights(const NEFullyConnectedLayerReshapeWeights &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEFullyConnectedLayerReshapeWeights &operator=(const NEFullyConnectedLayerReshapeWeights &) = delete; - /** Prevent instances of this class from being moved (As this class contains non movable objects) */ - NEFullyConnectedLayerReshapeWeights(NEFullyConnectedLayerReshapeWeights &&) = delete; - /** Prevent instances of this class from being moved (As this class contains non movable objects) */ - NEFullyConnectedLayerReshapeWeights &operator=(NEFullyConnectedLayerReshapeWeights &&) = delete; - /** Default destructor */ - ~NEFullyConnectedLayerReshapeWeights() = default; - /** Set the input and output tensors. - * - * @param[in] input Weights tensor. The weights must be 2 dimensional. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[out] output Destination tensor. Data type supported: Same as @p input. - */ - void configure(const ITensor *input, ITensor *output); - /** Static function to check if given info will lead to a valid configuration of @ref NEFullyConnectedLayerReshapeWeights - * - * @param[in] input Weights tensor info. The weights must be 2 dimensional. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[in] output Destination tensor info. Data type supported: Same as @p input. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output); -}; - namespace weights_transformations { -/** Basic function to manage the reshape weights generated from @ref NEFullyConnectedLayerReshapeWeights */ +/** Basic function to manage the reshape weights generated from @ref NETranspose */ class NEFullyConnectedLayerReshapeWeightsManaged : public ITransformWeights { public: @@ -105,17 +69,17 @@ public: } private: - static constexpr uint32_t _uid = 0x0; - Tensor _output{}; - NEFullyConnectedLayerReshapeWeights _func{}; + static constexpr uint32_t _uid = 0x0; + Tensor _output{}; + NETranspose _func{}; }; } // namespace weights_transformations -/** Basic function to compute a Fully Connected layer on Neon. 
This function calls the following Neon kernels: - * -# @ref NEIm2ColKernel (called when the input comes from a convolutional layer) - * -# @ref NEFullyConnectedLayerReshapeWeights (if @p are_weights_reshaped is set to false and transpose_weights is set to true ) (called once) - * -# @ref NEGEMMMatrixMultiplyKernel or @ref NEGEMMLowpMatrixMultiplyCore (if quantized asymmetric) - * -# @ref NEGEMMMatrixAdditionKernel or @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint (if quantized asymmetric) (if @p biases is not equal to nullptr) +/** Basic function to compute a Fully Connected layer. This function calls the following kernels: + * -# @ref cpu::kernels::CpuIm2ColKernel (called when the input comes from a convolutional layer) + * -# @ref NETranspose (if @p are_weights_reshaped is set to false and transpose_weights is set to true ) (called once) + * -# @ref NEGEMM or @ref NEGEMMLowpMatrixMultiplyCore (if quantized asymmetric) + * -# @ref cpu::kernels::CpuGemmMatrixAdditionKernel or @ref NEGEMMLowpOutputStage (if quantized asymmetric) (if @p biases is not equal to nullptr) * * @note The fully connected layer accepts "weights" tensors only with 2 dimensions. */ @@ -123,7 +87,8 @@ class NEFullyConnectedLayer : public IFunction { public: /** Constructor */ - NEFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr, IWeightsManager *weights_manager = nullptr); + NEFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr, + IWeightsManager *weights_manager = nullptr); /** Prevent instances of this class from being copied (As this class contains pointers) */ NEFullyConnectedLayer(const NEFullyConnectedLayer &) = delete; /** Prevent instances of this class from being moved (As this class contains pointers) */ @@ -136,66 +101,77 @@ public: ~NEFullyConnectedLayer(); /** Set the input and output tensors. * - * @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[in] weights Weights tensor. The weights must be 2 dimensional. - * If this function is called after a Convolution Layer, the (transposed) weights will have as many rows as the product of the first 3 input's dimensions. - * If it is called after another FullyConnected Layer, the (transposed) weights will have as many rows as the input's first dimension. - * Data type supported: Same as @p input. - * @param[in] biases Bias tensor. Can be nullptr. Data type supported: Same as @p weights, S32 if @p weights is QASYMM8/QASYMM8_SIGNED. - * @param[out] output Destination tensor. Its shape should be equal to the output of a matrix multiplication between: - * - The output of im2col on the input and the (transposed) 2D weights, if the function is called after a Convolution Layer - * - The input tensor and the (transposed) 2D weights, if the function is called after another FullyConnected Layer. - * Data type supported: Same as @p input. - * @param[in] fc_info (Optional) Fully connected layer additional info + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:------------------|:------|:--------------| + * |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 | + * |QASYMM8 |QASYMM8 |S32 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | + * + * @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] weights Weights tensor. The weights must be 2 dimensional. 
+ * If this function is called after a Convolution Layer, the (transposed) weights will have as many rows as the product of the first 3 input's dimensions. + * If it is called after another FullyConnected Layer, the (transposed) weights will have as many rows as the input's first dimension. + * Data type supported: Same as @p input. + * @param[in] biases Bias tensor. Can be nullptr. Data type supported: Same as @p weights, S32 if @p weights is QASYMM8/QASYMM8_SIGNED. + * @param[out] output Destination tensor. Its shape should be equal to the output of a matrix multiplication between: + * - The output of im2col on the input and the (transposed) 2D weights, if the function is called after a Convolution Layer + * - The input tensor and the (transposed) 2D weights, if the function is called after another FullyConnected Layer. + * Data type supported: Same as @p input. + * @param[in] fc_info (Optional) Fully connected layer additional info + * @param[in] weights_info (Optional) Stores necessary compute information when weights are already reshaped */ - void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, - FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); + void configure(const ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(), + const WeightsInfo &weights_info = WeightsInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEFullyConnectedLayer * - * @param[in] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[in] weights Weights tensor info. The weights must be 2 dimensional. - * If this function is called after a Convolution Layer, the (transposed) weights will have as many rows as the product of the first 3 input's dimensions. - * If it is called after another FullyConnected Layer, the (transposed) weights will have as many rows as the input's first dimension. - * Data type supported: Same as @p input. - * @param[in] biases Bias tensor. Can be nullptr. Data type supported: Same as @p weights, S32 if @p weights is QASYMM8/QASYMM8_SIGNED. - * @param[in] output Destination tensor info. Its shape should be equal to the output of a matrix multiplication between: - * - The output of im2col on the input and the (transposed) 2D weights, if the function is called after a Convolution Layer - * - The input tensor and the (transposed) 2D weights, if the function is called after another FullyConnected Layer. - * Data type supported: Same as @p input. - * @param[in] fc_info (Optional) Fully connected layer additional info + * Similar to @ref NEFullyConnectedLayer::configure() * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, - FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(), + const WeightsInfo &weights_info = WeightsInfo()); + + /** Static function that queries whether a fixed-format kernel exists for a given problem description + * + * @param[out] expected_weight_format Format in which the weights should be provided for the fixed-format kernel found + * @param[in] input Source tensor + * @param[in] weights Weights tensor. + * @param[in] biases Bias tensor.
Can be nullptr. Data type supported: Same as @p weights, S32 if @p weights is QASYMM8/QASYMM8_SIGNED. + * @param[in] output Destination tensor + * @param[in] fc_info Fully connected layer additional info + * @param[in] weights_info Describes weights shape + * + * @return a status + */ + static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const FullyConnectedLayerInfo &fc_info, + const WeightsInfo &weights_info); // Inherited methods overridden: void run() override; void prepare() override; private: - void configure_fc_fc(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const ActivationLayerInfo &act); - void configure_conv_fc(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const ActivationLayerInfo &act); - void configure_mm(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const ActivationLayerInfo &act); - - MemoryGroup _memory_group; - IWeightsManager *_weights_manager; - NEFlattenLayer _flatten; - NEConvertFullyConnectedWeights _convert_weights; - weights_transformations::NEConvertFullyConnectedWeightsManaged _convert_weights_managed; - NEFullyConnectedLayerReshapeWeights _reshape_weights_function; - weights_transformations::NEFullyConnectedLayerReshapeWeightsManaged _reshape_weights_managed_function; - NEGEMM _mm_gemm; - NEGEMMLowpMatrixMultiplyCore _mm_gemmlowp; - Tensor _flatten_output; - Tensor _converted_weights_output; - Tensor _reshape_weights_output; - const ITensor *_original_weights; - bool _are_weights_converted; - bool _are_weights_reshaped; - bool _is_fc_after_conv; - bool _is_quantized_asymmetric; - bool _is_prepared; + struct Impl; + std::unique_ptr<Impl> _impl; }; } // namespace arm_compute #endif /* ARM_COMPUTE_NEFULLYCONNECTEDLAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h b/arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h index 5dc804e240..f53b3de7f6 100644 --- a/arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h +++ b/arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -52,6 +52,16 @@ public: ~NEFuseBatchNormalization(); /** Set the input and output tensors. * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |F32 |F32 | + * |F16 |F16 | + * * @param[in] input_weights Input weights tensor for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC * @param[in] bn_mean Batch normalization layer mean tensor. Same as @p input_weights * @param[in] bn_var Batch normalization layer variance tensor. Same as @p input_weights @@ -65,9 +75,16 @@ public: * @param[in] epsilon (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f. * @param[in] fbn_type (Optional) Fused batch normalization type. Defaults to Convolution.
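To make the parameter list above concrete, a minimal sketch of fusing batch-normalization statistics into a set of convolution weights; the shapes are illustrative assumptions:

```cpp
#include "arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void fuse_bn_example()
{
    // 16 filters of 3x3x3 ([kernel_x, kernel_y, IFM, OFM]) plus per-OFM BN statistics.
    Tensor conv_w, bn_mean, bn_var, fused_w, fused_b;
    conv_w.allocator()->init(TensorInfo(TensorShape(3U, 3U, 3U, 16U), 1, DataType::F32));
    bn_mean.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));
    bn_var.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));
    fused_w.allocator()->init(TensorInfo(TensorShape(3U, 3U, 3U, 16U), 1, DataType::F32));
    fused_b.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));

    NEFuseBatchNormalization fuse;
    // No original bias and no beta/gamma here, so the nullptr defaults apply.
    fuse.configure(&conv_w, &bn_mean, &bn_var, &fused_w, &fused_b);

    for (Tensor *t : {&conv_w, &bn_mean, &bn_var, &fused_w, &fused_b})
    {
        t->allocator()->allocate();
    }
    // ... import or fill conv_w / bn_mean / bn_var ...
    fuse.run();
}
```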
*/ - void configure(const ITensor *input_weights, const ITensor *bn_mean, const ITensor *bn_var, ITensor *fused_weights, ITensor *fused_bias, - const ITensor *input_bias = nullptr, const ITensor *bn_beta = nullptr, const ITensor *bn_gamma = nullptr, - float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); + void configure(const ITensor *input_weights, + const ITensor *bn_mean, + const ITensor *bn_var, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *input_bias = nullptr, + const ITensor *bn_beta = nullptr, + const ITensor *bn_gamma = nullptr, + float epsilon = 0.001f, + FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); /** Static function to check if given info will lead to a valid configuration of @ref NEFuseBatchNormalization * * @param[in] input_weights Input weights tensor info for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC @@ -85,10 +102,16 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var, - const ITensorInfo *fused_weights, const ITensorInfo *fused_bias, - const ITensorInfo *input_bias = nullptr, const ITensorInfo *bn_beta = nullptr, const ITensorInfo *bn_gamma = nullptr, - float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); + static Status validate(const ITensorInfo *input_weights, + const ITensorInfo *bn_mean, + const ITensorInfo *bn_var, + const ITensorInfo *fused_weights, + const ITensorInfo *fused_bias, + const ITensorInfo *input_bias = nullptr, + const ITensorInfo *bn_beta = nullptr, + const ITensorInfo *bn_gamma = nullptr, + float epsilon = 0.001f, + FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEGEMM.h b/arm_compute/runtime/NEON/functions/NEGEMM.h index 6f7951eece..29650a5eca 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMM.h +++ b/arm_compute/runtime/NEON/functions/NEGEMM.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,40 +24,18 @@ #ifndef ARM_COMPUTE_NEGEMM_H #define ARM_COMPUTE_NEGEMM_H +#include "arm_compute/function_info/GEMMInfo.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/IWeightsManager.h" -#include "arm_compute/runtime/MemoryGroup.h" -#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" -#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h" -#include "arm_compute/runtime/Tensor.h" #include <memory> namespace arm_compute { -// Forward declarations -class NEGEMMInterleave4x4Kernel; -class NEGEMMMatrixAdditionKernel; -class NEGEMMMatrixMultiplyKernel; -class NEGEMMTranspose1xWKernel; -class NEGEMMAssemblyDispatch; - -/** Basic function to execute GEMM on Neon. 
This function calls the following Neon kernels: - * - * If optimized assembly is available: - * -# @ref NEGEMMAssemblyDispatch - * -# @ref NEActivationLayer (if alpha != 1.0) - * Else: - * -# @ref NEGEMMInterleave4x4Kernel (if the output tensor is a matrix) - * -# @ref NEGEMMTranspose1xWKernel (if the output tensor is a matrix) - * -# @ref NEGEMMMatrixMultiplyKernel - * In both cases: - * -# @ref NEGEMMMatrixAdditionKernel (if c != nullptr and beta != 0.0 and is not reshaped once) - * Else: - * -# @ref NEArithmeticAddition (if c != nullptr and is reshaped once and not optimized assembly in place) +/** Basic function to execute GEMM. This function calls the following kernels: * - * -# @ref NEActivationLayer (if activation is specified in GEMMInfo) + * -# @ref cpu::CpuGemm */ class NEGEMM : public IFunction { @@ -76,9 +54,21 @@ public: ~NEGEMM(); /** Initialise the kernel's inputs, output * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:------------|:-----------|:---------|:--------------| + * |F32 |F32 |F32 |F32 | + * |F16 |F16 |F16 |F16 | + * |BFLOAT16 |BFLOAT16 |BFLOAT16 |BFLOAT16 | + * * @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C]. * @note GEMM: The tensors a, b, c, d must have the same data type. You should not mix data types when calling this function. * + * @note Batched GEMM only supports broadcasting cases where RHS rank < LHS rank but not the other way around + * * @param[in] a First input tensor (Matrix A or Vector A). Data type supported: BFLOAT16/F16/F32 * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a * @param[in] c Third input tensor (Matrix C). It can be a nullptr if just the multiplication between @p a and @p b is needed. Data type supported: same as @p a @@ -88,49 +78,49 @@ public: * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and * if the reshape of matrix B should happen only for the first run */ - void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo()); + void configure(const ITensor *a, + const ITensor *b, + const ITensor *c, + ITensor *d, + float alpha, + float beta, + const GEMMInfo &gemm_info = GEMMInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEGEMM. * - * @param[in] a First input tensor info (Matrix or Vector A). Data types supported: BFLOAT16/F16/F32 - * @param[in] b Second input tensor info (Matrix B). Data type supported: same as @p a. - * @param[in] c Third input tensor info (Matrix C). It can be a nullptr if just the multiplication between @p a and @p b is needed. Data type supported: same as @p a. - * @param[out] output Output tensor info. 
Data type supported: same as @p a - * @param[in] alpha Weight of the matrix product - * @param[in] beta Weight of matrix C - * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and - * if the reshape of matrix B should happen only for the first run + * Similar to @ref NEGEMM::configure() * * @return a status */ - static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo()); + static Status validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info = GEMMInfo()); + + /** Static function that queries whether a fixed-format kernel exists; if one does, it returns in the first argument the format in which + * weights are expected to be reshaped, as defined by the WeightFormat class. Apart from the first argument, the rest of the arguments are the same + * as in @ref NEGEMM::validate(), except that all arguments are required. + * + * @return a status + */ + static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info = GEMMInfo()); // Inherited methods overridden: void run() override; void prepare() override; private: - MemoryGroup _memory_group; - IWeightsManager *_weights_manager; - std::unique_ptr<NEGEMMInterleave4x4Kernel> _interleave_kernel; - std::unique_ptr<NEGEMMTranspose1xWKernel> _transpose_kernel; - std::unique_ptr<NEGEMMMatrixMultiplyKernel> _mm_kernel; - std::unique_ptr<NEGEMMAssemblyDispatch> _asm_glue; - std::unique_ptr<NEGEMMMatrixAdditionKernel> _ma_kernel; - NEActivationLayer _alpha_scale_func; - NEArithmeticAddition _add_bias; - NEActivationLayer _activation_func; - - Tensor _tmp_a; - Tensor _tmp_b; - Tensor _tmp_d; - const ITensor *_original_b; - bool _run_vector_matrix_multiplication; - bool _run_alpha_scale; - bool _run_addition; - bool _run_bias_addition; - bool _run_activation; - bool _reshape_b_only_on_first_run; - bool _is_prepared; + struct Impl; + std::unique_ptr<Impl> _impl; }; } // namespace arm_compute #endif /*ARM_COMPUTE_NEGEMM_H */ diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h b/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h index 2bd233f520..d1c5a1c9b3 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h +++ b/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h @@ -27,22 +27,20 @@ #include "arm_compute/runtime/FunctionDescriptors.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" -#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" -#include "arm_compute/runtime/NEON/functions/NEPermute.h" -#include "arm_compute/runtime/Tensor.h" #include <memory> + namespace arm_compute { // Forward declarations class ITensor; -class NEGEMMAssemblyDispatch; +class ITensorInfo; -/** Basic function to compute the convolution layer. This function calls the following Neon kernels/functions: +/** Basic function to compute the convolution layer.
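Closing out the NEGEMM section above, a minimal configure/run sketch for D = alpha * A * B + beta * C; note that ACL's TensorShape takes the width (number of columns) first, so an M x K matrix is declared as TensorShape(K, M). Shapes are arbitrary assumptions:

```cpp
#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void gemm_example()
{
    constexpr unsigned int M = 4, N = 8, K = 16;

    Tensor a, b, d;
    a.allocator()->init(TensorInfo(TensorShape(K, M), 1, DataType::F32)); // A: M x K
    b.allocator()->init(TensorInfo(TensorShape(N, K), 1, DataType::F32)); // B: K x N
    d.allocator()->init(TensorInfo(TensorShape(N, M), 1, DataType::F32)); // D: M x N

    NEGEMM gemm;
    // No C matrix, plain product: alpha = 1, beta = 0.
    gemm.configure(&a, &b, nullptr, &d, 1.0f, 0.0f);

    a.allocator()->allocate();
    b.allocator()->allocate();
    d.allocator()->allocate();
    // ... fill a and b ...
    gemm.run(); // prepare() is triggered internally on the first run()
}
```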
This function calls the following kernels/functions: * * Supports only NHWC data layout * - * -# @ref NEGEMMAssemblyDispatch + * -# @ref cpu::CpuGemmAssemblyDispatch * -# @ref NEActivationLayer, in case activation cannot be fused in the assembly dispatch * * Weights are transformed from OHWI to HWIO format using the following kernels: @@ -65,6 +63,18 @@ public: ~NEGEMMConv2d(); /** Set the input and output tensors. * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:--------------|:--------------|:--------------| + * |QASYMM8 |QASYMM8 |S32 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | + * |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 | + * |BFLOAT16 |BFLOAT16 |BFLOAT16 |BFLOAT16 | + * * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], * while every optional dimension from 4 and above represent a batch of inputs. * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32. @@ -76,7 +86,8 @@ public: * Data types supported: Same as @p input. * @param[in] info Convolution layer descriptor */ - void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv2dInfo &info); + void + configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv2dInfo &info); /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConv2d * * @param[in] input Source tensor info. 3 lower dimensions represent a single input [width, height, IFM], @@ -92,20 +103,19 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const Conv2dInfo &info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const Conv2dInfo &info); // Inherited methods overridden: void run() override; void prepare() override; private: - std::unique_ptr<NEGEMMAssemblyDispatch> _gemm_asm_func; - NEActivationLayer _activation_func; - NEPermute _weights_permute_func; - const ITensor *_original_weights; - Tensor _permuted_weights; - bool _is_prepared; - bool _run_activation; + struct Impl; + std::unique_ptr<Impl> _impl; }; } // namespace arm_compute #endif /* ARM_COMPUTE_NEGEMMCONV2D_H */ diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h index 18ccc9f015..3e84c3e2cf 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -24,147 +24,31 @@ #ifndef ARM_COMPUTE_NEGEMMCONVOLUTIONLAYER_H #define ARM_COMPUTE_NEGEMMCONVOLUTIONLAYER_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/function_info/ActivationLayerInfo.h" +#include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/IWeightsManager.h" #include "arm_compute/runtime/MemoryGroup.h" -#include "arm_compute/runtime/NEON/functions/NEGEMM.h" -#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" -#include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h" -#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h" -#include "arm_compute/runtime/Tensor.h" #include <memory> namespace arm_compute { class ITensor; -class NECol2ImKernel; -class NEIm2ColKernel; -class NEWeightsReshapeKernel; - -/** Function to reshape the weights. This function calls the following kernel: - * -# @ref NEWeightsReshapeKernel - */ -class NEConvolutionLayerReshapeWeights : public IFunction -{ -public: - /** Constructor */ - NEConvolutionLayerReshapeWeights(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEConvolutionLayerReshapeWeights(const NEConvolutionLayerReshapeWeights &) = delete; - /** Prevent instances of this class from being moved (As this class contains non movable objects) */ - NEConvolutionLayerReshapeWeights(NEConvolutionLayerReshapeWeights &&) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEConvolutionLayerReshapeWeights &operator=(const NEConvolutionLayerReshapeWeights &) = delete; - /** Prevent instances of this class from being moved (As this class contains non movable objects) */ - NEConvolutionLayerReshapeWeights &operator=(NEConvolutionLayerReshapeWeights &&) = delete; - /** Default destructor */ - ~NEConvolutionLayerReshapeWeights(); - /** Set the input and output tensors. - * - * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. - * Data type supported: All. - * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. - * Data type supported: same as @p weights. - * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types. - * @param[out] output Destination tensor. Data types supported: same as @p weights. - */ - void configure(const ITensor *weights, const ITensor *biases, ITensor *output); - /** Static function to check if given info will lead to a valid configuration of @ref NEConvolutionLayerReshapeWeights - * - * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. - * Data type supported: All. - * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. - * Data type supported: same as @p weights. - * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types. - * @param[in] output Destination tensor. Data types supported: same as @p weights. 
- * - * @return an error status - */ - static Status validate(const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output); - - // Inherited methods overridden: - void run() override; - -private: - std::unique_ptr<NEWeightsReshapeKernel> _weights_reshape_kernel; -}; - -namespace weights_transformations -{ -/** Basic function to manage the reshape weights generated from @ref NEConvolutionLayerReshapeWeights */ -class NEConvolutionLayerReshapeWeightsTransform : public ITransformWeights -{ -public: - /** Constructor */ - NEConvolutionLayerReshapeWeightsTransform() = default; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEConvolutionLayerReshapeWeightsTransform(const NEConvolutionLayerReshapeWeightsTransform &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEConvolutionLayerReshapeWeightsTransform &operator=(const NEConvolutionLayerReshapeWeightsTransform &) = delete; - /** Prevent instances of this class from being moved (As this class contains non movable objects) */ - NEConvolutionLayerReshapeWeightsTransform(NEConvolutionLayerReshapeWeightsTransform &&) = delete; - /** Prevent instances of this class from being moved (As this class contains non movable objects) */ - NEConvolutionLayerReshapeWeightsTransform &operator=(NEConvolutionLayerReshapeWeightsTransform &&) = delete; - /** Default destructor */ - ~NEConvolutionLayerReshapeWeightsTransform() = default; - void configure(const ITensor *input, const ITensor *biases) - { - _bias_bit = (biases != nullptr) ? 1 : 0; - _func.configure(input, biases, &_output); - } - - void run() override - { - _output.allocator()->allocate(); - _func.run(); - _reshape_run = true; - } - - ITensor *get_weights() override - { - return &_output; - } - - void release() override - { - _output.allocator()->free(); - } +class ITensorInfo; - uint32_t uid() override - { - return ((0x8) | (_bias_bit << 7)); - } - - bool is_reshape_run() - { - return _reshape_run; - } - -private: - Tensor _output{}; - NEConvolutionLayerReshapeWeights _func{}; - int32_t _bias_bit{ 0 }; -}; -} // namespace weights_transformations - -/** Basic function to compute the convolution layer. This function calls the following Neon kernels/functions: +/** Basic function to compute the convolution layer. 
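Before the parameter documentation that follows, a minimal sketch of driving this convolution function; the shapes (a 32x32 3-channel NCHW input, 16 filters of 3x3, stride 1, padding 1) are illustrative assumptions:

```cpp
#include "arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void conv_example()
{
    Tensor src, weights, biases, dst;
    src.allocator()->init(TensorInfo(TensorShape(32U, 32U, 3U), 1, DataType::F32));        // [W, H, IFM]
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 3U, 16U), 1, DataType::F32)); // [kx, ky, IFM, OFM]
    biases.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));              // [OFM]
    dst.allocator()->init(TensorInfo(TensorShape(32U, 32U, 16U), 1, DataType::F32));       // [W, H, OFM]

    NEGEMMConvolutionLayer conv;
    conv.configure(&src, &weights, &biases, &dst, PadStrideInfo(1, 1, 1, 1)); // stride 1, pad 1

    for (Tensor *t : {&src, &weights, &biases, &dst})
    {
        t->allocator()->allocate();
    }
    // ... fill src, weights, biases ...
    conv.run(); // the first run() triggers prepare(), which reshapes the weights once
}
```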
This function calls the following kernels/functions: * - * -# @ref NEIm2ColKernel - * -# @ref NEGEMM (if the data type is BFLOAT16/FP16/FP32) - * -# @ref NEGEMMLowpMatrixMultiplyCore (if the data type is QASYMM8/QASYMM8_SIGNED) - * -# @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint (if the data type is QASYMM8/QASYMM8_SIGNED) - * -# @ref NEArithmeticAddition (if biases != nullptr and we have a 1x1 convolution with the NHWC data layout) - * -# @ref NECol2ImKernel (if NCHW data layout) + * -# @ref cpu::CpuGemmConv2d * */ class NEGEMMConvolutionLayer : public IFunction { public: /** Constructor */ - NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager = nullptr, IWeightsManager *weights_manager = nullptr); + NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager = nullptr, + IWeightsManager *weights_manager = nullptr); /** Prevent instances of this class from being copied (As this class contains pointers) */ NEGEMMConvolutionLayer(const NEGEMMConvolutionLayer &) = delete; /** Prevent instances of this class from being moved (As this class contains non movable objects) */ @@ -177,118 +61,154 @@ public: ~NEGEMMConvolutionLayer(); /** Set the input and output tensors. * - * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. - * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32. - * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. - * Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32. - * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. - * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type. - * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. - * Data types supported: Same as @p input. - * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. - * @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights - * tensor has also been transposed with NEGEMMTranspose1xWKernel. Data type supported: Same as @p input. - * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. - * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:------------------|:--------|:--------------| + * |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 | + * |BFLOAT16 |BFLOAT16 |BFLOAT16 |BFLOAT16 | + * |QASYMM8 |QASYMM8 |S32 |QASYMM8 | + * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | + * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED | + * + * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. 
+ * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32. + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. + * Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32. + * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. + * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type. + * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. + * Data types supported: Same as @p input. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + * @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights + * tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p input. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. + * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation + * available which may introduce a drop of accuracy as well. Default is false + * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported */ - void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(), - const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), unsigned int num_groups = 1); + void configure(const ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false, + unsigned int num_groups = 1); /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConvolutionLayer * - * @param[in] input Source tensor info. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. - * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32. - * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. - * Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32. - * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. - * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type. - * @param[in] output Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. - * Data types supported: Same as @p input. - * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. 
- * @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights - * tensor has also been transposed with NEGEMMTranspose1xWKernel. Data type supported: Same as @p input. - * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. - * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported + * @param[in] input Source tensor info. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. + * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32. + * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. + * Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32. + * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. + * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type. + * @param[in] output Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. + * Data types supported: Same as @p input. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + * @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights + * tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p input. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. + * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation + * available which may introduce a drop of accuracy as well. Default is false + * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. 
num_groups != 1 is not supported * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), unsigned int num_groups = 1); - - // Inherited methods overridden: - void run() override; - void prepare() override; - -private: - /** Configures the appropriate matrix multiply routine + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false, + unsigned int num_groups = 1); + + /** Static function to check if there is an optimized version of + * GEMM available for the input parameters. * - * @param[in] input Input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32. - * @param[in] weights Weights tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32. - * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. - * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type. - * @param[out] output Output tensor. Data types supported: Same as @p input, - * except for input of QASYMM8/QASYMM8_SIGNED type where output should be of S32 type. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. - * @param[in] gemm_3d_depth (Optional) Depth of GEMM 3D (Defaults to 1) - */ - void configure_mm(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo(), int gemm_3d_depth = 1); - /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConvolutionLayer matrix multiply routines + * The method is intended to be used to find out the optimal + * memory layout to be used for the weights tensor when running + * variable weights execution. * - * @param[in] input Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32. - * @param[in] weights Weights tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32. - * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. - * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type. - * @param[in] output Output tensor info. Data types supported: Same as @p input, - * except for input of QASYMM8/QASYMM8_SIGNED type where output should be of S32 type. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. - * @param[in] gemm_3d_depth (Optional) Depth of GEMM 3D (Defaults to 1) - * @param[in] skip_im2col (Optional) Flag which specifies if im2col has to be skipped. i.e. 1x1 convolution with NHWC data layout. 
(Default to false) + * The user can query the database of optimised kernels in + * arm_gemm by specifying one of the enumerations of + * arm_compute::WeightFormat in the weight_format field of the input + * parameter weights_info. In case of success, the method + * writes the expected format in the output parameter + * expected_weight_format. The expected_weight_format can then be + * used in the configure method of the class for retrieving the + * optimal kernel. * - * @return a status - */ - static Status validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo(), - int gemm_3d_depth = 1, bool skip_im2col = false); - /** Static function to check if GEMM3D is supported in @ref NEGEMM or in @ref NEGEMMLowpMatrixMultiplyCore + * Use case one - query for a specific format: * - * @param[in] input_info Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32. - * @param[in] weights_info Weights tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32. - * @param[in] act_info Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. - * @param[in] gemm_3d_depth Depth of GEMM 3D - * @param[in] skip_im2col Flag which specifies if im2col has to be skipped. i.e. 1x1 convolution with NHWC data layout + * WeightsInfo weights_info(..., arm_compute::WeightFormat::OHWIo4, ...); // Set the value of the input query. + * if (NEGEMMConvolutionLayer::has_opt_impl(WeightFormat(), ...., weights_info, ...)) + * { + * auto conv = std::make_unique<NEGEMMConvolutionLayer>(); + * conv->configure(..., weights_info, ...); // uses the same WeightFormat the user wanted originally, OHWIo4. + * conv->run(...); + * } * - * @return a status + * Use case two - query for any format that would be optimal for the GEMM to execute: + * + * WeightsInfo weights_info(..., arm_compute::WeightFormat::ANY, ...); // Set the value of the input query. + * arm_compute::WeightFormat expected_wf; + * if (NEGEMMConvolutionLayer::has_opt_impl(expected_wf, ...., weights_info, ...)) + * { + * auto conv = std::make_unique<NEGEMMConvolutionLayer>(); + * // ... code to convert the layout of the weights tensor to the layout returned by has_opt_impl + * WeightsInfo new_weights_info(..., expected_wf, ...); // Set the value of the WeightFormat returned by has_opt_impl. + * conv->configure(..., new_weights_info, ...); + * conv->run(...); + * } + * + * Notice that a GEMM configured with a WeightFormat other than + * UNSPECIFIED will run GEMM with variable weights mode. + * + * @param[out] expected_weight_format The arm_compute::WeightFormat expected by the kernel. + * @param[in] src Source tensor info. + * @param[in] weights Weights tensor info. + * @param[in] biases Biases tensor info. Shared biases supported. + * @param[in] dst Destination tensor info. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + * @param[in] weights_info (Optional) Specifies additional configuration parameters for the weights of the GEMM computation. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. No activation (i.e. Linear), which is the default value, is also supported.
+ * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation + * available which may introduce a drop of accuracy as well. Default is false + * + * @return a Status */ - static Status validate_gemm3d(const ITensorInfo *input_info, const ITensorInfo *weights_info, const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col); + static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false); + // Inherited methods overridden: void run() override; void prepare() override; private: - MemoryGroup _memory_group; - IWeightsManager *_weights_manager; - NEConvolutionLayerReshapeWeights _reshape_weights; - weights_transformations::NEConvolutionLayerReshapeWeightsTransform _reshape_weights_managed; - std::unique_ptr<NEIm2ColKernel> _im2col_kernel; - NEGEMM _mm_gemm; - NEGEMMLowpMatrixMultiplyCore _mm_gemmlowp; - std::unique_ptr<NECol2ImKernel> _col2im_kernel; - NEReshapeLayer _reshape_layer; - - const ITensor *_original_weights; - const ITensor *_original_output; - - Tensor _im2col_output; - Tensor _weights_reshaped; - Tensor _gemm_output; - Tensor _gemm_output_3d; - Tensor _tmp_output; - - DataLayout _data_layout; - - bool _skip_im2col; - bool _skip_col2im; - bool _is_quantized; - bool _is_prepared; + struct Impl; + std::unique_ptr<Impl> _impl; }; } // namespace arm_compute -#endif /* ARM_COMPUTE_NECONVOLUTIONGEMMLAYER_H */ +#endif /* ARM_COMPUTE_NEGEMMCONVOLUTIONLAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h index 821b498dad..6d07675d3d 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h +++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2023-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,50 +21,34 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H -#define ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H +#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEGEMMLOWPMATRIXMULTIPLYCORE_H +#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEGEMMLOWPMATRIXMULTIPLYCORE_H -#include "NEActivationLayer.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/function_info/GEMMInfo.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/IWeightsManager.h" -#include "arm_compute/runtime/MemoryGroup.h" -#include "arm_compute/runtime/Tensor.h" #include <memory> namespace arm_compute { class ITensor; -class NEConvertQuantizedSignednessKernel; -class NEConvertQuantizedSignednessKernel; -class NEGEMMInterleave4x4Kernel; -class NEGEMMLowpMatrixMultiplyKernel; -class NEGEMMLowpOffsetContributionKernel; -class NEGEMMLowpOffsetContributionOutputStageKernel; -class NEGEMMLowpMatrixAReductionKernel; -class NEGEMMLowpMatrixBReductionKernel; -class NEGEMMTranspose1xWKernel; -class NEGEMMAssemblyDispatch; +class ITensorInfo; -/** Basic function to execute GEMMLowpMatrixMultiplyCore on Neon.
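A minimal sketch of the quantized GEMM function documented next, with QASYMM8 inputs and a raw S32 result (GEMMLowpOutputStageType::NONE, the default GEMMInfo); the quantization parameters are illustrative assumptions:

```cpp
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void gemmlowp_example()
{
    constexpr unsigned int M = 4, N = 8, K = 16;

    Tensor a, b, dst;
    a.allocator()->init(TensorInfo(TensorShape(K, M), 1, DataType::QASYMM8, QuantizationInfo(0.02f, 10)));
    b.allocator()->init(TensorInfo(TensorShape(N, K), 1, DataType::QASYMM8, QuantizationInfo(0.05f, 3)));
    dst.allocator()->init(TensorInfo(TensorShape(N, M), 1, DataType::S32)); // no output stage -> S32

    NEGEMMLowpMatrixMultiplyCore mm;
    mm.configure(&a, &b, nullptr, &dst); // default GEMMInfo: GEMMLowpOutputStageType::NONE

    a.allocator()->allocate();
    b.allocator()->allocate();
    dst.allocator()->allocate();
    // ... fill a and b with quantized data ...
    mm.run();
}
```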
This function calls the following Neon kernels if the DOT product instruction is not available: +/** Function to run Gemm on quantized types. * - * -# @ref NEGEMMInterleave4x4Kernel - * -# @ref NEGEMMTranspose1xWKernel - * -# @ref NEGEMMLowpMatrixMultiplyKernel - * -# @ref NEGEMMLowpOffsetContributionKernel - * -# @ref NEActivationLayer + * This function calls the following: * - * otherwise if the DOT product instruction is available: - * - * -# @ref NEGEMMLowpOffsetContributionKernel - * -*/ + * -# @ref cpu::CpuGemmLowpMatrixMultiplyCore + */ class NEGEMMLowpMatrixMultiplyCore : public IFunction { public: /** Constructor */ - NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager = nullptr, IWeightsManager *weights_manager = nullptr); + NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager = nullptr, + IWeightsManager *weights_manager = nullptr); /** Prevent instances of this class from being copied (As this class contains pointers) */ NEGEMMLowpMatrixMultiplyCore(const NEGEMMLowpMatrixMultiplyCore &) = delete; /** Default move constructor */ @@ -77,6 +61,27 @@ public: ~NEGEMMLowpMatrixMultiplyCore(); /** Initialise the kernel's inputs, output * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:------------------|:--------|:--------------| + * |QASYMM8 |QASYMM8 |S32 |QASYMM8 | + * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 | + * |QASYMM8 |QSYMM8 |S32 |QASYMM8 | + * |QASYMM8 |QASYMM8 |S32 |S32 | + * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |S32 | + * |QASYMM8 |QSYMM8 |S32 |S32 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | + * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED | + * |QASYMM8_SIGNED |QSYMM8 |S32 |QASYMM8_SIGNED | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |S32 | + * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |S32 | + * |QASYMM8_SIGNED |QSYMM8 |S32 |S32 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |F32 |F32 | + * * @note GEMM_LOWP: low precision GEMM kernel * This kernel performs the following computations: * @@ -84,69 +89,36 @@ public: * -# Convert b values from QASYMM8 to int32 add b_offset to each of them. * -# Compute the matrix product of the resulting a * b in int32. * - * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is QASYMM8/QASYMM8_SIGNED otherwise + * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is QASYMM8/QASYMM8_SIGNED/F32 otherwise * * @param[in] a First input tensor (Matrix A). Data type supported: QASYMM8/QASYMM8_SIGNED. * @param[in] b Second input tensor (Matrix B). Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL. - * @param[in] c Third input tensor (Matrix C). It can be a nullptr. Data type supported: S32 - * @param[out] output Output tensor. Data type supported: Data type supported: S32/QASYMM8/QASYMM8_SIGNED + * @param[in] c Third input tensor (Matrix C). It can be a nullptr. Data type supported: S32/F32 + * @param[out] output Output tensor. 
Data type supported: S32/QASYMM8/QASYMM8_SIGNED/F32 * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and * if the reshape of matrix B should be executed only for the first run */ - void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output, const GEMMInfo &gemm_info = GEMMInfo()); + void configure( + const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output, const GEMMInfo &gemm_info = GEMMInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixMultiplyCore * - * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is QASYMM8/QASYMM8_SIGNED otherwise - * - * @param[in] a First input tensor info (Matrix A). Data type supported: QASYMM8/QASYMM8_SIGNED. - * @param[in] b Second input tensor info (Matrix B). Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL. - * @param[in] c Third input tensor info (Matrix C). It can be a nullptr. Data type supported: S32 - * @param[in] output Output tensor info. Data type supported: Data type supported: S32/QASYMM8/QASYMM8_SIGNED - * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and - * if the reshape of matrix B should be executed only for the first run + * Similar to @ref NEGEMMLowpMatrixMultiplyCore::configure() * * @return a status */ - static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo()); + static Status validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + const GEMMInfo &gemm_info = GEMMInfo()); // Inherited methods overridden void run() override; void prepare() override; private: - MemoryGroup _memory_group; - IWeightsManager *_weights_manager; - std::unique_ptr<NEGEMMAssemblyDispatch> _asm_glue; - std::unique_ptr<NEGEMMLowpMatrixMultiplyKernel> _mm_kernel; - std::unique_ptr<NEGEMMInterleave4x4Kernel> _mtx_a_reshape_kernel; - std::unique_ptr<NEGEMMTranspose1xWKernel> _mtx_b_reshape_kernel; - std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _mtx_a_reduction_kernel; - std::unique_ptr<NEGEMMLowpMatrixBReductionKernel> _mtx_b_reduction_kernel; - std::unique_ptr<NEGEMMLowpOffsetContributionKernel> _offset_contribution_kernel; - std::unique_ptr<NEGEMMLowpOffsetContributionOutputStageKernel> _offset_contribution_output_stage_kernel; - NEActivationLayer _activation_func; - std::unique_ptr<NEConvertQuantizedSignednessKernel> _convert_to_signed_asymm; - std::unique_ptr<NEConvertQuantizedSignednessKernel> _convert_from_signed_asymm; - - Tensor _vector_sum_col; - Tensor _vector_sum_row; - Tensor _tmp_a; - Tensor _tmp_b; - Tensor _mm_result_s32; - Tensor _signed_a; - Tensor _signed_output; - const ITensor *_original_b; - int32_t _a_offset; - int32_t _b_offset; - - bool _run_vector_matrix_multiplication; - bool _assembly_path; - bool _fused_assembly_path; - bool _reshape_b_only_on_first_run; - bool _is_prepared; - bool _fuse_output_stage; - bool _run_activation; - bool _flip_signedness; + struct Impl; + std::unique_ptr<Impl> _impl; }; } // namespace arm_compute -#endif /*ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H */ +#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEGEMMLOWPMATRIXMULTIPLYCORE_H diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h index
79b427ea6f..0d932bb4af 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h +++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h @@ -25,9 +25,9 @@ #define ARM_COMPUTE_NEGEMMLOWPOUTPUTSTAGE_H #include "arm_compute/core/Types.h" -#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" +#include "arm_compute/runtime/IFunction.h" -/** This file contains all available output stages for GEMMLowp on Neon. +/** This file contains all available output stages for GEMMLowp. * * In gemmlowp, the "output stage" is the process that takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyCore), * and processes it to obtain the final ASYMM8 value. @@ -39,237 +39,17 @@ namespace arm_compute { class ITensor; class ITensorInfo; - -/** Basic function to execute NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint on Neon. - * - * NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint depends on 3 parameters: - * - * result_fixedpoint_multiplier, result_shift, result_offset_after_shift - * - * The final result is: - * - * (FixedPointMul(input[i][k], result_fixedpoint_multiplier) >> result_shift) + result_offset_after_shift - * - * where FixedPointMul(x, y) is the nearest integer to the following - * mathematical expression, evaluated without overflow or intermediate rounding: - * - * (x * y) / 2^31 - * - * For more information: https://github.com/google/gemmlowp/blob/master/public/output_stages.h#L68 - * - * In case the bias tensor is provided, the final result is: - * - * ((FixedPointMul(input[i][k] + bias[k], result_fixedpoint_multiplier)) >> result_shift) + result_offset_after_shift - * - * This function calls the following Neon kernels: - * - * -# @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel - * - * @note The function accepts also 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions - * after the result is shifted right by result_shift -*/ -class NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint : public INESimpleFunctionNoBorder -{ -public: - /** Constructor */ - NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint() = default; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint(const NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint &operator=(const NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint &) = delete; - /** Prevent instances of this class from being moved (As this class contains non movable objects) */ - NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint(NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint &&) = delete; - /** Prevent instances of this class from being moved (As this class contains non movable objects) */ - NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint &operator=(NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint &&) = delete; - /** Default destructor */ - ~NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint(); - /** Initialise the kernel's inputs, output - * - * @param[in] input Input tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] output Output tensor. 
Data type supported: Data type supported: QASYMM8 - * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add - * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication - * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8 - * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8. Defaults to the minimum possible 32-bit signed integer. - * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8, - * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer. - */ - void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, - int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max()); - /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint - * - * @param[in] input Input tensor. It is the output of @ref NEGEMMLowpMatrixMultiplyCore function. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[in] output Output tensor. Data type supported: Data type supported: QASYMM8 - * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8. Defaults to the minimum possible 32-bit signed integer. - * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8, - * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max()); -}; -/** Basic function to execute NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint on Neon. 
- * - * NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint depends on 3 parameters: - * - * result_fixedpoint_multiplier, result_shift, result_offset_after_shift - * - * The final result is: - * - * (FixedPointMul(input[i][k], result_fixedpoint_multiplier) >> result_shift) + result_offset_after_shift - * - * where FixedPointMul(x, y) is the nearest integer to the following - * mathematical expression, evaluated without overflow or intermediate rounding: - * - * (x * y) / 2^31 - * - * For more information: https://github.com/google/gemmlowp/blob/master/public/output_stages.h#L68 - * - * In case the bias tensor is provided, the final result is: - * - * ((FixedPointMul(input[i][k] + bias[k], result_fixedpoint_multiplier)) >> result_shift) + result_offset_after_shift - * - * This function calls the following Neon kernels: - * - * -# @ref NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel - * - * @note The function accepts also 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions - * after the result is shifted right by result_shift -*/ -class NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint : public INESimpleFunctionNoBorder -{ -public: - /** Constructor */ - NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint() = default; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint(const NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint &operator=(const NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint &) = delete; - /** Prevent instances of this class from being moved (As this class contains non movable objects) */ - NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint(NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint &&) = delete; - /** Prevent instances of this class from being moved (As this class contains non movable objects) */ - NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint &operator=(NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint &&) = delete; - /** Default destructor */ - ~NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint(); - /** Initialise the kernel's inputs, output - * - * @param[in] input Input tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] output Output tensor. Data type supported: Data type supported: QASYMM8_SIGNED - * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add - * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication - * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8_SIGNED - * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED. Defaults to the minimum possible 32-bit signed integer. - * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED, - * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer. 
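The fixed-point arithmetic documented in these removed classes survives behind cpu::CpuGemmLowpOutputStage. A simplified scalar model of the documented formula, ignoring the saturating behaviour of the real kernels, may help make it concrete:

    #include <cstdint>

    // Nearest integer to (x * multiplier) / 2^31, computed without overflow
    // or intermediate rounding, per the gemmlowp output-stage documentation.
    int32_t fixed_point_mul(int32_t x, int32_t multiplier)
    {
        const int64_t ab    = static_cast<int64_t>(x) * multiplier;
        const int64_t nudge = ab >= 0 ? (1ll << 30) : 1 - (1ll << 30);
        return static_cast<int32_t>((ab + nudge) / (1ll << 31));
    }

    // (FixedPointMul(acc + bias, multiplier) >> shift) + offset_after_shift
    int32_t quantize_down(int32_t acc, int32_t bias, int32_t multiplier, int shift, int32_t offset_after_shift)
    {
        const int32_t mul     = fixed_point_mul(acc + bias, multiplier);
        const int32_t rounded = shift > 0 ? ((mul + (1 << (shift - 1))) >> shift) : mul;
        return rounded + offset_after_shift; // the kernels then clamp to [min, max]
    }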
- */ - void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, - int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max()); - /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint - * - * @param[in] input Input tensor. It is the output of @ref NEGEMMLowpMatrixMultiplyCore function. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[in] output Output tensor. Data type supported: Data type supported: QASYMM8_SIGNED - * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED. Defaults to the minimum possible 32-bit signed integer. - * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED, - * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max()); -}; -/** Basic function to execute NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint on Neon. - * - * NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint depends on 2 parameters: +/** Basic function to execute GEMMLowpQuantizeDown kernels. * - * result_fixedpoint_multiplier, result_shift + * This function calls the following operators: * - * The final result is: - * - * (FixedPointMul(input[i][k], result_fixedpoint_multiplier) >> result_shift) - * - * where FixedPointMul(x, y) is the nearest integer to the following - * mathematical expression, evaluated without overflow or intermediate rounding: - * - * (x * y) / 2^31 - * - * For more information: https://github.com/google/gemmlowp/blob/master/public/output_stages.h#L68 - * - * In case the bias tensor is provided, the final result is: - * - * ((FixedPointMul(input[i][k] + bias[k], result_fixedpoint_multiplier)) >> result_shift) + result_offset_after_shift - * - * This function calls the following Neon kernels: - * - * -# @ref NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel - * - * @note The function accepts also 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions - * after the result is shifted right by result_shift + * -# @ref cpu::CpuGemmLowpOutputStage */ -class NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint : public INESimpleFunctionNoBorder +class NEGEMMLowpOutputStage : public IFunction { public: /** Constructor */ - NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint() = default; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint(const NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint &operator=(const NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint &) = delete; - /** Prevent instances of 
this class from being moved (As this class contains non movable objects) */ - NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint(NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint &&) = delete; - /** Prevent instances of this class from being moved (As this class contains non movable objects) */ - NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint &operator=(NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint &&) = delete; - /** Default destructor */ - ~NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint(); - /** Initialise the kernel's inputs, output - * - * @param[in] input Input tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] output Output tensor. Data type supported: Data type supported: QSYMM16 - * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add - * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication - * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to the minimum possible 32-bit signed integer. - * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QSYMM16. - * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer. - */ - void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int min = std::numeric_limits<int32_t>::lowest(), - int max = std::numeric_limits<int32_t>::max()); - /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint - * - * @param[in] input Input tensor info. It is the output of @ref NEGEMMLowpMatrixMultiplyCore function. Data type supported: S32 - * @param[in] bias Biases tensor info. Only shared biases supported and it can be a nullptr if the addition of biases is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[in] output Output tensor info. Data type supported: Data type supported: QSYMM16 - * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to the minimum possible 32-bit signed integer. - * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QSYMM16, - * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max()); -}; - -/** Basic function to execute GEMMLowpQuantizeDown kernels on Neon. 
- * - * This function calls the following Neon kernels: - * - * -# @ref NEGEMMLowpQuantizeDownInt32ScaleKernel - * -# @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel - * -# @ref NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel - * -# @ref NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel -*/ -class NEGEMMLowpOutputStage : public INESimpleFunctionNoBorder -{ -public: - /** Constructor */ - NEGEMMLowpOutputStage() = default; + NEGEMMLowpOutputStage(); /** Prevent instances of this class from being copied (As this class contains pointers) */ NEGEMMLowpOutputStage(const NEGEMMLowpOutputStage &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ @@ -282,6 +62,16 @@ public: ~NEGEMMLowpOutputStage(); /** Initialise the kernel's inputs, output * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |dst | + * |:--------------|:-------------|:-------------| + * |S32 |S32 |QASYMM8 | + * |S32 |S32 |QASYMM8_SIGNED| + * |S32 |S32 |QSYMM16 | + * * @param[in] input Input tensor. Data type supported: S32 * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. @@ -299,7 +89,17 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo &info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *bias, + const ITensorInfo *output, + const GEMMLowpOutputStageInfo &info); + + // Inherited methods overridden: + void run() override; + +private: + struct Impl; + std::unique_ptr<Impl> _impl; }; } // namespace arm_compute #endif /*ARM_COMPUTE_NEGEMMLOWPOUTPUTSTAGE_H */ diff --git a/arm_compute/runtime/NEON/functions/NEGather.h b/arm_compute/runtime/NEON/functions/NEGather.h index a5e0461227..9c7ae0134d 100644 --- a/arm_compute/runtime/NEON/functions/NEGather.h +++ b/arm_compute/runtime/NEON/functions/NEGather.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -40,19 +40,26 @@ class NEGather : public INESimpleFunctionNoBorder public: /** Initialise the kernel's inputs and outputs * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |All |All | + * * @param[in] input Source tensor. Supported tensor rank: up to 4. Data type supported: All - * @param[in] indices Indices tensor. Supported tensor rank: up to 1. Must be one of the following type: U32/S32. Each value Must be in range [0, input.shape[@p axis]) + * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the following type: U32/S32. Each value must be in range [0, input.shape[@p axis]), otherwise the result will become unpredictable. + * @note The "axis" must be in the range [0, input.rank -1] when indices is a vector, and must be 1 when indices is a 2D or 3D tensor. * @param[out] output Destination tensor. Data type supported: Same as @p input * @param[in] axis (Optional) The axis in @p input to gather @p indices from. 
Defaults to 0 + * */ void configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis = 0); - /** Static function to check if given info will lead to a valid configuration of @ref NEGatherKernel + /** Static function to check if given info will lead to a valid configuration * - * @param[in] input Source tensor info. Supported tensor rank: up to 4. Data type supported: All - * @param[in] indices Indices tensor info. Supported tensor rank: up to 1. Must be one of the following types: U32/S32. Each value Must be in range [0, input.shape[@p axis]) - * @param[in] output Destination tensor info. Data type supported: Same as @p input - * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Defaults to 0 + * Similar to @ref NEGather::configure() * * @return a status */ diff --git a/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h b/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h index 90e2307ce8..0f294fde22 100644 --- a/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h +++ b/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h @@ -44,14 +44,14 @@ class NEComputeAllAnchorsKernel; /** Basic function to generate proposals for a RPN (Region Proposal Network) * - * This function calls the following Neon kernels: + * This function calls the following Arm(R) Neon(TM) layers/kernels: * -# @ref NEComputeAllAnchorsKernel * -# @ref NEPermute x 2 * -# @ref NEReshapeLayer x 2 * -# @ref NEBoundingBoxTransform * -# @ref NEPadLayerKernel - * -# @ref NEDequantizationLayerKernel x 2 - * -# @ref NEQuantizationLayerKernel + * -# @ref NEDequantizationLayer x 2 + * -# @ref NEQuantizationLayer * And the following CPP kernels: * -# @ref CPPBoxWithNonMaximaSuppressionLimit */ @@ -72,6 +72,16 @@ public: /** Set the input and output tensors. * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:------------------|:--------|:--------------| + * |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 | + * |QASYMM8 |QSYMM8 |QSYMM16 |QASYMM8 | + * * @param[in] scores Scores from convolution layer of size (W, H, A), where H and W are the height and width of the feature map, and A is the number of anchors. * Data types supported: QASYMM8/F16/F32 * @param[in] deltas Bounding box deltas from convolution layer of size (W, H, 4*A). Data types supported: Same as @p scores @@ -85,7 +95,12 @@ public: * @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the @ref GenerateProposalsInfo struct. * @note Proposals contains all the proposals. Of those, only the first num_valid_proposals are valid. 
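Stepping back to the NEGather interface documented a little earlier, a minimal usage sketch with assumed shapes (rank-1 U32 indices picking three slices along axis 0) could read:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/NEON/functions/NEGather.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void gather_example()
    {
        Tensor src, indices, dst;
        src.allocator()->init(TensorInfo(TensorShape(10U, 4U), 1, DataType::F32));
        indices.allocator()->init(TensorInfo(TensorShape(3U), 1, DataType::U32));
        // Dimension 0 of the output is replaced by the index count.
        dst.allocator()->init(TensorInfo(TensorShape(3U, 4U), 1, DataType::F32));

        NEGather gather;
        gather.configure(&src, &indices, &dst, 0 /* axis */);

        src.allocator()->allocate();
        indices.allocator()->allocate();
        dst.allocator()->allocate();
        // ... fill src, and indices with values in [0, 10) ...
        gather.run();
    }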
*/ - void configure(const ITensor *scores, const ITensor *deltas, const ITensor *anchors, ITensor *proposals, ITensor *scores_out, ITensor *num_valid_proposals, + void configure(const ITensor *scores, + const ITensor *deltas, + const ITensor *anchors, + ITensor *proposals, + ITensor *scores_out, + ITensor *num_valid_proposals, const GenerateProposalsInfo &info); /** Static function to check if given info will lead to a valid configuration of @ref NEGenerateProposalsLayer @@ -102,7 +117,11 @@ public: * * @return a Status */ - static Status validate(const ITensorInfo *scores, const ITensorInfo *deltas, const ITensorInfo *anchors, const ITensorInfo *proposals, const ITensorInfo *scores_out, + static Status validate(const ITensorInfo *scores, + const ITensorInfo *deltas, + const ITensorInfo *anchors, + const ITensorInfo *proposals, + const ITensorInfo *scores_out, const ITensorInfo *num_valid_proposals, const GenerateProposalsInfo &info); @@ -113,7 +132,7 @@ private: // Memory group manager MemoryGroup _memory_group; - // Neon kernels + // kernels/layers NEPermute _permute_deltas; NEReshapeLayer _flatten_deltas; NEPermute _permute_scores; diff --git a/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h b/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h index 57165c94b4..0bc57be09e 100644 --- a/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h +++ b/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -60,6 +60,16 @@ public: ~NEInstanceNormalizationLayer(); /** Set the input and output tensors. * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src |dst | + * |:--------|:---------| + * |F16 |F16 | + * |F32 |F32 | + * * @param[in, out] input Source tensor. In case of @p output tensor = nullptr this tensor will store the result of the normalization. * Data types supported: F16/F32. Data layout supported: NHWC, NCHW * @param[out] output Destination tensor. Data types and data layouts supported: same as @p input. @@ -79,7 +89,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, float gamma = 1.0f, float beta = 0.0f, float epsilon = 1e-12f); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + float gamma = 1.0f, + float beta = 0.0f, + float epsilon = 1e-12f); // Inherited methods overridden: void run() override; @@ -93,5 +107,5 @@ private: Tensor _permuted_input; Tensor _permuted_output; }; -} +} // namespace arm_compute #endif /* ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h b/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h index 173b9d2141..8502cee5d2 100644 --- a/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h +++ b/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -60,6 +60,16 @@ public: ~NEL2NormalizeLayer(); /** Set the input and output tensors. * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src |dst | + * |:--------|:---------| + * |F16 |F16 | + * |F32 |F32 | + * * @param[in, out] input Source tensor. Data types supported: F16/F32. 
(Written to only for border_size != 0) * @param[out] output Destination tensor. Data types and data layouts supported: same as @p input. * @param[in] axis Axis along which to reduce. Negative values wrap around. Maximum supported actual reduction axis : 2 @@ -87,5 +97,5 @@ private: std::unique_ptr<NEL2NormalizeLayerKernel> _normalize_kernel; Tensor _sumsq; }; -} +} // namespace arm_compute #endif /* ARM_COMPUTE_NEL2NORMALIZELAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NELSTMLayer.h b/arm_compute/runtime/NEON/functions/NELSTMLayer.h index ef8defb827..629c5d10a0 100644 --- a/arm_compute/runtime/NEON/functions/NELSTMLayer.h +++ b/arm_compute/runtime/NEON/functions/NELSTMLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,7 +25,8 @@ #define ARM_COMPUTE_NELSTMLAYER_H #include "arm_compute/core/Types.h" -#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h" +#include "arm_compute/runtime/common/LSTMParams.h" +#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h" #include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h" #include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h" @@ -35,7 +36,6 @@ #include "arm_compute/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.h" #include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h" #include "arm_compute/runtime/NEON/functions/NETranspose.h" -#include "arm_compute/runtime/common/LSTMParams.h" namespace arm_compute { @@ -60,6 +60,15 @@ public: ~NELSTMLayer(); /** Initialize function's tensors. * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 - src13 | dst0 - dst3 | + * |:------------|:------------| + * |F16 |F16 | + * |F32 |F32 | + * * @param[in] input Source tensor. Input is a 2D tensor with dimensions [input_size, batch_size]. Data types supported: F16/F32. * @param[in] input_to_forget_weights 2D weights tensor with dimensions [input_size, num_units]. Data type supported: Same as @p input. * @param[in] input_to_cell_weights 2D weights tensor with dimensions [input_size, num_units]. Data type supported: Same as @p input. @@ -95,13 +104,26 @@ public: * @param[in] projection_threshold The clipping threshold for the output from the projection layer, such that values are bound within [-proj_clip, proj_clip]. * If set to 0.0 then clipping is disabled. 
*/ - void configure(const ITensor *input, - const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights, - const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights, - const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias, - const ITensor *output_state_in, const ITensor *cell_state_in, - ITensor *scratch_buffer, ITensor *output_state_out, ITensor *cell_state_out, ITensor *output, - const LSTMParams<ITensor> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold = 0.f, float projection_threshold = 0.f); + void configure(const ITensor *input, + const ITensor *input_to_forget_weights, + const ITensor *input_to_cell_weights, + const ITensor *input_to_output_weights, + const ITensor *recurrent_to_forget_weights, + const ITensor *recurrent_to_cell_weights, + const ITensor *recurrent_to_output_weights, + const ITensor *forget_gate_bias, + const ITensor *cell_bias, + const ITensor *output_gate_bias, + const ITensor *output_state_in, + const ITensor *cell_state_in, + ITensor *scratch_buffer, + ITensor *output_state_out, + ITensor *cell_state_out, + ITensor *output, + const LSTMParams<ITensor> &lstm_params, + const ActivationLayerInfo &activation_info, + float cell_threshold = 0.f, + float projection_threshold = 0.f); /** Static function to check if given info will lead to a valid configuration of @ref NELSTMLayer * @@ -142,13 +164,26 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, - const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights, - const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights, - const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias, - const ITensorInfo *output_state_in, const ITensorInfo *cell_state_in, - const ITensorInfo *scratch_buffer, const ITensorInfo *output_state_out, const ITensorInfo *cell_state_out, const ITensorInfo *output, - const LSTMParams<ITensorInfo> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold = 0.f, float projection_threshold = 0.f); + static Status validate(const ITensorInfo *input, + const ITensorInfo *input_to_forget_weights, + const ITensorInfo *input_to_cell_weights, + const ITensorInfo *input_to_output_weights, + const ITensorInfo *recurrent_to_forget_weights, + const ITensorInfo *recurrent_to_cell_weights, + const ITensorInfo *recurrent_to_output_weights, + const ITensorInfo *forget_gate_bias, + const ITensorInfo *cell_bias, + const ITensorInfo *output_gate_bias, + const ITensorInfo *output_state_in, + const ITensorInfo *cell_state_in, + const ITensorInfo *scratch_buffer, + const ITensorInfo *output_state_out, + const ITensorInfo *cell_state_out, + const ITensorInfo *output, + const LSTMParams<ITensorInfo> &lstm_params, + const ActivationLayerInfo &activation_info, + float cell_threshold = 0.f, + float projection_threshold = 0.f); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h b/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h index a59dcf88cc..ae951669b3 100644 --- a/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h +++ b/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h @@ -25,6 +25,7 @@ 
#define ARM_COMPUTE_NELSTMLAYERQUANTIZED_H #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/common/LSTMParams.h" #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h" #include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h" @@ -38,8 +39,6 @@ #include "arm_compute/runtime/NEON/functions/NESlice.h" #include "arm_compute/runtime/NEON/functions/NETranspose.h" -#include "arm_compute/runtime/common/LSTMParams.h" - namespace arm_compute { // Forward declarations @@ -47,10 +46,10 @@ class ITensor; /** Basic function to run @ref NELSTMLayerQuantized * - * This function calls the following Neon functions/kernels: + * This function calls the following functions/kernels: * * -# @ref NEGEMMLowpMatrixMultiplyCore Quantized matrix multiplication core. Accumulators are 32-bit integers - * -# @ref NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint Convert 32-bit integers into QSYMM16 + * -# @ref NEGEMMLowpOutputStage Convert 32-bit integers into QSYMM16 * -# @ref NETranspose Matrix transpose * -# @ref NEConcatenateLayer Tensor concatenation * -# @ref NEActivationLayer Activation functions (tanh and logistic) @@ -77,6 +76,14 @@ public: ~NELSTMLayerQuantized(); /** Initialize function's tensors. * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 - src8 |src9 - src12 |src13 |src14 |dst0 |dst1 | + * |:-----------|:------------|:-------|:------|:------|:------| + * |QASYMM8 |S32 |QSYMM16 |QASYMM8|QSYMM16|QASYMM8| + * * @param[in] input Source tensor. Input is a 2D tensor with dimensions [input_size, batch_size]. Data types supported: QASYMM8. * @param[in] input_to_input_weights 2D weights tensor with dimensions [input_size, output_size]. Data type supported: Same as @p input. * @param[in] input_to_forget_weights 2D weights tensor with dimensions [input_size, output_size]. Data type supported: Same as @p input. @@ -96,11 +103,22 @@ public: * @param[out] output_state_out Destination tensor. Output is a 2D tensor with dimensions [output_size, batch_size].Data types supported: Same as @p input. 
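The quantized LSTM above is a good reminder of how NEGEMMLowpMatrixMultiplyCore and NEGEMMLowpOutputStage are meant to be chained. A standalone sketch of that two-stage pattern, with made-up shapes and requantization parameters, might be:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
    #include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void gemmlowp_pipeline()
    {
        // A: 16x64, B: 64x32, accumulators: 16x32 (TensorShape is x-first).
        Tensor a, b, acc, dst;
        a.allocator()->init(TensorInfo(TensorShape(64U, 16U), 1, DataType::QASYMM8, QuantizationInfo(0.05f, 10)));
        b.allocator()->init(TensorInfo(TensorShape(32U, 64U), 1, DataType::QASYMM8, QuantizationInfo(0.05f, 5)));
        acc.allocator()->init(TensorInfo(TensorShape(32U, 16U), 1, DataType::S32));
        dst.allocator()->init(TensorInfo(TensorShape(32U, 16U), 1, DataType::QASYMM8, QuantizationInfo(0.1f, 0)));

        NEGEMMLowpMatrixMultiplyCore mm;
        mm.configure(&a, &b, nullptr, &acc); // default GEMMInfo: raw S32 accumulators

        GEMMLowpOutputStageInfo info{};
        info.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
        info.gemmlowp_multiplier = 1073741824; // made-up requantization parameters
        info.gemmlowp_shift      = 5;
        info.gemmlowp_offset     = 0;
        info.gemmlowp_min_bound  = 0;
        info.gemmlowp_max_bound  = 255;
        info.output_data_type    = DataType::QASYMM8;

        NEGEMMLowpOutputStage stage;
        stage.configure(&acc, nullptr, &dst, info);

        for (Tensor *t : {&a, &b, &acc, &dst})
        {
            t->allocator()->allocate();
        }
        mm.run();
        stage.run();
    }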
*/ void configure(const ITensor *input, - const ITensor *input_to_input_weights, const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights, - const ITensor *recurrent_to_input_weights, const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights, - const ITensor *input_gate_bias, const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias, - ITensor *cell_state_in, const ITensor *output_state_in, - ITensor *cell_state_out, ITensor *output_state_out); + const ITensor *input_to_input_weights, + const ITensor *input_to_forget_weights, + const ITensor *input_to_cell_weights, + const ITensor *input_to_output_weights, + const ITensor *recurrent_to_input_weights, + const ITensor *recurrent_to_forget_weights, + const ITensor *recurrent_to_cell_weights, + const ITensor *recurrent_to_output_weights, + const ITensor *input_gate_bias, + const ITensor *forget_gate_bias, + const ITensor *cell_bias, + const ITensor *output_gate_bias, + ITensor *cell_state_in, + const ITensor *output_state_in, + ITensor *cell_state_out, + ITensor *output_state_out); /** Static function to check if given info will lead to a valid configuration of @ref NELSTMLayer * @@ -125,11 +143,22 @@ public: * @return a status */ static Status validate(const ITensorInfo *input, - const ITensorInfo *input_to_input_weights, const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights, - const ITensorInfo *recurrent_to_input_weights, const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights, - const ITensorInfo *input_gate_bias, const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias, - const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in, - const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out); + const ITensorInfo *input_to_input_weights, + const ITensorInfo *input_to_forget_weights, + const ITensorInfo *input_to_cell_weights, + const ITensorInfo *input_to_output_weights, + const ITensorInfo *recurrent_to_input_weights, + const ITensorInfo *recurrent_to_forget_weights, + const ITensorInfo *recurrent_to_cell_weights, + const ITensorInfo *recurrent_to_output_weights, + const ITensorInfo *input_gate_bias, + const ITensorInfo *forget_gate_bias, + const ITensorInfo *cell_bias, + const ITensorInfo *output_gate_bias, + const ITensorInfo *cell_state_in, + const ITensorInfo *output_state_in, + const ITensorInfo *cell_state_out, + const ITensorInfo *output_state_out); // Inherited methods overridden: void run() override; @@ -139,30 +168,30 @@ private: MemoryGroup _memory_group; // Functions used - NEGEMMLowpMatrixMultiplyCore _gemmlowp; - NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint _output_stage; - NETranspose _transpose_weights; - NEConcatenateLayer _concat_input_weights; - NEConcatenateLayer _concat_recurrent_weights; - NEConcatenateLayer _concat_weights; - NEConcatenateLayer _concat_inputs; - NEConcatenateLayer _concat_bias; - NEActivationLayer _sigmoid_forget_gate; - NEActivationLayer _sigmoid_input_gate; - NEActivationLayer _sigmoid_output_gate; - NEActivationLayer _tanh_modulation_gate; - NEActivationLayer _tanh_output_state; - NEArithmeticAddition _add1; - NEArithmeticAddition _add2; - NEPixelWiseMultiplication _mul1; - NEPixelWiseMultiplication 
_mul2; - NEPixelWiseMultiplication _mul3; - NESlice _slice_input_tensor; - NESlice _slice_forget_tensor; - NESlice _slice_cell_tensor; - NESlice _slice_output_tensor; - NEDequantizationLayer _dequantize; - NEQuantizationLayer _quantize; + NEGEMMLowpMatrixMultiplyCore _gemmlowp; + NEGEMMLowpOutputStage _output_stage; + NETranspose _transpose_weights; + NEConcatenateLayer _concat_input_weights; + NEConcatenateLayer _concat_recurrent_weights; + NEConcatenateLayer _concat_weights; + NEConcatenateLayer _concat_inputs; + NEConcatenateLayer _concat_bias; + NEActivationLayer _sigmoid_forget_gate; + NEActivationLayer _sigmoid_input_gate; + NEActivationLayer _sigmoid_output_gate; + NEActivationLayer _tanh_modulation_gate; + NEActivationLayer _tanh_output_state; + NEArithmeticAddition _add1; + NEArithmeticAddition _add2; + NEPixelWiseMultiplication _mul1; + NEPixelWiseMultiplication _mul2; + NEPixelWiseMultiplication _mul3; + NESlice _slice_input_tensor; + NESlice _slice_forget_tensor; + NESlice _slice_cell_tensor; + NESlice _slice_output_tensor; + NEDequantizationLayer _dequantize; + NEQuantizationLayer _quantize; // Tensor pointers const ITensor *_input_to_input_weights; diff --git a/arm_compute/runtime/NEON/functions/NELogical.h b/arm_compute/runtime/NEON/functions/NELogical.h index 04ffce6221..0ad23200c6 100644 --- a/arm_compute/runtime/NEON/functions/NELogical.h +++ b/arm_compute/runtime/NEON/functions/NELogical.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020 Arm Limited. + * Copyright (c) 2020-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -26,7 +26,6 @@ #include "arm_compute/core/Error.h" #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/runtime/Macros.h" #include <memory> @@ -42,12 +41,27 @@ class NELogicalAnd : public IFunction public: /** Constructor */ NELogicalAnd(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NELogicalAnd(const NELogicalAnd &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NELogicalAnd(NELogicalAnd &&) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NELogicalAnd &operator=(const NELogicalAnd &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NELogicalAnd &operator=(NELogicalAnd &&) = delete; /** Destructor */ ~NELogicalAnd(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE_INC(NELogicalAnd) /** Initialise the kernel's inputs and output * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |dst | + * |:--------------|:-------------|:------------| + * |U8 |U8 |U8 | + * * @param[in] input1 First tensor input. Data type supported: U8. * @param[in] input2 Second tensor input. Data type supported: U8. * @param[out] output Output tensor. Data type supported: U8. 
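A minimal usage sketch of NELogicalAnd, with an assumed 1-D shape (the U8 values are interpreted as booleans, zero meaning false):

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/NEON/functions/NELogical.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void logical_and_example()
    {
        Tensor in1, in2, out;
        const TensorInfo info(TensorShape(16U), 1, DataType::U8);
        in1.allocator()->init(info);
        in2.allocator()->init(info);
        out.allocator()->init(info);

        NELogicalAnd land;
        land.configure(&in1, &in2, &out); // out = in1 && in2, element-wise

        in1.allocator()->allocate();
        in2.allocator()->allocate();
        out.allocator()->allocate();
        // ... fill in1/in2 with 0 or 1 values ...
        land.run();
    }

NELogicalOr and NELogicalNot below follow the same pattern, the latter taking a single input.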
@@ -77,12 +91,27 @@ class NELogicalOr : public IFunction public: /** Constructor */ NELogicalOr(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NELogicalOr(const NELogicalOr &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NELogicalOr(NELogicalOr &&) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NELogicalOr &operator=(const NELogicalOr &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NELogicalOr &operator=(NELogicalOr &&) = delete; /** Destructor */ ~NELogicalOr(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE_INC(NELogicalOr) /** Initialise the kernel's inputs and output * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |dst | + * |:--------------|:-------------|:------------| + * |U8 |U8 |U8 | + * * @param[in] input1 First tensor input. Data type supported: U8. * @param[in] input2 Second tensor input. Data type supported: U8. * @param[out] output Output tensor. Data type supported: U8. @@ -112,12 +141,27 @@ class NELogicalNot : public IFunction public: /** Constructor */ NELogicalNot(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NELogicalNot(const NELogicalNot &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NELogicalNot(NELogicalNot &&) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NELogicalNot &operator=(const NELogicalNot &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NELogicalNot &operator=(NELogicalNot &&) = delete; /** Destructor */ ~NELogicalNot(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE_INC(NELogicalNot) /** Initialise the kernel's inputs and output * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:-------------| + * |U8 |U8 | + * * @param[in] input Input tensor. Data type supported: U8. * @param[out] output Output tensor. Data type supported: U8. */ diff --git a/arm_compute/runtime/NEON/functions/NEMatMul.h b/arm_compute/runtime/NEON/functions/NEMatMul.h new file mode 100644 index 0000000000..58dd7a6f6b --- /dev/null +++ b/arm_compute/runtime/NEON/functions/NEMatMul.h @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEMATMUL_H +#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEMATMUL_H + +#include "arm_compute/core/Types.h" +#include "arm_compute/function_info/ActivationLayerInfo.h" +#include "arm_compute/runtime/IFunction.h" + +#include <memory> + +namespace arm_compute +{ +/** Settings for MatMul CPU implementation */ +class CpuMatMulSettings +{ +public: + // Get fast math flag + bool fast_math() const + { + return _fast_math; + } + // Get fixed format flag + bool fixed_format() const + { + return _fixed_format; + } + // Set fast math flag + CpuMatMulSettings &fast_math(bool fmath) + { + _fast_math = fmath; + return *this; + } + // Set fixed format flag + CpuMatMulSettings &fixed_format(bool fixed_format) + { + _fixed_format = fixed_format; + return *this; + } + +private: + bool _fast_math{false}; + bool _fixed_format{false}; +}; + +// Forward declarations +class ITensor; +class ITensorInfo; +class MatMulInfo; +class Status; + +/** Basic function to run the following operators: + * + * -# @ref cpu::CpuMatMul + */ +class NEMatMul : public IFunction +{ +public: + /** Constructor */ + NEMatMul(); + /** Destructor */ + ~NEMatMul(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEMatMul(const NEMatMul &) = delete; + /** Default move constructor */ + NEMatMul(NEMatMul &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEMatMul &operator=(const NEMatMul &) = delete; + /** Default move assignment operator */ + NEMatMul &operator=(NEMatMul &&) = default; + /** Initialize + * + * Valid data layouts: + * - Any + * + * Valid data type configurations: + * |lhs |rhs |dst | + * |:--------------|:------------------|:--------------| + * |F32 |F32 |F32 | + * |F16 |F16 |F16 | + * |BFLOAT16 |BFLOAT16 |BFLOAT16 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED | + * |QASYMM8 |QASYMM8 |QASYMM8 | + * + * @param[in] lhs Left-hand side tensor. Data types supported: F16/F32/QASYMM8_SIGNED/QASYMM8. + * @param[in] rhs Right-hand side tensor. Data types supported: same as @p lhs. + * @param[out] dst Output tensor to store the result of the batched matrix multiplication. Data types supported: same as @p lhs / @p rhs. + * @param[in] info Contains MatMul operation information described in @ref MatMulInfo. + * @param[in] settings Contains flags for function level settings i.e. fast math + * @param[in] act_info (Optional) Contains activation function and lower and upper bound values for bounded activation functions. + */ + void configure(ITensor *lhs, + ITensor *rhs, + ITensor *dst, + const MatMulInfo &info, + const CpuMatMulSettings &settings, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref NEMatMul + * + * @param[in] lhs Left-hand side tensor info. Data types supported: F16/F32/QASYMM8_SIGNED/QASYMM8. + * @param[in] rhs Right-hand side tensor info. Data types supported: same as @p lhs. + * @param[in] dst Output tensor info to store the result of the batched matrix multiplication. Data types supported: same as @p lhs / @p rhs.
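Putting the new NEMatMul interface together, a minimal single-batch sketch with assumed F32 shapes could read as follows; note that TensorShape lists the innermost dimension first, so an [M x K] matrix becomes TensorShape(K, M). The MatMulInfo include path is the one used by recent ACL releases:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/function_info/MatMulInfo.h"
    #include "arm_compute/runtime/NEON/functions/NEMatMul.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void matmul_example()
    {
        // [M=8 x K=32] * [K=32 x N=16] -> [M=8 x N=16]
        Tensor lhs, rhs, dst;
        lhs.allocator()->init(TensorInfo(TensorShape(32U, 8U), 1, DataType::F32));
        rhs.allocator()->init(TensorInfo(TensorShape(16U, 32U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(16U, 8U), 1, DataType::F32));

        NEMatMul mm;
        // Opt in to fast math via the chainable settings object defined above.
        mm.configure(&lhs, &rhs, &dst, MatMulInfo(), CpuMatMulSettings().fast_math(true));

        lhs.allocator()->allocate();
        rhs.allocator()->allocate();
        dst.allocator()->allocate();
        mm.run();
    }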
+ * @param[in] info Contains MatMul operation information described in @ref MatMulInfo. + * @param[in] settings Contains flags for function level settings i.e. fast math + * @param[in] act_info (Optional) Contains activation function and lower and upper bound values for bounded activation functions. + * + * @return Status + */ + static Status validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + const MatMulInfo &info, + const CpuMatMulSettings &settings, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden + void run() override; + +private: + struct Impl; + std::unique_ptr<Impl> _impl; +}; +} // namespace arm_compute +#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEMATMUL_H diff --git a/arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h b/arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h index 7b1f7e9ca1..e00fc4544f 100644 --- a/arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h +++ b/arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021 Arm Limited. + * Copyright (c) 2020-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -26,6 +26,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" + #include <memory> namespace arm_compute @@ -33,12 +34,10 @@ namespace arm_compute class ITensor; class ITensorInfo; class NEFill; -class NEMaxUnpoolingLayerKernel; -/** Function to perform MaxUnpooling. This function calls the following Neon kernels: +/** Function to perform MaxUnpooling. This function calls the following kernels: * * -# @ref NEFill - * -# @ref NEMaxUnpoolingLayerKernel */ class NEMaxUnpoolingLayer : public IFunction { @@ -57,6 +56,18 @@ public: ~NEMaxUnpoolingLayer(); /** Set the input and output tensors. * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |QASYMM8 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED | + * |F16 |F16 | + * |F32 |F32 | + * * @note Only supported pool size 2 * * @param[in, out] input Source tensor. (Written to only when padding != 0) Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. @@ -76,14 +87,18 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *indices, + const ITensorInfo *output, + const PoolingLayerInfo &pool_info); // Inherited methods overridden: void run() override; private: - std::unique_ptr<NEFill> _fill_func; - std::unique_ptr<NEMaxUnpoolingLayerKernel> _unpooling_layer_kernel; + std::unique_ptr<NEFill> _fill_func; + struct Impl; + std::unique_ptr<Impl> _impl; }; -} +} // namespace arm_compute #endif /* ARM_COMPUTE_NEMAXUNPOOLINGLAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.h b/arm_compute/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.h index 31e376191c..41aa81946b 100644 --- a/arm_compute/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.h +++ b/arm_compute/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -50,6 +50,16 @@ public: ~NEMeanStdDevNormalizationLayer(); /** Initialise the function's input and outputs.
* + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src |dst | + * |:--------|:---------| + * |F32 |F32 | + * |F16 |F16 | + * * @note If the output tensor is a nullptr, the normalization will be performed in-place. * * @param[in, out] input Input tensor with 2 dimensions. Data types supported: F16/F32. diff --git a/arm_compute/runtime/NEON/functions/NENormalizationLayer.h b/arm_compute/runtime/NEON/functions/NENormalizationLayer.h index fbc2f6f95b..27e3fa674e 100644 --- a/arm_compute/runtime/NEON/functions/NENormalizationLayer.h +++ b/arm_compute/runtime/NEON/functions/NENormalizationLayer.h @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_NENORMALIZATIONLAYER_H #define ARM_COMPUTE_NENORMALIZATIONLAYER_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h" @@ -39,7 +38,7 @@ namespace arm_compute class ITensor; class NENormalizationLayerKernel; -/** Basic function to compute a normalization layer. This function calls the following Neon kernels: +/** Basic function to compute a normalization layer. This function calls the following kernels: * * -# @ref NEPixelWiseMultiplication * -# @ref NEFillBorderKernel @@ -63,6 +62,16 @@ public: ~NENormalizationLayer(); /** Set the input and output tensors. * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src |dst | + * |:--------|:---------| + * |F32 |F32 | + * |F16 |F16 | + * * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM], * and an optional 4th dimension for batch of inputs. Data type supported: F16/F32. Data layouts supported: NCHW/NHWC. 
* @param[out] output Destination with the same dimensions, data type, data layout and number of channels of @p input @@ -78,16 +87,17 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info); + static Status + validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info); // Inherited methods overridden: void run() override; private: - MemoryGroup _memory_group; /**< Function memory group */ - std::unique_ptr<NENormalizationLayerKernel> _norm_kernel; /**< Normalization layer kernel */ - NEPixelWiseMultiplication _multiply_f; /**< Pixel multiplication function */ - Tensor _input_squared; /**< The intermediate buffer which stores results of squaring input */ + MemoryGroup _memory_group; /**< Function memory group */ + std::unique_ptr<NENormalizationLayerKernel> _norm_kernel; /**< Normalization layer kernel */ + NEPixelWiseMultiplication _multiply_f; /**< Pixel multiplication function */ + Tensor _input_squared; /**< The intermediate buffer which stores results of squaring input */ }; -} +} // namespace arm_compute #endif /* ARM_COMPUTE_NENORMALIZATIONLAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NEPReluLayer.h b/arm_compute/runtime/NEON/functions/NEPReluLayer.h index 12ffb8da7b..81d5fd162c 100644 --- a/arm_compute/runtime/NEON/functions/NEPReluLayer.h +++ b/arm_compute/runtime/NEON/functions/NEPReluLayer.h @@ -26,41 +26,14 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/runtime/NEON/INEOperator.h" + +#include <memory> namespace arm_compute { class ITensor; class ITensorInfo; -namespace experimental -{ -/** Basic function to run @ref cpu::kernels::CpuArithmeticKernel for PRELU - * - * @note The function implements an activation layer with the PRELU activation function. - */ -class NEPRelu : public INEOperator -{ -public: - /** Set the input and output tensor. - * - * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[in] alpha Source alpha tensor info. Data types supported: same of @p input. - * @param[out] output Destination tensor info. Data type supported: same as @p input - */ - void configure(const ITensorInfo *input, const ITensorInfo *alpha, ITensorInfo *output); - /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuComparisonKernel - * - * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[in] alpha Source alpha tensor info. Data types supported: same of @p input. - * @param[in] output Destination tensor info. Data type supported: same as @p input - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output); -}; -} // namespace experimental - /** Basic function to run @ref cpu::kernels::CpuArithmeticKernel for PRELU * * @note The function implements an activation layer with the PRELU activation function. @@ -82,6 +55,17 @@ public: NEPReluLayer &operator=(NEPReluLayer &&); /** Set the input and output tensor. * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |QASYMM8 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED | + * |F16 |F16 | + * |F32 |F32 | + * * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. * @param[in] alpha Source alpha tensor. 
Data types supported: same as @p input. * @param[out] output Destination tensor. Data type supported: same as @p input diff --git a/arm_compute/runtime/NEON/functions/NEPadLayer.h b/arm_compute/runtime/NEON/functions/NEPadLayer.h index 242625604f..494b1c0641 100644 --- a/arm_compute/runtime/NEON/functions/NEPadLayer.h +++ b/arm_compute/runtime/NEON/functions/NEPadLayer.h @@ -24,21 +24,21 @@ #ifndef ARM_COMPUTE_NEPADLAYER_H #define ARM_COMPUTE_NEPADLAYER_H +#include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h" #include "arm_compute/runtime/NEON/functions/NECopy.h" #include "arm_compute/runtime/NEON/functions/NEStridedSlice.h" #include "arm_compute/runtime/SubTensor.h" - -#include "arm_compute/core/Types.h" #include "arm_compute/runtime/Tensor.h" + #include <memory> namespace arm_compute { class NEPadLayerKernel; -/** Basic function to pad a tensor. This function calls the following Neon functions/kernels: +/** Basic function to pad a tensor. This function calls the following functions/kernels: * * - For padding mode = PaddingMode::CONSTANT: * -# @ref NEPadLayerKernel @@ -65,6 +65,15 @@ public: ~NEPadLayer(); /** Initialize the function * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src |dst | + * |:--------|:---------| + * |All |All | + * * @param[in] input Source tensor. Data types supported: All. * @param[out] output Output tensor. Data type supported: same as @p input * @param[in] padding The padding for each spatial dimension of the input tensor. The pair padding[i] * @@ -73,7 +82,11 @@ * @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT, * or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT). */ - void configure(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value = PixelValue(), const PaddingMode mode = PaddingMode::CONSTANT); + void configure(ITensor *input, + ITensor *output, + const PaddingList &padding, + const PixelValue constant_value = PixelValue(), + const PaddingMode mode = PaddingMode::CONSTANT); /** Static function to check if given info will lead to a valid configuration of @ref NEPadLayer. * * @param[in] input Source tensor info. Data types supported: All. @@ -86,7 +99,11 @@ * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, const PixelValue constant_value = PixelValue(), const PaddingMode mode = PaddingMode::CONSTANT); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const PaddingList &padding, + const PixelValue constant_value = PixelValue(), + const PaddingMode mode = PaddingMode::CONSTANT); // Inherited methods overridden: void run() override; @@ -100,7 +117,10 @@ private: * specifies the front and the end padding in the i-th dimension. * @param[in] constant_value Constant value to be used for the padding */ - void configure_constant_mode(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value); + void configure_constant_mode(ITensor *input, + ITensor *output, + const PaddingList &padding, + const PixelValue constant_value); /** Configure functions for when reflect or symmetric padding is used. * * @param[in] input Source tensor. Data types supported: All.
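Since NEPadLayer::configure is re-wrapped above, a short usage sketch may help; the shapes, padding amounts and F32 type are illustrative assumptions:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/NEON/functions/NEPadLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        // Pad an 8x8 F32 tensor by 1 element on each side in x and 2 in y -> 10x12.
        Tensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(8U, 8U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(10U, 12U), 1, DataType::F32));

        const PaddingList padding = {{1, 1}, {2, 2}};

        NEPadLayer pad;
        // Defaults apply: constant_value = PixelValue() and mode = PaddingMode::CONSTANT.
        pad.configure(&src, &dst, padding);

        src.allocator()->allocate();
        dst.allocator()->allocate();
        pad.run();
        return 0;
    }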
diff --git a/arm_compute/runtime/NEON/functions/NEPermute.h b/arm_compute/runtime/NEON/functions/NEPermute.h index cf7e25213b..2cef64764d 100644 --- a/arm_compute/runtime/NEON/functions/NEPermute.h +++ b/arm_compute/runtime/NEON/functions/NEPermute.h @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_NEPERMUTE_H #define ARM_COMPUTE_NEPERMUTE_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" #include <memory> @@ -47,12 +46,21 @@ public: /** Prevent instances of this class from being copied (As this class contains pointers) */ NEPermute(const NEPermute &) = delete; /** Default move constructor */ - NEPermute(NEPermute &&); + NEPermute(NEPermute &&) = default; /** Prevent instances of this class from being copied (As this class contains pointers) */ NEPermute &operator=(const NEPermute &) = delete; /** Default move assignment operator */ - NEPermute &operator=(NEPermute &&); - /** Configure the permute Neon kernel + NEPermute &operator=(NEPermute &&) = default; + /** Configure the permute function + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src |dst | + * |:------|:------| + * |All |All | * * @note Arbitrary permutation vectors are supported with rank not greater than 4 * diff --git a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h index 91cf44ff2e..3d81bf6087 100644 --- a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h +++ b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 Arm Limited. + * Copyright (c) 2016-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,115 +24,19 @@ #ifndef ARM_COMPUTE_NEPIXELWISEMULTIPLICATION_H #define ARM_COMPUTE_NEPIXELWISEMULTIPLICATION_H +#include "arm_compute/core/Rounding.h" #include "arm_compute/core/Types.h" +#include "arm_compute/function_info/ActivationLayerInfo.h" #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/runtime/NEON/INEOperator.h" + +#include <memory> namespace arm_compute { class ITensor; class ITensorInfo; -namespace experimental -{ -/** Basic function to run @ref NEPixelWiseMultiplicationKernel */ -class NEPixelWiseMultiplication : public INEOperator -{ -public: - /** Initialise the kernel's inputs, output and convertion policy. - * - * Valid configurations (Input1,Input2) -> Output : - * - * Support: Broadcast? Scale=1/255? - * - (U8,U8) -> U8, S16 N Y - * - (U8,S16) -> S16 N Y - * - (S16,U8) -> S16 N Y - * - (S16,S16) -> S16 N Y - * - (S32,S32) -> S32 Y N - * - (F16,F16) -> F16 N Y - * - (F32,F32) -> F32 Y Y - * - (QASYMM8,QASYMM8) -> QASYMM8 Y Y - * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED Y Y - * - (QSYMM16,QSYMM16) -> QSYMM16, S32 N Y - * - * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported. - * For all other scale values only round to zero (implemented as round towards minus infinity) is supported. - * - * @param[in, out] input1 First input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32 - * This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. - * @param[in, out] input2 Second input tensor. 
Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32 - * This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. - * @param[out] output Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32/S32 - * @param[in] scale Scale to apply after multiplication. - * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. - * If both @p input1, @p input2 and @p output are of datatype S32, scale cannot be 1/255 - * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if any of the inputs is of quantized datatype - * @param[in] rounding_policy Rounding policy. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. - */ - void configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, - const ActivationLayerInfo &act_info = ActivationLayerInfo()); - /** Static function to check if given info will lead to a valid configuration of @ref NEPixelWiseMultiplication - * - * Valid configurations (Input1,Input2) -> Output : - * - * Support: Broadcast? Scale=1/255? - * - (U8,U8) -> U8, S16 N Y - * - (U8,S16) -> S16 N Y - * - (S16,U8) -> S16 N Y - * - (S16,S16) -> S16 N Y - * - (S32,S32) -> S32 Y N - * - (F16,F16) -> F16 N Y - * - (F32,F32) -> F32 Y Y - * - (QASYMM8,QASYMM8) -> QASYMM8 Y Y - * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED Y Y - * - (QSYMM16,QSYMM16) -> QSYMM16, S32 N Y - * - * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported. - * For all other scale values only round to zero (implemented as round towards minus infinity) is supported. - * - * @param[in] input1 First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32 - * @param[in] input2 Second input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32 - * @param[in] output Output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32/S32 - * @param[in] scale Scale to apply after multiplication. - * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. - * If both @p input1, @p input2 and @p output are of datatype S32, scale cannot be 1/255 - * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if any of the inputs is of quantized datatype - * @param[in] rounding_policy Rounding policy. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. - * - * @return a status - */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, - const ActivationLayerInfo &act_info = ActivationLayerInfo()); -}; - -/** Basic function to run @ref NEComplexPixelWiseMultiplicationKernel. */ -class NEComplexPixelWiseMultiplication : public INEOperator -{ -public: - /** Initialise the kernel's inputs, output. - * - * @param[in, out] input1 An input tensor. Data types supported: F32. Number of channels supported: 2 (complex tensor). - * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. - * @param[in, out] input2 An input tensor. 
Data types supported: same as @p input1. Number of channels supported: same as @p input1. - * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. - * @param[out] output The output tensor. Data types supported: same as @p input1. Number of channels: same as @p input1. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. - */ - void configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); - /** Static function to check if given info will lead to a valid configuration of @ref NEComplexPixelWiseMultiplication - * - * @param[in] input1 An input tensor info. Data types supported: F32. Number of channels supported: 2 (complex tensor). - * @param[in] input2 An input tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1. - * @param[in] output The output tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. - */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); -}; -} // namespace experimental - -/** Basic function to run @ref NEPixelWiseMultiplicationKernel */ +/** Basic function to run @ref cpu::CpuMul */ class NEPixelWiseMultiplication : public IFunction { public: @@ -143,13 +47,31 @@ public: /** Prevent instances of this class from being copied (As this class contains pointers) */ NEPixelWiseMultiplication(const NEPixelWiseMultiplication &) = delete; /** Default move constructor */ - NEPixelWiseMultiplication(NEPixelWiseMultiplication &&); + NEPixelWiseMultiplication(NEPixelWiseMultiplication &&) = default; /** Prevent instances of this class from being copied (As this class contains pointers) */ NEPixelWiseMultiplication &operator=(const NEPixelWiseMultiplication &) = delete; /** Default move assignment operator */ - NEPixelWiseMultiplication &operator=(NEPixelWiseMultiplication &&); + NEPixelWiseMultiplication &operator=(NEPixelWiseMultiplication &&) = default; /** Initialise the kernel's inputs, output and conversion policy. * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |dst | + * |:--------------|:--------------|:--------------| + * |QASYMM8 |QASYMM8 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED | + * |QSYMM16 |QSYMM16 |QSYMM16 | + * |QSYMM16 |QSYMM16 |S32 | + * |U8 |U8 |U8 | + * |U8 |U8 |S16 | + * |U8 |S16 |S16 | + * |S16 |U8 |S16 | + * |S16 |S16 |S16 | + * |F16 |F16 |F16 | + * |F32 |F32 |F32 | + * |S32 |S32 |S32 | + * * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported. * For all other scale values only round to zero (implemented as round towards minus infinity) is supported. * @@ -173,7 +95,12 @@ * @param[in] rounding_policy Rounding policy. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
*/ - void configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, + void configure(const ITensor *input1, + const ITensor *input2, + ITensor *output, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEPixelWiseMultiplication * @@ -200,7 +127,12 @@ * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: @@ -211,7 +143,7 @@ private: std::unique_ptr<Impl> _impl; }; -/** Basic function to run @ref NEComplexPixelWiseMultiplicationKernel. */ +/** Basic function to run @ref cpu::CpuComplexMul. */ class NEComplexPixelWiseMultiplication : public IFunction { public: @@ -222,11 +154,11 @@ public: /** Prevent instances of this class from being copied (As this class contains pointers) */ NEComplexPixelWiseMultiplication(const NEComplexPixelWiseMultiplication &) = delete; /** Default move constructor */ - NEComplexPixelWiseMultiplication(NEComplexPixelWiseMultiplication &&); + NEComplexPixelWiseMultiplication(NEComplexPixelWiseMultiplication &&) = default; /** Prevent instances of this class from being copied (As this class contains pointers) */ NEComplexPixelWiseMultiplication &operator=(const NEComplexPixelWiseMultiplication &) = delete; /** Default move assignment operator */ - NEComplexPixelWiseMultiplication &operator=(NEComplexPixelWiseMultiplication &&); + NEComplexPixelWiseMultiplication &operator=(NEComplexPixelWiseMultiplication &&) = default; /** Initialise the kernel's inputs and output. * * @param[in, out] input1 An input tensor. Data types supported: F32. Number of channels supported: 2 (complex tensor). @@ -236,7 +168,10 @@ * @param[out] output The output tensor. Data types supported: same as @p input1. Number of channels: same as @p input1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. */ - void configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(ITensor *input1, + ITensor *input2, + ITensor *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEComplexPixelWiseMultiplication * * @param[in] input1 An input tensor info. Data types supported: F32. Number of channels supported: 2 (complex tensor). @@ -244,7 +179,10 @@ * @param[in] output The output tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run() override; @@ -253,5 +191,5 @@ struct Impl; std::unique_ptr<Impl> _impl; }; -} +} // namespace arm_compute #endif /*ARM_COMPUTE_NEPIXELWISEMULTIPLICATION_H */
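With the experimental operator variant removed, the ITensor-based class above is the only public entry point. A minimal sketch of its use, with illustrative F32 shapes and a scale of 1 (a valid 1/2^n value):

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        Tensor a, b, out;
        a.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::F32));
        b.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::F32));
        out.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::F32));

        NEPixelWiseMultiplication mul;
        // scale = 1 gives a plain elementwise product; per the notes above,
        // any scale other than 1/255 requires RoundingPolicy::TO_ZERO.
        mul.configure(&a, &b, &out, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);

        a.allocator()->allocate();
        b.allocator()->allocate();
        out.allocator()->allocate();
        mul.run();
        return 0;
    }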
diff --git a/arm_compute/runtime/NEON/functions/NEPooling3dLayer.h b/arm_compute/runtime/NEON/functions/NEPooling3dLayer.h new file mode 100644 index 0000000000..09251f2a5f --- /dev/null +++ b/arm_compute/runtime/NEON/functions/NEPooling3dLayer.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_NEPOOLING3DLAYER_H +#define ARM_COMPUTE_NEPOOLING3DLAYER_H + +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" + +#include <memory> + +namespace arm_compute +{ +// Forward declarations +class ITensor; +class ITensorInfo; +class IMemoryManager; +/** Basic function to simulate a pooling 3D layer with the specified pooling operation. This function calls the following operators: + * + * -# @ref cpu::CpuPool3d + */ +class NEPooling3dLayer : public IFunction +{ +public: + /** Constructor */ + NEPooling3dLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEPooling3dLayer(const NEPooling3dLayer &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEPooling3dLayer &operator=(const NEPooling3dLayer &) = delete; + /** Prevent instances of this class from being moved (As this class contains non-movable objects) */ + NEPooling3dLayer(NEPooling3dLayer &&) = delete; + /** Prevent instances of this class from being moved (As this class contains non-movable objects) */ + NEPooling3dLayer &operator=(NEPooling3dLayer &&) = delete; + /** Default destructor */ + ~NEPooling3dLayer(); + /** Set the input and output tensors.
+ * + * Valid data layouts: + * - NDHWC + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |F16 |F16 | + * |F32 |F32 | + * |QASYMM8 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED | + * + * @note Source tensor is padded with -inf for MAX pooling and 0 otherwise + * + * @param[in] input Source tensor. Data types supported: F16/F32/QASYMM8/QASYMM8_SIGNED. + * @param[out] output Destination tensor. + * @param[in] pool_info Contains pooling operation information described in @ref Pooling3dLayerInfo. + */ + void configure(const ITensor *input, ITensor *output, const Pooling3dLayerInfo &pool_info); + /** Static function to check if given info will lead to a valid configuration of @ref NEPooling3dLayer + * + * @param[in] input Source tensor info. Data types supported: F16/F32/QASYMM8/QASYMM8_SIGNED. + * @param[in] output Destination tensor info. + * @param[in] pool_info Contains pooling operation information described in @ref Pooling3dLayerInfo. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Pooling3dLayerInfo &pool_info); + + // Inherited methods overridden: + void run() override; + +private: + struct Impl; + std::unique_ptr<Impl> _impl; +}; +} // namespace arm_compute +#endif /* ARM_COMPUTE_NEPOOLING3DLAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h index e374348f98..768ad0d818 100644 --- a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h +++ b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h @@ -27,6 +27,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" +#include "arm_compute/runtime/MemoryGroup.h" #include <memory> @@ -36,11 +37,9 @@ namespace arm_compute class ITensor; class ITensorInfo; -/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following Neon kernels: +/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following operators: * - * -# @ref NEFillBorderKernel (executed if padding size is different from zero) - * -# @ref cpu::kernels::CpuPoolingKernel - * -# @ref cpu::CpuPoolingAssemblyDispatch + * -# @ref cpu::CpuPool2d */ class NEPoolingLayer : public IFunction { @@ -59,7 +58,21 @@ public: ~NEPoolingLayer(); /** Set the input and output tensors. * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |QASYMM8 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED | + * |F16 |F16 | + * |F32 |F32 | + * * @note F16 is supported for pool sizes 2 and 3 only + * @note Source tensor is padded with -inf for MAX pooling and 0 otherwise + * Cases where the pooling region is completely outside the input tensor are supported only for floating-point data types * * @param[in, out] input Source tensor. (Written to only when padding != 0) Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. * @param[out] output Destination tensor. Data types supported: Same as @p input.
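Before the validate() hunk below, a usage sketch of the 2D pooling function documented above; the 3x3 max pooling with stride 2 on a 16x16x8 NHWC F32 input is an illustrative assumption:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        // NHWC shapes are listed as [C, W, H(, N)]: 8 channels over a 16x16 plane.
        TensorInfo src_info(TensorShape(8U, 16U, 16U), 1, DataType::F32);
        src_info.set_data_layout(DataLayout::NHWC);
        // floor((16 - 3) / 2) + 1 = 7 -> output is 8 channels over 7x7.
        TensorInfo dst_info(TensorShape(8U, 7U, 7U), 1, DataType::F32);
        dst_info.set_data_layout(DataLayout::NHWC);

        Tensor src, dst;
        src.allocator()->init(src_info);
        dst.allocator()->init(dst_info);

        const PoolingLayerInfo pool_info(PoolingType::MAX, 3U, DataLayout::NHWC,
                                         PadStrideInfo(2, 2, 0, 0));
        NEPoolingLayer pool;
        pool.configure(&src, &dst, pool_info);

        src.allocator()->allocate();
        dst.allocator()->allocate();
        pool.run();
        return 0;
    }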
@@ -78,7 +91,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices = nullptr); // Inherited methods overridden: void run() override; @@ -87,5 +103,5 @@ private: struct Impl; std::unique_ptr<Impl> _impl; }; -} +} // namespace arm_compute #endif /* ARM_COMPUTE_NEPOOLINGLAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NEPriorBoxLayer.h b/arm_compute/runtime/NEON/functions/NEPriorBoxLayer.h index 3cc79fa28e..858e3299af 100644 --- a/arm_compute/runtime/NEON/functions/NEPriorBoxLayer.h +++ b/arm_compute/runtime/NEON/functions/NEPriorBoxLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -38,6 +38,15 @@ class NEPriorBoxLayer : public INESimpleFunctionNoBorder public: /** Set the input and output tensors. * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |dst | + * |:--------|:--------|:--------| + * |F32 |F32 |F32 | + * * @param[in] input1 First source tensor. Data types supported: F32. Data layouts supported: NCHW/NHWC. * @param[in] input2 Second source tensor. Data types and layouts supported: same as @p input1 * @param[out] output Destination tensor. Output dimensions are [W * H * num_priors * 4, 2]. Data type supported: same as @p input1 @@ -53,7 +62,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const PriorBoxLayerInfo &info); }; } // namespace arm_compute #endif /* ARM_COMPUTE_NEPRIORBOXLAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h b/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h index 954aceba1a..009a4e0911 100644 --- a/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h +++ b/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021 Arm Limited. + * Copyright (c) 2020-2022 Arm Limited.
* * SPDX-License-Identifier: MIT * @@ -25,15 +25,17 @@ #define ARM_COMPUTE_NEQLSTMLAYER_H #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/common/LSTMParams.h" #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h" #include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h" #include "arm_compute/runtime/NEON/functions/NECopy.h" +#include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h" #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" #include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h" #include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h" +#include "arm_compute/runtime/NEON/functions/NEQuantizationLayer.h" #include "arm_compute/runtime/NEON/functions/NETranspose.h" -#include "arm_compute/runtime/common/LSTMParams.h" #include <memory> @@ -43,19 +45,24 @@ namespace arm_compute class ITensor; class ITensorInfo; class NEQLSTMLayerNormalizationKernel; -class NEGEMMLowpMatrixAReductionKernel; - +namespace cpu +{ +namespace kernels +{ +class CpuGemmLowpMatrixAReductionKernel; +} // namespace kernels +} // namespace cpu /** Basic function to run @ref NEQLSTMLayer * - * This function calls the following Neon functions/kernels: + * This function calls the following functions/kernels: * * -# @ref NEActivationLayer Activation functions (tanh and logistic) * -# @ref NEArithmeticAddition Elementwise addition * -# @ref NEArithmeticSubtraction Elementwise subtraction * -# @ref NECopy Copy kernel for copying output_state_out to output * -# @ref NEGEMMLowpMatrixMultiplyCore Quantized matrix multiplication core. Accumulators are 32-bit integers - * -# @ref NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint Convert 32-bit integers into QSYMM16 - * -# @ref NEGEMMLowpMatrixAReductionKernel For precomputing effective biases to use + * -# @ref NEGEMMLowpOutputStage Convert 32-bit integers into QSYMM16 + * -# @ref cpu::kernels::CpuGemmLowpMatrixAReductionKernel For precomputing effective biases to use * -# @ref NEPixelWiseMultiplication Elementwise multiplication * -# @ref NETranspose Transpose function for reshaping the weights * */ @@ -76,6 +83,14 @@ public: ~NEQLSTMLayer(); /** Initialize the function's tensors. * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 - src6 |src7 - src9 |src10 |src11 |dst0 |dst1 - dst2 | + * |:-------------|:------------|:------------|:------|:-------------|:------|:-----------------| + * |QASYMM8_SIGNED|QASYMM8 |S32 |QSYMM16|QASYMM8_SIGNED|QSYMM16|QASYMM8_SIGNED | + * * @param[in] input Source tensor. Input is a 2D tensor with dimensions [input_size, batch_size]. Data types supported: QASYMM8_SIGNED. * @param[in] input_to_forget_weights 2D weights tensor with dimensions [input_size, num_units]. Data type supported: QSYMM8. * @param[in] input_to_cell_weights 2D weights tensor with dimensions [input_size, num_units]. Data type supported: QSYMM8. @@ -115,12 +130,21 @@ * projection_threshold (Optional) The clipping threshold for the output from the projection layer, such that values are bound within * [-proj_clip, proj_clip]. If set to 0.0 then clipping is disabled.
*/ - void configure(const ITensor *input, - const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights, - const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights, - const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias, - const ITensor *cell_state_in, ITensor *output_state_in, - ITensor *cell_state_out, ITensor *output_state_out, ITensor *output, + void configure(const ITensor *input, + const ITensor *input_to_forget_weights, + const ITensor *input_to_cell_weights, + const ITensor *input_to_output_weights, + const ITensor *recurrent_to_forget_weights, + const ITensor *recurrent_to_cell_weights, + const ITensor *recurrent_to_output_weights, + const ITensor *forget_gate_bias, + const ITensor *cell_bias, + const ITensor *output_gate_bias, + const ITensor *cell_state_in, + ITensor *output_state_in, + ITensor *cell_state_out, + ITensor *output_state_out, + ITensor *output, const LSTMParams<ITensor> &lstm_params); /** Static function to check if given info will lead to a valid configuration of @ref NEQLSTMLayer @@ -165,12 +189,21 @@ public: * [-proj_clip, proj_clip]. If set to 0.0 then clipping is disabled. * @return a status */ - static Status validate(const ITensorInfo *input, - const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights, - const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights, - const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias, - const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in, - const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out, const ITensorInfo *output, + static Status validate(const ITensorInfo *input, + const ITensorInfo *input_to_forget_weights, + const ITensorInfo *input_to_cell_weights, + const ITensorInfo *input_to_output_weights, + const ITensorInfo *recurrent_to_forget_weights, + const ITensorInfo *recurrent_to_cell_weights, + const ITensorInfo *recurrent_to_output_weights, + const ITensorInfo *forget_gate_bias, + const ITensorInfo *cell_bias, + const ITensorInfo *output_gate_bias, + const ITensorInfo *cell_state_in, + const ITensorInfo *output_state_in, + const ITensorInfo *cell_state_out, + const ITensorInfo *output_state_out, + const ITensorInfo *output, const LSTMParams<ITensorInfo> &lstm_params); // Inherited methods overridden: @@ -203,10 +236,17 @@ private: * @param[in] mm_res_info Tensor info to be used to initialize output stage result tensor. 
* */ - void configure_mm(NEGEMMLowpMatrixMultiplyCore &mm, NEGEMMLowpOutputStage &outstage, GEMMLowpOutputStageInfo &gemmlowp_info, - const ITensor *mm_input, const ITensor *mm_weights, const ITensor *bias, Tensor *mm_res, - Tensor *outstage_res, float gemmlowp_scale, - const TensorInfo &mm_res_info, const TensorInfo &outstage_tensor_info); + void configure_mm(NEGEMMLowpMatrixMultiplyCore &mm, + NEGEMMLowpOutputStage &outstage, + GEMMLowpOutputStageInfo &gemmlowp_info, + const ITensor *mm_input, + const ITensor *mm_weights, + const ITensor *bias, + Tensor *mm_res, + Tensor *outstage_res, + float gemmlowp_scale, + const TensorInfo &mm_res_info, + const TensorInfo &outstage_tensor_info); MemoryGroup _memory_group; @@ -215,8 +255,8 @@ private: { static constexpr uint32_t max_dimension_supported = 2; - ITensor *_src{ nullptr }; - ITensor *_dst{ nullptr }; + ITensor *_src{nullptr}; + ITensor *_dst{nullptr}; size_t _row_size{}; Window _window{}; @@ -242,70 +282,73 @@ private: }; // Functions used - NETranspose _transpose_input_to_forget_weights; - NETranspose _transpose_input_to_cell_weights; - NETranspose _transpose_input_to_output_weights; - NETranspose _transpose_input_to_input_weights; - NETranspose _transpose_recurrent_to_forget_weights; - NETranspose _transpose_recurrent_to_cell_weights; - NETranspose _transpose_recurrent_to_output_weights; - NETranspose _transpose_recurrent_to_input_weights; - NETranspose _transpose_projection_weights; - std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _input_to_input_reduction; - std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _recurrent_to_input_reduction; - std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _input_to_forget_reduction; - std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _recurrent_to_forget_reduction; - std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _input_to_cell_reduction; - std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _recurrent_to_cell_reduction; - std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _input_to_output_reduction; - std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _recurrent_to_output_reduction; - std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _projection_reduction; - NEArithmeticAddition _projection_bias_add; - NEGEMMLowpMatrixMultiplyCore _mm_input_to_forget; - NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_forget; - NEPixelWiseMultiplication _pixelwise_mul_cell_to_forget; - NEGEMMLowpOutputStage _input_to_forget_outstage; - NEGEMMLowpOutputStage _recurrent_to_forget_outstage; - NEGEMMLowpOutputStage _cell_to_forget_outstage; - NEArithmeticAddition _accumulate_input_recurrent_forget; - NEArithmeticAddition _accumulate_cell_forget; - NEActivationLayer _forget_gate_sigmoid; - NEGEMMLowpMatrixMultiplyCore _mm_input_to_cell; - NEGEMMLowpOutputStage _input_to_cell_outstage; - NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_cell; - NEGEMMLowpOutputStage _recurrent_to_cell_outstage; - NEArithmeticAddition _accumulate_input_recurrent_modulation; - NEActivationLayer _cell_gate_tanh; - NEArithmeticSubtraction _input_gate_sub; - NEGEMMLowpMatrixMultiplyCore _mm_input_to_input; - NEGEMMLowpOutputStage _input_to_input_outstage; - NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_input; - NEGEMMLowpOutputStage _recurrent_to_input_outstage; - NEArithmeticAddition _accumulate_input_recurrent_input; - NEPixelWiseMultiplication _pixelwise_mul_cell_to_input; - NEGEMMLowpOutputStage _cell_to_input_outstage; - NEArithmeticAddition _accumulate_cell_input; - NEActivationLayer _input_gate_sigmoid; - NEPixelWiseMultiplication 
_pixelwise_mul_forget_cell; - NEPixelWiseMultiplication _pixelwise_mul_input_cell; - NEArithmeticAddition _add_forget_cell; - NEActivationLayer _cell_clip; - NEGEMMLowpMatrixMultiplyCore _mm_input_to_output; - NEGEMMLowpOutputStage _input_to_output_outstage; - NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_output; - NEGEMMLowpOutputStage _recurrent_to_output_outstage; - NEArithmeticAddition _accumulate_input_recurrent_output; - NEPixelWiseMultiplication _pixelwise_mul_cell_to_output; - NEGEMMLowpOutputStage _cell_to_output_outstage; - NEArithmeticAddition _accumulate_cell_to_output; - NEActivationLayer _output_gate_sigmoid; - NEActivationLayer _hidden_tanh; - NEPixelWiseMultiplication _pixelwise_mul_hidden; - NEGEMMLowpOutputStage _hidden_outstage; - NEGEMMLowpMatrixMultiplyCore _mm_projection; - NEGEMMLowpOutputStage _projection_outstage; - NEArithmeticAddition _accumulate_projection; - NEActivationLayer _projection_clip; + + NEDequantizationLayer _dequantize_input_to_forget_weights; + NEQuantizationLayer _quantize_input_to_forget_weights; + NETranspose _transpose_input_to_forget_weights; + NETranspose _transpose_input_to_cell_weights; + NETranspose _transpose_input_to_output_weights; + NETranspose _transpose_input_to_input_weights; + NETranspose _transpose_recurrent_to_forget_weights; + NETranspose _transpose_recurrent_to_cell_weights; + NETranspose _transpose_recurrent_to_output_weights; + NETranspose _transpose_recurrent_to_input_weights; + NETranspose _transpose_projection_weights; + std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _input_to_input_reduction; + std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _recurrent_to_input_reduction; + std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _input_to_forget_reduction; + std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _recurrent_to_forget_reduction; + std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _input_to_cell_reduction; + std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _recurrent_to_cell_reduction; + std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _input_to_output_reduction; + std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _recurrent_to_output_reduction; + std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _projection_reduction; + NEArithmeticAddition _projection_bias_add; + NEGEMMLowpMatrixMultiplyCore _mm_input_to_forget; + NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_forget; + NEPixelWiseMultiplication _pixelwise_mul_cell_to_forget; + NEGEMMLowpOutputStage _input_to_forget_outstage; + NEGEMMLowpOutputStage _recurrent_to_forget_outstage; + NEGEMMLowpOutputStage _cell_to_forget_outstage; + NEArithmeticAddition _accumulate_input_recurrent_forget; + NEArithmeticAddition _accumulate_cell_forget; + NEActivationLayer _forget_gate_sigmoid; + NEGEMMLowpMatrixMultiplyCore _mm_input_to_cell; + NEGEMMLowpOutputStage _input_to_cell_outstage; + NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_cell; + NEGEMMLowpOutputStage _recurrent_to_cell_outstage; + NEArithmeticAddition _accumulate_input_recurrent_modulation; + NEActivationLayer _cell_gate_tanh; + NEArithmeticSubtraction _input_gate_sub; + NEGEMMLowpMatrixMultiplyCore _mm_input_to_input; + NEGEMMLowpOutputStage _input_to_input_outstage; + NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_input; + NEGEMMLowpOutputStage _recurrent_to_input_outstage; + NEArithmeticAddition _accumulate_input_recurrent_input; + NEPixelWiseMultiplication 
_pixelwise_mul_cell_to_input; + NEGEMMLowpOutputStage _cell_to_input_outstage; + NEArithmeticAddition _accumulate_cell_input; + NEActivationLayer _input_gate_sigmoid; + NEPixelWiseMultiplication _pixelwise_mul_forget_cell; + NEPixelWiseMultiplication _pixelwise_mul_input_cell; + NEArithmeticAddition _add_forget_cell; + NEActivationLayer _cell_clip; + NEGEMMLowpMatrixMultiplyCore _mm_input_to_output; + NEGEMMLowpOutputStage _input_to_output_outstage; + NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_output; + NEGEMMLowpOutputStage _recurrent_to_output_outstage; + NEArithmeticAddition _accumulate_input_recurrent_output; + NEPixelWiseMultiplication _pixelwise_mul_cell_to_output; + NEGEMMLowpOutputStage _cell_to_output_outstage; + NEArithmeticAddition _accumulate_cell_to_output; + NEActivationLayer _output_gate_sigmoid; + NEActivationLayer _hidden_tanh; + NEPixelWiseMultiplication _pixelwise_mul_hidden; + NEGEMMLowpOutputStage _hidden_outstage; + NEGEMMLowpMatrixMultiplyCore _mm_projection; + NEGEMMLowpOutputStage _projection_outstage; + NEArithmeticAddition _accumulate_projection; + NEActivationLayer _projection_clip; TensorCopyKernel _projection_bias_copy; TensorCopyKernel _projection_output_to_accumulate_copy; @@ -317,19 +360,16 @@ private: NECopy _copy_output; // Tensor pointers - const ITensor *_input_to_input_weights - { - nullptr - }; - const ITensor *_recurrent_to_input_weights{ nullptr }; - const ITensor *_projection_bias{ nullptr }; - const ITensor *_input_to_forget_weights{ nullptr }; - const ITensor *_input_to_cell_weights{ nullptr }; - const ITensor *_input_to_output_weights{ nullptr }; - const ITensor *_recurrent_to_forget_weights{ nullptr }; - const ITensor *_recurrent_to_cell_weights{ nullptr }; - const ITensor *_recurrent_to_output_weights{ nullptr }; - const ITensor *_projection_weights{ nullptr }; + const ITensor *_input_to_input_weights{nullptr}; + const ITensor *_recurrent_to_input_weights{nullptr}; + const ITensor *_projection_bias{nullptr}; + const ITensor *_input_to_forget_weights{nullptr}; + const ITensor *_input_to_cell_weights{nullptr}; + const ITensor *_input_to_output_weights{nullptr}; + const ITensor *_recurrent_to_forget_weights{nullptr}; + const ITensor *_recurrent_to_cell_weights{nullptr}; + const ITensor *_recurrent_to_output_weights{nullptr}; + const ITensor *_projection_weights{nullptr}; std::array<const ITensor *, _layer_norm_count> _layer_norm_weights{}; std::array<const ITensor *, _layer_norm_count> _layer_norm_bias{}; @@ -364,63 +404,66 @@ private: return _layer_norms[getGateIndex(g)]; } - void configure_layer_norm(LayerNormGate g, const ITensor *in); + void configure_layer_norm(LayerNormGate g, const ITensor *in); static Status validate_layer_norm(const ITensorInfo &in, const ITensorInfo &weight, const ITensorInfo &bias); // Temporary tensors - Tensor _input_to_forget_weights_transposed{ nullptr }; - Tensor _input_to_cell_weights_transposed{ nullptr }; - Tensor _input_to_output_weights_transposed{ nullptr }; - Tensor _input_to_input_weights_transposed{ nullptr }; - Tensor _recurrent_to_forget_weights_transposed{ nullptr }; - Tensor _recurrent_to_cell_weights_transposed{ nullptr }; - Tensor _recurrent_to_output_weights_transposed{ nullptr }; - Tensor _recurrent_to_input_weights_transposed{ nullptr }; - Tensor _projection_weights_transposed{ nullptr }; - Tensor _input_to_input_eff_bias{ nullptr }; - Tensor _recurrent_to_input_eff_bias{ nullptr }; - Tensor _input_to_forget_eff_bias{ nullptr }; - Tensor _recurrent_to_forget_eff_bias{ nullptr }; - Tensor 
_input_to_cell_eff_bias{ nullptr }; - Tensor _recurrent_to_cell_eff_bias{ nullptr }; - Tensor _input_to_output_eff_bias{ nullptr }; - Tensor _recurrent_to_output_eff_bias{ nullptr }; - Tensor _projection_reduction_res{ nullptr }; - Tensor _projection_eff_bias{ nullptr }; - Tensor _mm_input_to_forget_res{ nullptr }; - Tensor _mm_recurrent_to_forget_res{ nullptr }; - Tensor _mul_cell_to_forget_res{ nullptr }; - Tensor _input_to_forget_outstage_res{ nullptr }; - Tensor _cell_to_forget_outstage_res{ nullptr }; - Tensor _recurrent_to_forget_outstage_res{ nullptr }; - Tensor _forget_gate{ nullptr }; - Tensor _mm_input_to_cell_res{ nullptr }; - Tensor _input_to_cell_outstage_res{ nullptr }; - Tensor _mm_recurrent_to_cell_res{ nullptr }; - Tensor _recurrent_to_cell_outstage_res{ nullptr }; - Tensor _cell_gate{ nullptr }; - Tensor _mul_input_cell_res{ nullptr }; - Tensor _mm_input_to_input_res{ nullptr }; - Tensor _input_to_input_outstage_res{ nullptr }; - Tensor _mm_recurrent_to_input_res{ nullptr }; - Tensor _mul_cell_to_input_res{ nullptr }; - Tensor _cell_to_input_outstage_res{ nullptr }; - Tensor _recurrent_to_input_outstage_res{ nullptr }; - Tensor _input_gate{ nullptr }; - Tensor _mm_input_to_output_res{ nullptr }; - Tensor _input_to_output_outstage_res{ nullptr }; - Tensor _mm_recurrent_to_output_res{ nullptr }; - Tensor _mul_cell_to_output_res{ nullptr }; - Tensor _cell_to_output_outstage_res{ nullptr }; - Tensor _recurrent_to_output_outstage_res{ nullptr }; - Tensor _output_gate{ nullptr }; - Tensor _hidden_mul_res{ nullptr }; - Tensor _hidden_gate{ nullptr }; - Tensor _mm_projection_res{ nullptr }; - Tensor _projection_outstage_res{ nullptr }; - Tensor _projection_out_res{ nullptr }; - Tensor _projection_accumulate_res{ nullptr }; - Tensor _ones{ nullptr }; + Tensor _input_to_forget_weights_f32{nullptr}; + Tensor _input_to_forget_weights_symm8{nullptr}; + + Tensor _input_to_forget_weights_transposed{nullptr}; + Tensor _input_to_cell_weights_transposed{nullptr}; + Tensor _input_to_output_weights_transposed{nullptr}; + Tensor _input_to_input_weights_transposed{nullptr}; + Tensor _recurrent_to_forget_weights_transposed{nullptr}; + Tensor _recurrent_to_cell_weights_transposed{nullptr}; + Tensor _recurrent_to_output_weights_transposed{nullptr}; + Tensor _recurrent_to_input_weights_transposed{nullptr}; + Tensor _projection_weights_transposed{nullptr}; + Tensor _input_to_input_eff_bias{nullptr}; + Tensor _recurrent_to_input_eff_bias{nullptr}; + Tensor _input_to_forget_eff_bias{nullptr}; + Tensor _recurrent_to_forget_eff_bias{nullptr}; + Tensor _input_to_cell_eff_bias{nullptr}; + Tensor _recurrent_to_cell_eff_bias{nullptr}; + Tensor _input_to_output_eff_bias{nullptr}; + Tensor _recurrent_to_output_eff_bias{nullptr}; + Tensor _projection_reduction_res{nullptr}; + Tensor _projection_eff_bias{nullptr}; + Tensor _mm_input_to_forget_res{nullptr}; + Tensor _mm_recurrent_to_forget_res{nullptr}; + Tensor _mul_cell_to_forget_res{nullptr}; + Tensor _input_to_forget_outstage_res{nullptr}; + Tensor _cell_to_forget_outstage_res{nullptr}; + Tensor _recurrent_to_forget_outstage_res{nullptr}; + Tensor _forget_gate{nullptr}; + Tensor _mm_input_to_cell_res{nullptr}; + Tensor _input_to_cell_outstage_res{nullptr}; + Tensor _mm_recurrent_to_cell_res{nullptr}; + Tensor _recurrent_to_cell_outstage_res{nullptr}; + Tensor _cell_gate{nullptr}; + Tensor _mul_input_cell_res{nullptr}; + Tensor _mm_input_to_input_res{nullptr}; + Tensor _input_to_input_outstage_res{nullptr}; + Tensor _mm_recurrent_to_input_res{nullptr}; + 
Tensor _mul_cell_to_input_res{nullptr}; + Tensor _cell_to_input_outstage_res{nullptr}; + Tensor _recurrent_to_input_outstage_res{nullptr}; + Tensor _input_gate{nullptr}; + Tensor _mm_input_to_output_res{nullptr}; + Tensor _input_to_output_outstage_res{nullptr}; + Tensor _mm_recurrent_to_output_res{nullptr}; + Tensor _mul_cell_to_output_res{nullptr}; + Tensor _cell_to_output_outstage_res{nullptr}; + Tensor _recurrent_to_output_outstage_res{nullptr}; + Tensor _output_gate{nullptr}; + Tensor _hidden_mul_res{nullptr}; + Tensor _hidden_gate{nullptr}; + Tensor _mm_projection_res{nullptr}; + Tensor _projection_outstage_res{nullptr}; + Tensor _projection_out_res{nullptr}; + Tensor _projection_accumulate_res{nullptr}; + Tensor _ones{nullptr}; std::array<Tensor, _layer_norm_count> _layer_norm_output{}; inline Tensor &get_layer_norm_output(LayerNormGate g) @@ -428,14 +471,15 @@ { return _layer_norm_output[getGateIndex(g)]; } - bool _is_prepared{ false }; - bool _has_cifg{ false }; - bool _has_cell_clipping{ false }; - bool _has_projection{ false }; - bool _has_projection_clipping{ false }; - bool _has_peephole{ false }; - bool _has_layer_norm{ false }; - bool _projection_tensor_copy_required{ false }; + bool _is_prepared{false}; + bool _has_cifg{false}; + bool _has_cell_clipping{false}; + bool _has_projection{false}; + bool _has_projection_clipping{false}; + bool _has_peephole{false}; + bool _has_layer_norm{false}; + bool _projection_tensor_copy_required{false}; + bool _convert_input_to_forget_weights_to_qsymm8{false}; }; } // namespace arm_compute #endif /* ARM_COMPUTE_NEQLSTMLAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NEQuantizationLayer.h b/arm_compute/runtime/NEON/functions/NEQuantizationLayer.h index 8b0532beea..7bf97e28a5 100644 --- a/arm_compute/runtime/NEON/functions/NEQuantizationLayer.h +++ b/arm_compute/runtime/NEON/functions/NEQuantizationLayer.h @@ -24,28 +24,45 @@ #ifndef ARM_COMPUTE_NEQUANTIZATIONLAYER_H #define ARM_COMPUTE_NEQUANTIZATIONLAYER_H +#include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/IRuntimeContext.h" -#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" - -#include "arm_compute/core/Types.h" +#include <memory> namespace arm_compute { class ITensor; class ITensorInfo; -/** Basic function to simulate a quantization layer. This function calls the following Neon kernels: - * - * - * -# @ref NEQuantizationLayerKernel - * - */ -class NEQuantizationLayer : public INESimpleFunctionNoBorder +/** Basic function to run a quantization layer using @ref cpu::CpuQuantize */ +class NEQuantizationLayer : public IFunction { public: + /** Default Constructor */ + NEQuantizationLayer(); + /** Default Destructor */ + ~NEQuantizationLayer(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEQuantizationLayer(const NEQuantizationLayer &) = delete; + /** Default move constructor */ + NEQuantizationLayer(NEQuantizationLayer &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEQuantizationLayer &operator=(const NEQuantizationLayer &) = delete; + /** Default move assignment operator */ + NEQuantizationLayer &operator=(NEQuantizationLayer &&) = default; /** Set the input and output tensors. * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:------------------|:--------------------------------------| + * |QASYMM8 |QASYMM8, QASYMM8_SIGNED, QASYMM16 | + * |QASYMM8_SIGNED |QASYMM8, QASYMM8_SIGNED, QASYMM16 | + * |F16 |QASYMM8, QASYMM8_SIGNED, QASYMM16 | + * |F32 |QASYMM8, QASYMM8_SIGNED, QASYMM16 | + * * @param[in] input Source tensor. Dimensions above the third are interpreted as batches. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16. * @param[out] output Destination tensor with the same dimensions as @p input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16 @@ -58,6 +75,13 @@ * @return a status */ static Status validate(const ITensorInfo *input, const ITensorInfo *output); + + // Inherited methods overridden: + void run() override; + +private: + struct Impl; + std::unique_ptr<Impl> _impl; }; } // namespace arm_compute #endif /* ARM_COMPUTE_NEQUANTIZATIONLAYER_H */
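Since the quantization function above now exposes only validate/configure/run, a sketch of the usual validate-then-configure pattern may help; the 1D shape and the QASYMM8 scale/offset are illustrative assumptions:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/NEON/functions/NEQuantizationLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        Tensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(256U), 1, DataType::F32));

        TensorInfo dst_info(TensorShape(256U), 1, DataType::QASYMM8);
        dst_info.set_quantization_info(QuantizationInfo(0.1f, 0)); // real = 0.1 * (q - 0)
        dst.allocator()->init(dst_info);

        const Status st = NEQuantizationLayer::validate(src.info(), dst.info());
        if (st.error_code() != ErrorCode::OK)
        {
            return 1; // unsupported configuration
        }

        NEQuantizationLayer quant;
        quant.configure(&src, &dst);

        src.allocator()->allocate();
        dst.allocator()->allocate();
        quant.run();
        return 0;
    }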
diff --git a/arm_compute/runtime/NEON/functions/NERNNLayer.h b/arm_compute/runtime/NEON/functions/NERNNLayer.h index 66f7f2ea3f..af7f464ac9 100644 --- a/arm_compute/runtime/NEON/functions/NERNNLayer.h +++ b/arm_compute/runtime/NEON/functions/NERNNLayer.h @@ -54,6 +54,16 @@ public: ~NERNNLayer(); /** Initialize the function * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |src2 |src3 |dst0 |dst1 | + * |:------|:------|:------|:------|:------|:------| + * |F16 |F16 |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 |F32 |F32 | + * * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data types supported: F16/F32 * @param[in] weights Weights tensor of shape [input_size, num_units] that multiplies the input. Data types supported: Same as @p input * @param[in] recurrent_weights Weights tensor of shape [num_units, num_units] that multiplies the current 'state'. Data types supported: Same as @p input @@ -62,7 +72,13 @@ * @param[in,out] hidden_state Output tensor of shape [num_units, batch_size]. Data types supported: Same as @p input * @param[in] info Activation layer parameter. */ - void configure(const ITensor *input, const ITensor *weights, const ITensor *recurrent_weights, const ITensor *bias, ITensor *hidden_state, ITensor *output, ActivationLayerInfo &info); + void configure(const ITensor *input, + const ITensor *weights, + const ITensor *recurrent_weights, + const ITensor *bias, + ITensor *hidden_state, + ITensor *output, + ActivationLayerInfo &info); /** Static function to check if given info will lead to a valid configuration of @ref NERNNLayer * * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size].
Data types supported: F16/F32 @@ -75,7 +91,12 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *recurrent_weights, const ITensorInfo *bias, const ITensorInfo *hidden_state, const ITensorInfo *output, + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *recurrent_weights, + const ITensorInfo *bias, + const ITensorInfo *hidden_state, + const ITensorInfo *output, const ActivationLayerInfo &info); // Inherited methods overridden: diff --git a/arm_compute/runtime/NEON/functions/NEROIAlignLayer.h b/arm_compute/runtime/NEON/functions/NEROIAlignLayer.h index 9d934588fb..b06ebe899d 100644 --- a/arm_compute/runtime/NEON/functions/NEROIAlignLayer.h +++ b/arm_compute/runtime/NEON/functions/NEROIAlignLayer.h @@ -32,21 +32,27 @@ namespace arm_compute class ITensor; class ITensorInfo; -/** Basic function to run @ref NEROIAlignLayerKernel. - * - * This function calls the following Neon kernels: - * -# @ref NEROIAlignLayerKernel - * - */ +/** Basic function to run @ref NEROIAlignLayerKernel. */ class NEROIAlignLayer : public INESimpleFunctionNoBorder { public: /** Set the input and output tensors. * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |dst | + * |:--------------|:--------------|:--------------| + * |F16 |F16 |F16 | + * |F32 |F32 |F32 | + * |QASYMM8 |QASYMM16 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM16 |QASYMM8_SIGNED | + * * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. * @param[in] rois ROIs tensor, it is a 2D tensor of size [5, N] (where N is the number of ROIs) containing top left and bottom right corner * as coordinate of an image and batch_id of ROI [ batch_id, x1, y1, x2, y2 ]. - * Data types supported: QASYMM16 with scale of 0.125 and 0 offset if @p input is QASYMM8, otherwise same as @p input + * Data types supported: QASYMM16 with scale of 0.125 and 0 offset if @p input is QASYMM8/QASYMM8_SIGNED, otherwise same as @p input * @param[out] output Destination tensor. Data types supported: Same as @p input. * @param[in] pool_info Contains pooling operation information described in @ref ROIPoolingLayerInfo. * @@ -59,7 +65,7 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref NEROIAlignLayerKernel * * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[in] rois ROIs tensor info. Data types supported: QASYMM16 with scale of 0.125 and 0 offset if @p input is QASYMM8, + * @param[in] rois ROIs tensor info. Data types supported: QASYMM16 with scale of 0.125 and 0 offset if @p input is QASYMM8/QASYMM8_SIGNED, * otherwise same as @p input * @param[in] output Destination tensor info. Data types supported: Same as @p input. * @param[in] pool_info Contains pooling operation information described in @ref ROIPoolingLayerInfo. 
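A hedged sketch of the NEROIAlignLayer interface documented above (includes and namespace as in the NEQuantizationLayer sketch, plus this function's header; shapes, pooled size and spatial scale are made-up values):

Tensor feat, rois, out;
feat.allocator()->init(TensorInfo(TensorShape(16U, 16U, 64U), 1, DataType::F32));  // [W, H, C]
rois.allocator()->init(TensorInfo(TensorShape(5U, 8U), 1, DataType::F32));         // 8 ROIs of [batch_id, x1, y1, x2, y2]
out.allocator()->init(TensorInfo(TensorShape(7U, 7U, 64U, 8U), 1, DataType::F32)); // one 7x7xC block per ROI

NEROIAlignLayer roi_align;
roi_align.configure(&feat, &rois, &out, ROIPoolingLayerInfo(7U, 7U, 0.0625f /* spatial scale */));
// allocate the tensors and call roi_align.run() as usual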
@@ -71,7 +77,10 @@ public: * * @return a Status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *rois, + ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info); }; } // namespace arm_compute #endif /* ARM_COMPUTE_NEROIALIGNLAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h b/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h index a912669f57..929111ad4b 100644 --- a/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h +++ b/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h @@ -24,23 +24,19 @@ #ifndef ARM_COMPUTE_NEROIPOOLINGLAYER_H #define ARM_COMPUTE_NEROIPOOLINGLAYER_H +#include "arm_compute/core/IArray.h" #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/core/IArray.h" #include <memory> namespace arm_compute { class ITensor; +class ITensorInfo; class NEROIPoolingLayerKernel; class ROIPoolingLayerInfo; -/** Basic function to run @ref NEROIPoolingLayerKernel. - * - * This function calls the following Neon kernels: - * -# @ref NEROIPoolingLayerKernel - * - */ +/** Basic function to run @ref NEROIPoolingLayerKernel. */ class NEROIPoolingLayer : public IFunction { public: @@ -58,7 +54,16 @@ public: ~NEROIPoolingLayer(); /** Set the input and output tensors. * - * @param[in] input Source tensor. Data types supported: F32. + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |dst | + * |:--------------|:--------------|:--------------| + * |F32 |U16 |F32 | + * |QASYMM8 |U16 |QASYMM8 | + * + * @param[in] input Source tensor. Data types supported: QASYMM8/F32 * @param[in] rois ROIs tensor, it is a 2D tensor of size [5, N] (where N is the number of ROIs) containing top left and bottom right corner * as coordinate of an image and batch_id of ROI [ batch_id, x1, y1, x2, y2 ]. Data types supported: U16 * @param[out] output Destination tensor. Data types supported: Same as @p input. @@ -69,11 +74,30 @@ public: * @note The z dimensions of @p output tensor and @p input tensor must be the same. * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array. */ - void configure(const ITensor *input, const ITensor *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info); + void + configure(const ITensor *input, const ITensor *rois, const ITensor *output, const ROIPoolingLayerInfo &pool_info); // Inherited methods overridden: void run() override; + /** Static function to check if given info will lead to a valid configuration of @ref NEROIPoolingLayerKernel + * + * @param[in] input Source tensor info. Data types supported: QASYMM8/F32. + * @param[in] rois TensorInfo for rois tensor which is a 2D tensor of size [5,N] (where 5 is the number ROIs). Data types supported: U16 + * @param[in] output Destination tensor info. Data types supported: Same as @p input. + * @param[in] pool_info Contains pooling operation information described in @ref ROIPoolingLayerInfo. + * + * @note The x and y dimensions of @p output tensor must be the same as that specified by @p pool_info 's pooled + * width and pooled height. + * @note The z dimensions of @p output tensor and @p input tensor must be the same. + * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array. 
+ * @return a Status + */ + static Status validate(const ITensorInfo *input, + const ITensorInfo *rois, + const ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info); + private: std::unique_ptr<NEROIPoolingLayerKernel> _roi_kernel; }; diff --git a/arm_compute/runtime/NEON/functions/NERange.h b/arm_compute/runtime/NEON/functions/NERange.h index 28976001d7..609456a4ef 100644 --- a/arm_compute/runtime/NEON/functions/NERange.h +++ b/arm_compute/runtime/NEON/functions/NERange.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -26,6 +26,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" + #include <memory> namespace arm_compute @@ -57,6 +58,21 @@ public: ~NERange(); /** Initialize the kernel's start, end, step and output tensor. * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |dst | + * |:---------| + * |U8 | + * |S8 | + * |U16 | + * |S16 | + * |U32 | + * |S32 | + * |F16 | + * |F32 | + * * @param[out] output Output tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32. * @param[in] start The starting value of the sequence. * @param[in] end The ending (not including) value of the sequence. diff --git a/arm_compute/runtime/NEON/functions/NEReduceMean.h b/arm_compute/runtime/NEON/functions/NEReduceMean.h index 89cd09812b..5b8d8cdf2b 100644 --- a/arm_compute/runtime/NEON/functions/NEReduceMean.h +++ b/arm_compute/runtime/NEON/functions/NEReduceMean.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,12 +24,9 @@ #ifndef ARM_COMPUTE_NEON_REDUCE_MEAN_H #define ARM_COMPUTE_NEON_REDUCE_MEAN_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/MemoryGroup.h" -#include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h" -#include "arm_compute/runtime/NEON/functions/NEQuantizationLayer.h" #include "arm_compute/runtime/NEON/functions/NEReductionOperation.h" #include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h" #include "arm_compute/runtime/Tensor.h" @@ -54,6 +51,17 @@ public: ~NEReduceMean(); /** Configure kernel * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |QASYMM8 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED | + * |F16 |F16 | + * |F32 |F32 | + * * @note Supported tensor rank: up to 4 * * @param[in] input Source tensor. 
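Back to NERange above: it only needs the output tensor plus three scalars, assuming the (output, start, end, step) overload its parameter docs describe; the values here are illustrative:

Tensor seq;
seq.allocator()->init(TensorInfo(TensorShape(10U), 1, DataType::F32));

NERange range;
range.configure(&seq, 0.f /* start */, 10.f /* end, exclusive */, 1.f /* step */);
seq.allocator()->allocate();
range.run(); // seq now holds 0, 1, ..., 9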
Data type supported: QASYMM8_SIGNED/QASYMM8/F16/F32 @@ -72,7 +80,8 @@ public: * * @return A status */ - static Status validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output); + static Status + validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output); // Inherited methods overridden: void run() override; @@ -82,13 +91,8 @@ private: std::vector<NEReductionOperation> _reduction_kernels; std::vector<Tensor> _reduced_outs; NEReshapeLayer _reshape; - NEDequantizationLayer _dequant; - NEQuantizationLayer _requant; int _reduction_ops; bool _keep_dims; - bool _do_requant; - Tensor _input_no_quant; - Tensor _output_no_quant; }; } // namespace arm_compute #endif /* ARM_COMPUTE_NEON_REDUCE_MEAN_H */ diff --git a/arm_compute/runtime/NEON/functions/NEReductionOperation.h b/arm_compute/runtime/NEON/functions/NEReductionOperation.h index f30cc810f1..f5391a6d0e 100644 --- a/arm_compute/runtime/NEON/functions/NEReductionOperation.h +++ b/arm_compute/runtime/NEON/functions/NEReductionOperation.h @@ -25,9 +25,9 @@ #define ARM_COMPUTE_NEREDUCTIONOPERATION_H #include "arm_compute/runtime/IFunction.h" - #include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h" #include "arm_compute/runtime/Tensor.h" + #include <memory> namespace arm_compute @@ -35,7 +35,7 @@ namespace arm_compute class ITensor; class NEReductionOperationKernel; -/** Basic function to simulate a reduction operation. This function calls the following Neon kernels: +/** Basic function to simulate a reduction operation. This function calls the following kernels: * * -# @ref NEReshapeLayer * -# @ref NEReductionOperationKernel @@ -58,7 +58,19 @@ public: ~NEReductionOperation(); /** Set the input and output tensors. * - * @param[in, out] input Source tensor. Data type supported: QASYMM8_SIGNED/QASYMM8/F16/F32/S32. Data layouts supported: NCHW. (Written to only for border_size != 0) + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |QASYMM8 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED | + * |F16 |F16 | + * |F32 |F32 | + * |S32 |S32 | + * + * @param[in, out] input Source tensor. Data type supported: QASYMM8_SIGNED/QASYMM8/F16/F32/S32. (Written to only for border_size != 0) * @param[out] output Destination tensor. Data types and data layouts supported: same as @p input. * @param[in] axis Dimension along which to reduce. Supported reduction axis : 0 * @param[in] op Reduction operation to perform. @@ -68,7 +80,7 @@ public: /** Static function to check if given info will lead to a valid configuration of @ref NEReductionOperation. * - * @param[in] input Source tensor info. Data type supported: QASYMM8_SIGNED/QASYMM8/F16/F32/S32. Data layouts supported: NCHW. + * @param[in] input Source tensor info. Data type supported: QASYMM8_SIGNED/QASYMM8/F16/F32/S32. * @param[in] output Destination tensor info. Data types and data layouts supported: same as @p input. * @param[in] axis Dimension along which to reduce. Supported reduction axis : 0 * @param[in] op Reduction operation to perform. 
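A sketch of the NEReductionOperation interface above, summing along axis 0 with keep_dims (shape and operation chosen for illustration):

Tensor src, dst;
src.allocator()->init(TensorInfo(TensorShape(128U, 4U), 1, DataType::F32));
dst.allocator()->init(TensorInfo(TensorShape(1U, 4U), 1, DataType::F32)); // axis 0 collapses to size 1 with keep_dims

NEReductionOperation reduce;
if(bool(NEReductionOperation::validate(src.info(), dst.info(), 0U, ReductionOperation::SUM, true)))
{
    reduce.configure(&src, &dst, 0U, ReductionOperation::SUM, true);
}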
@@ -76,7 +88,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims = true); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + unsigned int axis, + ReductionOperation op, + bool keep_dims = true); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NERemap.h b/arm_compute/runtime/NEON/functions/NERemap.h deleted file mode 100644 index 84d0f2ee92..0000000000 --- a/arm_compute/runtime/NEON/functions/NERemap.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_NEREMAP_H -#define ARM_COMPUTE_NEREMAP_H - -#include "arm_compute/core/Types.h" -#include "arm_compute/runtime/NEON/INESimpleFunction.h" -#include "arm_compute/runtime/Tensor.h" - -#include <cstdint> - -namespace arm_compute -{ -class ITensor; - -/** Basic function to execute remap. This function calls the following Neon kernels: - * - * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE) - * -# @ref NERemapKernel - * - * @deprecated This function is deprecated and is intended to be removed in 21.05 release - * - */ -class NERemap : public INESimpleFunction -{ -public: - /** Initialise the function's sources, destination, interpolation policy and border mode. - * - * @param[in, out] input Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED) - * @param[in] map_x Map for X coordinates. Data type supported: F32. - * @param[in] map_y Map for Y coordinates. Data type supported: F32. - * @param[out] output Output tensor. Data type supported: U8. - * @param[in] policy Interpolation policy to use. Only NEAREST and BILINEAR are supported. - * @param[in] border_mode Border mode to use on the input tensor. - * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. 
- * - */ - void configure(ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output, - InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value = 0); -}; -} -#endif /*ARM_COMPUTE_NEREMAP_H */ diff --git a/arm_compute/runtime/NEON/functions/NEReorderLayer.h b/arm_compute/runtime/NEON/functions/NEReorderLayer.h new file mode 100644 index 0000000000..e3fa7b9c16 --- /dev/null +++ b/arm_compute/runtime/NEON/functions/NEReorderLayer.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(__aarch64__) + +#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEREORDERLAYER +#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEREORDERLAYER + +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" + +namespace arm_compute +{ +class ITensor; +class ITensorInfo; +class NEReorderKernel; +/** Function to compute blocked reorder. */ +class NEReorderLayer : public IFunction +{ +public: + /** Default constructor */ + NEReorderLayer(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEReorderLayer(const NEReorderLayer &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEReorderLayer &operator=(const NEReorderLayer &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEReorderLayer(NEReorderLayer &&) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEReorderLayer &operator=(NEReorderLayer &&) = delete; + /** Default destructor */ + ~NEReorderLayer(); + /** Set the input and output tensors. + * + * Valid data layouts: + * - NCHW + * + * Valid data type configurations: + * |src |dst | + * |:--------|:---------| + * |F32 |F32 | + * + * @param[in] input Source tensor. Data type supported: F32. Data layouts supported: NCHW. + * @param[out] output Destination with the same dimensions, data type, data layout as @p input + * except last dimension of data layout which needs to be multiple of blocking parameter ksize + * @param[in] input_wf WeightFormat of input. + * @param[in] output_wf WeightFormat of output. 
+ */ + void configure(const ITensor *input, + ITensor *output, + arm_compute::WeightFormat input_wf, + arm_compute::WeightFormat output_wf); + + /** Static function to check if given info will lead to a valid configuration of @ref NEReorderLayer + * + * Similar to @ref NEReorderLayer::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + arm_compute::WeightFormat input_wf, + arm_compute::WeightFormat output_wf); + + // Inherited methods overridden: + void run() override; + +private: + std::unique_ptr<NEReorderKernel> _reorder_kernel; /**< Reorder layer kernel */ +}; +} // namespace arm_compute +#endif /* ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEREORDERLAYER */ + +#endif // defined(__aarch64__) diff --git a/arm_compute/runtime/NEON/functions/NEReorgLayer.h b/arm_compute/runtime/NEON/functions/NEReorgLayer.h index f76d1d252c..0a7d824d10 100644 --- a/arm_compute/runtime/NEON/functions/NEReorgLayer.h +++ b/arm_compute/runtime/NEON/functions/NEReorgLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -39,6 +39,15 @@ class NEReorgLayer : public INESimpleFunctionNoBorder public: /** Initialise the kernel's inputs and outputs * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |All |All | + * * @param[in] input First tensor input. Data type supported: All * @param[out] output Output tensor. Data type supported: Same as @p input * @param[in] stride Stride to be used during data re-organization diff --git a/arm_compute/runtime/NEON/functions/NEReshapeLayer.h b/arm_compute/runtime/NEON/functions/NEReshapeLayer.h index b8c0a841bc..3e6e33f797 100644 --- a/arm_compute/runtime/NEON/functions/NEReshapeLayer.h +++ b/arm_compute/runtime/NEON/functions/NEReshapeLayer.h @@ -52,6 +52,14 @@ public: NEReshapeLayer &operator=(NEReshapeLayer &&); /** Initialise the kernel's inputs and outputs * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:------|:------| + * |All |All | + * * @param[in] input Input tensor. Data type supported: All * @param[out] output Output tensor. Data type supported: Same as @p input */ diff --git a/arm_compute/runtime/NEON/functions/NEReverse.h b/arm_compute/runtime/NEON/functions/NEReverse.h index 2048dafcb5..e03e415068 100644 --- a/arm_compute/runtime/NEON/functions/NEReverse.h +++ b/arm_compute/runtime/NEON/functions/NEReverse.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,12 +21,11 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_NEREVERSE_H -#define ARM_COMPUTE_NEREVERSE_H - -#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" +#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEREVERSE_H +#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEREVERSE_H #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" namespace arm_compute { @@ -39,20 +38,39 @@ class NEReverse : public INESimpleFunctionNoBorder public: /** Initialize the function * - * @param[in] input Input tensor. Data types supported: All - * @param[out] output Output tensor. Data type supported: Same as @p input - * @param[in] axis Axis tensor. Contains the indices of the dimensions to reverse. 
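Pulling the new NEReorderLayer above together, a guarded sketch of a blocked weight reorder; the OHWI -> OHWIo4 pair and the 64x64 shape are assumptions for illustration, with the last dimension kept a multiple of the block size as the docs require:

#if defined(__aarch64__)
Tensor weights, blocked;
weights.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::F32));
blocked.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::F32));

NEReorderLayer reorder;
if(bool(NEReorderLayer::validate(weights.info(), blocked.info(), WeightFormat::OHWI, WeightFormat::OHWIo4)))
{
    reorder.configure(&weights, &blocked, WeightFormat::OHWI, WeightFormat::OHWIo4);
    // allocate both tensors, then reorder.run()
}
#endif // defined(__aarch64__)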
Data type supported: U32 + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |dst | + * |:--------------|:--------------|:--------------| + * |All |U32, S32 |All | + * + * @param[in] input Input tensor. Data types supported: All + * @param[out] output Output tensor. Data type supported: Same as @p input + * @param[in] axis Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32/S32 + * @param[in] use_inverted_axis Reverse ACL axis indices convention, if true, (inverted)axis = (tensor_rank - 1) - axis + * + * @note The value of each axis should be between [-rank, rank) + * @note If there are duplicate values in the tensor, the subsequent axis values are ignored. e.g. an array of [2, 2] has the same effects as [2]. + * + * @deprecated Support for U32 in axis tensor will be removed in 24.02 release + * */ - void configure(const ITensor *input, ITensor *output, const ITensor *axis); + void configure(const ITensor *input, ITensor *output, const ITensor *axis, const bool use_inverted_axis = false); /** Static function to check if given info will lead to a valid configuration of @ref NEReverseKernel * - * @param[in] input Input tensor info. Data types supported: All - * @param[in] output Output tensor info. Data type supported: Same as @p input - * @param[in] axis Axis tensor info. Contains the indices of the dimensions to reverse. Data type supported: U32 + * @param[in] input Input tensor info. Data types supported: All + * @param[in] output Output tensor info. Data type supported: Same as @p input + * @param[in] axis Axis tensor info. Contains the indices of the dimensions to reverse. Data type supported: U32/S32 + * @param[in] use_inverted_axis Reverse ACL axis indices convention, if true, (inverted)axis = (tensor_rank - 1) - axis * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *axis, + const bool use_inverted_axis = false); }; } // namespace arm_compute -#endif /* ARM_COMPUTE_NEREVERSE_H */ +#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEREVERSE_H diff --git a/arm_compute/runtime/NEON/functions/NEScale.h b/arm_compute/runtime/NEON/functions/NEScale.h index fceda83510..72dfa3bda4 100644 --- a/arm_compute/runtime/NEON/functions/NEScale.h +++ b/arm_compute/runtime/NEON/functions/NEScale.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 Arm Limited. + * Copyright (c) 2016-2022 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -26,32 +26,58 @@ #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/Types.h" -#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" -#include "arm_compute/runtime/Tensor.h" +#include "arm_compute/runtime/IFunction.h" + +#include <memory> namespace arm_compute { class ITensor; +class ITensorInfo; -/** Basic function to run @ref NEScaleKernel */ -class NEScale : public INESimpleFunctionNoBorder +/** Basic function to compute Scale */ +class NEScale : public IFunction { public: - /** Constructor - * - * Initialize NEScale - */ + /** Constructor */ NEScale(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEScale(const NEScale &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEScale(NEScale &&) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEScale &operator=(const NEScale &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEScale &operator=(NEScale &&) = delete; + /** Destructor */ + ~NEScale(); /** Initialize the function's source, destination, interpolation type and border_mode. * - * @param[in, out] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED) + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |QASYMM8 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED | + * |F16 |F16 | + * |F32 |F32 | + * |U8 |U8 | + * |S8 |S8 | + * |S16 |S16 | + * + * @param[in, out] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/U8/S8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED) * @param[out] output Destination tensor. Data type supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane. * @param[in] info @ref ScaleKernelInfo to be used for configuration + * + * @note Using S8 data type only supports NHWC, @p border_mode Replicate, and @p policy Bilinear */ void configure(ITensor *input, ITensor *output, const ScaleKernelInfo &info); /** Static function to check if given info will lead to a valid configuration of @ref NEScale * - * @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED) + * @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/U8/S8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED) * @param[in] output Destination tensor. Data type supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane. 
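A sketch of the pImpl-based NEScale above doing a 2x bilinear upscale (shapes and kernel-info fields are illustrative; note the S8 restrictions called out in the docs):

Tensor src, dst;
src.allocator()->init(TensorInfo(TensorShape(32U, 32U, 3U), 1, DataType::F32));
dst.allocator()->init(TensorInfo(TensorShape(64U, 64U, 3U), 1, DataType::F32)); // only the XY-plane is resized

NEScale scale;
scale.configure(&src, &dst, ScaleKernelInfo{ InterpolationPolicy::BILINEAR, BorderMode::REPLICATE });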
* @param[in] info @ref ScaleKernelInfo to be used for validation * @@ -59,10 +85,12 @@ public: */ static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ScaleKernelInfo &info); + // Inherited methods overridden: + void run() override; + private: - Tensor _offsets; /**< Offset to access the element with NEAREST interpolation or the top-left element with BILINEAR interpolation in the input tensor */ - Tensor _dx; /**< Element's distance between the X real coordinate and the smallest X following integer */ - Tensor _dy; /**< Element's distance between the Y real coordinate and the smallest Y following integer */ + struct Impl; + std::unique_ptr<Impl> _impl; }; } // namespace arm_compute #endif /*ARM_COMPUTE_NESCALEIMAGE_H */ diff --git a/arm_compute/runtime/NEON/functions/NESelect.h b/arm_compute/runtime/NEON/functions/NESelect.h index c66fbfa7d4..c8e5a204dd 100644 --- a/arm_compute/runtime/NEON/functions/NESelect.h +++ b/arm_compute/runtime/NEON/functions/NESelect.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -39,6 +39,14 @@ class NESelect : public INESimpleFunctionNoBorder public: /** Initialise the kernel's inputs and output. * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:--------------|:------|:--------------| + * |U8 |All |All |All | + * * @param[in] c Condition input tensor. Data types supported: U8. * @param[in] x First input tensor. Data types supported: All. * @param[in] y Second input tensor. Data types supported: Same as @p x diff --git a/arm_compute/runtime/NEON/functions/NESlice.h b/arm_compute/runtime/NEON/functions/NESlice.h index 28628778cb..70a688d3b0 100644 --- a/arm_compute/runtime/NEON/functions/NESlice.h +++ b/arm_compute/runtime/NEON/functions/NESlice.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -32,25 +32,44 @@ namespace arm_compute // Forward Declarations class ITensor; -namespace experimental -{ /** Basic function to perform tensor slicing */ -class NESlice : public INEOperator +class NESlice : public IFunction { public: + /** Default Constructor */ + NESlice(); + /** Default Destructor */ + ~NESlice(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NESlice(const NESlice &) = delete; + /** Default move constructor */ + NESlice(NESlice &&); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NESlice &operator=(const NESlice &) = delete; + /** Default move assignment operator */ + NESlice &operator=(NESlice &&); + /** Configure kernel * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:------|:------| + * |All |All | + * * @note Supported tensor rank: up to 4 * @note Start indices must be non-negative. 0 <= starts[i] * @note End coordinates can be negative, which represents the number of elements before the end of that dimension. * @note End indices are not inclusive unless negative. * - * @param[in] input Source tensor info. Data type supported: All - * @param[out] output Destination tensor info. Data type supported: Same as @p input + * @param[in] input Source tensor. Data type supported: All + * @param[out] output Destination tensor. 
Data type supported: Same as @p input * @param[in] starts The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input). * @param[in] ends The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input). */ - void configure(const ITensorInfo *input, ITensorInfo *output, const Coordinates &starts, const Coordinates &ends); + void configure(const ITensor *input, ITensor *output, const Coordinates &starts, const Coordinates &ends); /** Static function to check if given info will lead to a valid configuration of @ref NESlice * @@ -66,27 +85,23 @@ public: * * @return A status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends); + static Status + validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends); + + // Inherited methods overridden: + void run() override; + +private: + struct Impl; + std::unique_ptr<Impl> _impl; }; -} // namespace experimental +namespace experimental +{ /** Basic function to perform tensor slicing */ -class NESlice : public IFunction +class NESlice : public INEOperator { public: - /** Default Constructor */ - NESlice(); - /** Default Destructor */ - ~NESlice(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NESlice(const NESlice &) = delete; - /** Default move constructor */ - NESlice(NESlice &&); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NESlice &operator=(const NESlice &) = delete; - /** Default move assignment operator */ - NESlice &operator=(NESlice &&); - /** Configure kernel * * @note Supported tensor rank: up to 4 @@ -94,12 +109,12 @@ public: * @note End coordinates can be negative, which represents the number of elements before the end of that dimension. * @note End indices are not inclusive unless negative. * - * @param[in] input Source tensor. Data type supported: All - * @param[out] output Destination tensor. Data type supported: Same as @p input + * @param[in] input Source tensor info. Data type supported: All + * @param[out] output Destination tensor info. Data type supported: Same as @p input * @param[in] starts The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input). * @param[in] ends The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input). 
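The IFunction flavour of NESlice above can be exercised like this (coordinates are illustrative; ends are exclusive unless negative, and configure infers the output shape):

Tensor src, dst;
src.allocator()->init(TensorInfo(TensorShape(8U, 8U), 1, DataType::F32));

NESlice slice;
slice.configure(&src, &dst, Coordinates(2, 2), Coordinates(6, 6)); // a 4x4 region
src.allocator()->allocate();
dst.allocator()->allocate();
slice.run();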
*/ - void configure(const ITensor *input, ITensor *output, const Coordinates &starts, const Coordinates &ends); + void configure(const ITensorInfo *input, ITensorInfo *output, const Coordinates &starts, const Coordinates &ends); /** Static function to check if given info will lead to a valid configuration of @ref NESlice * @@ -115,14 +130,9 @@ public: * * @return A status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends); - - // Inherited methods overridden: - void run() override; - -private: - struct Impl; - std::unique_ptr<Impl> _impl; + static Status + validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends); }; +} // namespace experimental } // namespace arm_compute #endif /* ARM_COMPUTE_NE_SLICE_H */ diff --git a/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h b/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h index 8a2ae10129..1787de6237 100644 --- a/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h +++ b/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h @@ -24,8 +24,10 @@ #ifndef ARM_COMPUTE_NESOFTMAXLAYER_H #define ARM_COMPUTE_NESOFTMAXLAYER_H +#include "arm_compute/core/Error.h" #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/IMemoryManager.h" + #include <memory> namespace arm_compute @@ -52,6 +54,17 @@ public: ~NESoftmaxLayerGeneric(); /** Set the input and output tensors. * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |QASYMM8 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED | + * |F16 |F16 | + * |F32 |F32 | + * * @param[in,out] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. If the width is not a * multiple of the internal processing block size, @ref NEFillBorder replicates the * last value of each row to the nearest multiple. @@ -77,7 +90,6 @@ public: void run() override; private: - MemoryGroup _memory_group; struct Impl; std::unique_ptr<Impl> _impl; }; diff --git a/arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h b/arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h index aeeaefcc38..5dee61a4a8 100644 --- a/arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h +++ b/arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h @@ -24,9 +24,9 @@ #ifndef ARM_COMPUTE_NESPACETOBATCHLAYER_H #define ARM_COMPUTE_NESPACETOBATCHLAYER_H +#include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/core/Types.h" #include <memory> namespace arm_compute @@ -36,7 +36,7 @@ class ITensorInfo; class NESpaceToBatchLayerKernel; class NEFill; -/** Basic function to spatial divide a tensor. This function calls the following Neon kernels/functions: +/** Basic function to spatial divide a tensor. This function calls the following kernels/functions: * * -# @ref NEFill * -# @ref NESpaceToBatchLayerKernel @@ -58,6 +58,15 @@ public: ~NESpaceToBatchLayer(); /** Set the input and output tensors. * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:---------|:---------|:---------|:---------| + * |All |S32 |S32 |All | + * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. * @param[in] block_shape 1-D tensor with shape [M]. Supported M: 2. 
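For the softmax rework above, a minimal sketch, assuming the usual (input, output, beta, axis) configure overload and the NESoftmaxLayer alias for NESoftmaxLayerGeneric<false>, neither of which is shown in this hunk:

Tensor logits, probs;
logits.allocator()->init(TensorInfo(TensorShape(1000U), 1, DataType::F32));

NESoftmaxLayer softmax; // alias of NESoftmaxLayerGeneric<false>
softmax.configure(&logits, &probs, 1.f /* beta */, 0 /* axis */);

The removed MemoryGroup member suggests memory management now lives behind the Impl struct, as in the other ported functions.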
Data types supported: S32 * @param[in] paddings 2-D tensor with shape [2, M] (First dimension is the fastest-changing dimension). Supported M: 2. Data types supported: S32 @@ -73,7 +82,12 @@ public: * @param[in] padding_right The padding at the end of every dimension of the output tensor. * @param[out] output Tensor output. Data types supported: same as @p input */ - void configure(const ITensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ITensor *output); + void configure(const ITensor *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + ITensor *output); /** Static function to check if given info will lead to a valid configuration of @ref NESpaceToBatchLayer * * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All. @@ -83,7 +97,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output); + static Status validate(const ITensorInfo *input, + const ITensorInfo *block_shape, + const ITensorInfo *paddings, + const ITensorInfo *output); /** Static function to check if given info will lead to a valid configuration of @ref NESpaceToBatchLayer (Static block shape and paddings) * * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All. @@ -95,7 +112,12 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, const ITensorInfo *output); + static Status validate(const ITensorInfo *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + const ITensorInfo *output); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NESpaceToDepthLayer.h b/arm_compute/runtime/NEON/functions/NESpaceToDepthLayer.h index d76fc48204..1820cb8f6b 100644 --- a/arm_compute/runtime/NEON/functions/NESpaceToDepthLayer.h +++ b/arm_compute/runtime/NEON/functions/NESpaceToDepthLayer.h @@ -35,10 +35,7 @@ class ITensor; class ITensorInfo; class NESpaceToDepthLayerKernel; -/** This function calls the following Neon kernels/functions: - * - * -# @ref NESpaceToDepthLayerKernel - */ +/** Basic function to run @ref NESpaceToDepthLayerKernel. */ class NESpaceToDepthLayer : public IFunction { public: @@ -56,6 +53,15 @@ public: ~NESpaceToDepthLayer(); /** Set the input and output tensors. * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |All |All | + * * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. * @param[out] output Tensor output. Data types supported: same as @p input * @param[in] block_shape Block shape value diff --git a/arm_compute/runtime/NEON/functions/NESplit.h b/arm_compute/runtime/NEON/functions/NESplit.h index ede5ecf65a..36358a7094 100644 --- a/arm_compute/runtime/NEON/functions/NESplit.h +++ b/arm_compute/runtime/NEON/functions/NESplit.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -26,7 +26,6 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" - #include "arm_compute/runtime/CPP/functions/CPPSplit.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/NEON/functions/NESlice.h" @@ -40,6 +39,18 @@ namespace arm_compute class NESplit : public CPPSplit<NESlice> { public: + /** NESplit + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |All |All | + * + */ + // Inherited methods overridden: void run() override; }; diff --git a/arm_compute/runtime/NEON/functions/NEStackLayer.h b/arm_compute/runtime/NEON/functions/NEStackLayer.h index f6fa4f2eb3..98dacde0c1 100644 --- a/arm_compute/runtime/NEON/functions/NEStackLayer.h +++ b/arm_compute/runtime/NEON/functions/NEStackLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_NESTACKLAYER_H -#define ARM_COMPUTE_NESTACKLAYER_H +#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NESTACKLAYER_H +#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NESTACKLAYER_H #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" @@ -58,6 +58,14 @@ public: ~NEStackLayer(); /** Initialise the kernel's inputs vector and output. * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |All |All | + * * @note Supported input tensor rank: up to 4 * * @param[in] input The vectors containing all the tensors with the same shape to stack. Data types supported: All @@ -83,9 +91,8 @@ public: void run() override; private: - std::vector<ITensor *> _input; - std::vector<std::unique_ptr<NEStackLayerKernel>> _stack_kernels; - unsigned int _num_inputs; + std::unique_ptr<NEStackLayerKernel> _stack_kernel; + bool _is_prepared; }; } // namespace arm_compute -#endif /* ARM_COMPUTE_NESTACKLAYER_H */ +#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NESTACKLAYER_H diff --git a/arm_compute/runtime/NEON/functions/NEStridedSlice.h b/arm_compute/runtime/NEON/functions/NEStridedSlice.h index f9c94f5301..fa1113ffec 100644 --- a/arm_compute/runtime/NEON/functions/NEStridedSlice.h +++ b/arm_compute/runtime/NEON/functions/NEStridedSlice.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -32,18 +32,37 @@ namespace arm_compute // Forward Declarations class ITensor; -namespace experimental -{ /** Basic function to run @ref NEStridedSliceKernel */ -class NEStridedSlice : public INEOperator +class NEStridedSlice : public IFunction { public: + /** Default Constructor */ + NEStridedSlice(); + /** Default Destructor */ + ~NEStridedSlice(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEStridedSlice(const NEStridedSlice &) = delete; + /** Default move constructor */ + NEStridedSlice(NEStridedSlice &&); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEStridedSlice &operator=(const NEStridedSlice &) = delete; + /** Default move assignment operator */ + NEStridedSlice &operator=(NEStridedSlice &&); + /** Configure kernel * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:------|:------| + * |All |All | + * * @note Supported tensor rank: up to 4 * - * @param[in] input Source tensor info. Data type supported: All - * @param[out] output Destination tensor info. Data type supported: Same as @p input + * @param[in] input Source tensor. Data type supported: All + * @param[out] output Destination tensor. Data type supported: Same as @p input * @param[in] starts The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input). * @param[in] ends The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input). * @param[in] strides The strides of the dimensions of the input tensor to be sliced. The length must be of rank(input). @@ -52,9 +71,14 @@ public: * @param[in] shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1. * A slice of size 1 starting from starts[i] in the dimension must be preserved. */ - void configure(const ITensorInfo *input, ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0); + void configure(const ITensor *input, + ITensor *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask = 0, + int32_t end_mask = 0, + int32_t shrink_axis_mask = 0); /** Static function to check if given info will lead to a valid configuration of @ref NEStridedSlice * @@ -70,35 +94,35 @@ public: * @param[in] shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1. * A slice of size 1 starting from starts[i] in the dimension must be preserved. 
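A sketch of the reworked runtime NEStridedSlice above, taking every second element in x and y; coordinates and strides are illustrative, and BiStrides is the Coordinates alias:

Tensor src, dst;
src.allocator()->init(TensorInfo(TensorShape(10U, 10U), 1, DataType::F32));

NEStridedSlice strided_slice;
strided_slice.configure(&src, &dst, Coordinates(0, 0), Coordinates(10, 10),
                        BiStrides(2, 2)); // begin/end/shrink masks left at their 0 defaults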
*/ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask = 0, + int32_t end_mask = 0, + int32_t shrink_axis_mask = 0); + + // Inherited methods overridden: + void run() override; + +private: + struct Impl; + std::unique_ptr<Impl> _impl; }; -} // namespace experimental +namespace experimental +{ /** Basic function to run @ref NEStridedSliceKernel */ -class NEStridedSlice : public IFunction +class NEStridedSlice : public INEOperator { public: - /** Default Constructor */ - NEStridedSlice(); - /** Default Destructor */ - ~NEStridedSlice(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEStridedSlice(const NEStridedSlice &) = delete; - /** Default move constructor */ - NEStridedSlice(NEStridedSlice &&); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEStridedSlice &operator=(const NEStridedSlice &) = delete; - /** Default move assignment operator */ - NEStridedSlice &operator=(NEStridedSlice &&); - /** Configure kernel * * @note Supported tensor rank: up to 4 * - * @param[in] input Source tensor. Data type supported: All - * @param[out] output Destination tensor. Data type supported: Same as @p input + * @param[in] input Source tensor info. Data type supported: All + * @param[out] output Destination tensor info. Data type supported: Same as @p input * @param[in] starts The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input). * @param[in] ends The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input). * @param[in] strides The strides of the dimensions of the input tensor to be sliced. The length must be of rank(input). @@ -107,9 +131,14 @@ public: * @param[in] shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1. * A slice of size 1 starting from starts[i] in the dimension must be preserved. */ - void configure(const ITensor *input, ITensor *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0); + void configure(const ITensorInfo *input, + ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask = 0, + int32_t end_mask = 0, + int32_t shrink_axis_mask = 0); /** Static function to check if given info will lead to a valid configuration of @ref NEStridedSlice * @@ -125,16 +154,15 @@ public: * @param[in] shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1. * A slice of size 1 starting from starts[i] in the dimension must be preserved. 
*/ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0); - - // Inherited methods overridden: - void run() override; - -private: - struct Impl; - std::unique_ptr<Impl> _impl; + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask = 0, + int32_t end_mask = 0, + int32_t shrink_axis_mask = 0); }; +} // namespace experimental } // namespace arm_compute #endif /* ARM_COMPUTE_NE_STRIDED_SLICE_H */ diff --git a/arm_compute/runtime/NEON/functions/NETile.h b/arm_compute/runtime/NEON/functions/NETile.h index d5ce76c9cf..001a0a4128 100644 --- a/arm_compute/runtime/NEON/functions/NETile.h +++ b/arm_compute/runtime/NEON/functions/NETile.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_NETILE_H #define ARM_COMPUTE_NETILE_H -#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" namespace arm_compute { @@ -39,6 +38,14 @@ class NETile : public INESimpleFunctionNoBorder public: /** Set the source, destination of the kernel * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |All |All | + * * @param[in] input Source tensor. Data type supported: All. * @param[out] output Destination tensor. Same as @p input * @param[in] multiples Contains the number of times the input tensor should be replicated on the given dimension. diff --git a/arm_compute/runtime/NEON/functions/NETranspose.h b/arm_compute/runtime/NEON/functions/NETranspose.h index fac1d406fb..5d2d1f1b01 100644 --- a/arm_compute/runtime/NEON/functions/NETranspose.h +++ b/arm_compute/runtime/NEON/functions/NETranspose.h @@ -25,23 +25,42 @@ #define ARM_COMPUTE_NETRANSPOSE_H #include "arm_compute/core/Types.h" -#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" +#include "arm_compute/runtime/IFunction.h" + +#include <memory> namespace arm_compute { +// Forward declarations class ITensor; class ITensorInfo; -/** Basic function to transpose a matrix on Neon. This function calls the following Neon kernel: - * - * -# @ref NETransposeKernel - * - */ -class NETranspose : public INESimpleFunctionNoBorder +/** Basic function to run @ref cpu::kernels::CpuTransposeKernel */ +class NETranspose : public IFunction { public: + /** Default Constructor */ + NETranspose(); + /** Default Destructor */ + ~NETranspose(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NETranspose(const NETranspose &) = delete; + /** Default move constructor */ + NETranspose(NETranspose &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NETranspose &operator=(const NETranspose &) = delete; + /** Default move assignment operator */ + NETranspose &operator=(NETranspose &&) = default; /** Initialise the kernel's inputs and output * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:------|:------| + * |All |All | + * * @param[in] input Input tensor. Data types supported: All * @param[out] output Output tensor. 
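NETile above keeps its borderless simple-function shape; a sketch assuming the (input, output, multiples) overload its docs list, with illustrative values:

Tensor src, dst;
src.allocator()->init(TensorInfo(TensorShape(4U, 4U), 1, DataType::F32));

NETile tile;
tile.configure(&src, &dst, Multiples{ 2, 1 }); // repeat twice along x -> an 8x4 output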
Data type supported: Same as @p input */ @@ -54,7 +73,13 @@ public: * @return a status */ static Status validate(const ITensorInfo *input, const ITensorInfo *output); + + // Inherited methods overridden + void run() override; + +private: + struct Impl; + std::unique_ptr<Impl> _impl; }; } // namespace arm_compute - #endif /* ARM_COMPUTE_NETRANSPOSE_H */ diff --git a/arm_compute/runtime/NEON/functions/NEUnstack.h b/arm_compute/runtime/NEON/functions/NEUnstack.h index c8e85115f7..e1af96d08d 100644 --- a/arm_compute/runtime/NEON/functions/NEUnstack.h +++ b/arm_compute/runtime/NEON/functions/NEUnstack.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -26,7 +26,6 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" - #include "arm_compute/runtime/NEON/functions/NEStridedSlice.h" #include <memory> @@ -57,6 +56,14 @@ public: ~NEUnstack() = default; /** Set the input, output and unstacking axis. * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |All |All | + * * @param[in] input A tensor to be unstacked. Data type supported: All. * @param[in,out] output_vector A vector of tensors. Data types supported: same as @p input. * Note: The number of elements of the vector will be used as the number of slices to be taken from the axis. diff --git a/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h index e41cdbd0ac..6caa2aeb59 100644 --- a/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,17 +21,12 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_NEWINOGRADCONVOLUTIONLAYER_H -#define ARM_COMPUTE_NEWINOGRADCONVOLUTIONLAYER_H - -#include "arm_compute/runtime/IFunction.h" +#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEWINOGRADCONVOLUTIONLAYER_H +#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEWINOGRADCONVOLUTIONLAYER_H #include "arm_compute/core/Types.h" -#include "arm_compute/runtime/CPP/functions/CPPPermute.h" -#include "arm_compute/runtime/MemoryGroup.h" -#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" -#include "arm_compute/runtime/NEON/functions/NEGEMM.h" - +#include "arm_compute/function_info/ActivationLayerInfo.h" +#include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/Tensor.h" #include <memory> @@ -40,13 +35,12 @@ namespace arm_compute { // Forward declarations class ITensor; -class ICPPKernel; -/** Basic function to simulate a convolution layer. This function calls the following Neon kernels: - * -# @ref NEWinogradLayerTransformWeightsKernel (executed only once in the first call to the run() method ) - * -# @ref NEWinogradLayerTransformInputKernel - * -# @ref NEWinogradLayerTransformOutputKernel - * -# @ref NEGEMMAssemblyDispatch +/** Basic function to simulate a convolution layer. This function calls the following kernels: + * + * -# @ref cpu::CpuWinogradConv2dTransformInputKernel + * -# @ref cpu::CpuWinogradConv2dTransformOutputKernel + * -# @ref cpu::CpuGemmAssemblyDispatch * -# @ref CPPPermute (three times: weights, input and output) * * @note Some Winograd configurations (i.e. 
F(2x2, 5x5), F(4x4, 5x5)) are supported only with enable_fast_math = true @@ -56,20 +50,35 @@ class NEWinogradConvolutionLayer : public IFunction public: /** Constructor */ NEWinogradConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager = nullptr); - /** Prevent instances of this class from being moved (As this class contains non movable objects) */ - NEWinogradConvolutionLayer(NEWinogradConvolutionLayer &&) = delete; - /** Prevent instances of this class from being moved (As this class contains non movable objects) */ - NEWinogradConvolutionLayer &operator=(NEWinogradConvolutionLayer &&) = delete; - /** Default destructor */ - ~NEWinogradConvolutionLayer() = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEWinogradConvolutionLayer(const NEWinogradConvolutionLayer &) = delete; + /** Default move constructor */ + NEWinogradConvolutionLayer(NEWinogradConvolutionLayer &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEWinogradConvolutionLayer &operator=(const NEWinogradConvolutionLayer &) = delete; + /** Default move assignment operator */ + NEWinogradConvolutionLayer &operator=(NEWinogradConvolutionLayer &&) = default; + /** Destructor */ + ~NEWinogradConvolutionLayer(); /** Set the input and output tensors. * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:--------------|:------|:--------------| + * |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 | + * * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], * while every optional dimension from 4 and above represent a batch of inputs. * Data types supported: F16/F32. * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input. - * Currently only 3x3 and 5x5 kernels are supported. + * Supported kernel sizes: (height, width) -> 3x3, 1x3, 3x1, 5x5, 1x5, 5x1 for Fp32 + * -> 3x3 for Fp16 * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights. * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. * Data types supported: Same as @p input. @@ -78,62 +87,35 @@ public: * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation * available which may introduce a drop of accuracy as well. Default is false */ - void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo(), - bool enable_fast_math = false); + void configure(const ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false); // Inherited methods overridden: void run() override; void prepare() override; - /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConvolutionLayer + /** Static function to check if given info will lead to a valid configuration of @ref NEWinogradConvolutionLayer * - * @param[in] input Source tensor. 
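Rounding off the Winograd rework above, a configure sketch for a 3x3 F32 convolution with unit strides and a fused ReLU; all shapes are illustrative and fast math stays at its false default:

Tensor src, weights, biases, dst;
src.allocator()->init(TensorInfo(TensorShape(56U, 56U, 64U), 1, DataType::F32)); // [W, H, IFM]
weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 64U, 128U), 1, DataType::F32));
biases.allocator()->init(TensorInfo(TensorShape(128U), 1, DataType::F32));

NEWinogradConvolutionLayer winograd;
winograd.configure(&src, &weights, &biases, &dst,
                   PadStrideInfo(1, 1, 1, 1), // unit strides only
                   ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));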
3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. - * Data types supported: F16/F32. - * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input. - * Currently only 3x3 and 5x5 kernels are supported. - * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights. - * @param[in] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. - * Data types supported: Same as @p input. - * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. Currently only unit strides are supported. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. - * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation - * available which may introduce a drop of accuracy as well. Default is false + * Similar to @ref NEWinogradConvolutionLayer::configure() * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); - - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEWinogradConvolutionLayer(const NEWinogradConvolutionLayer &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEWinogradConvolutionLayer &operator=(const NEWinogradConvolutionLayer &) = delete; + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false); private: - MemoryGroup _memory_group; - NEGEMM _gemm_function; - std::unique_ptr<ICPPKernel> _transform_input_kernel; - std::unique_ptr<ICPPKernel> _transform_output_kernel; - std::unique_ptr<ICPPKernel> _transform_weights_kernel; - NEActivationLayer _activationlayer_function; - - CPPPermute _permute_input; - CPPPermute _permute_weights; - CPPPermute _permute_output; - Tensor _input_transformed; - Tensor _output_transformed; - Tensor _input_workspace; - Tensor _output_workspace; - Tensor _kernel_storage; - Tensor _input_nhwc; - Tensor _output_nhwc; - Tensor _weights_hwio; - const ITensor *_input; - const ITensor *_weights; - ITensor *_output; - bool _is_prepared; - bool _is_activationlayer_enabled; + struct Impl; + std::unique_ptr<Impl> _impl; }; } // namespace arm_compute -#endif /* ARM_COMPUTE_NEWINOGRADCONVOLUTIONLAYER_H */ +#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEWINOGRADCONVOLUTIONLAYER_H diff --git a/arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h b/arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h deleted file mode 100644 index 7f63717b02..0000000000 --- a/arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (c) 2019 Arm Limited. 
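The reworked header above keeps the public run()/prepare()/validate() contract while hiding the kernels and workspace tensors behind an Impl pointer. A minimal configuration sketch, assuming an F32 NCHW setup with the unit strides this layer requires; all shapes below are illustrative, not taken from the diff.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor src, weights, biases, dst;

    // Illustrative NCHW shapes: a 32x32 input with 16 channels, 3x3 kernels, 32 output feature maps.
    src.allocator()->init(TensorInfo(TensorShape(32U, 32U, 16U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 16U, 32U), 1, DataType::F32));
    biases.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(32U, 32U, 32U), 1, DataType::F32));

    // Unit strides (the only strides the layer supports) with 1-pixel padding.
    const PadStrideInfo conv_info(1, 1, 1, 1);

    // validate() takes the same information as configure(), as the header notes.
    const Status status =
        NEWinogradConvolutionLayer::validate(src.info(), weights.info(), biases.info(), dst.info(), conv_info);
    if(!bool(status))
    {
        return 1; // configuration not supported for this target
    }

    NEWinogradConvolutionLayer conv;
    conv.configure(&src, &weights, &biases, &dst, conv_info);

    src.allocator()->allocate();
    weights.allocator()->allocate();
    biases.allocator()->allocate();
    dst.allocator()->allocate();

    conv.run(); // prepare() is typically triggered by the first run()
    return 0;
}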
diff --git a/arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h b/arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h
deleted file mode 100644
index 7f63717b02..0000000000
--- a/arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEDEPTHWISECONVOLUTIONASSEMBLYDISPATCH_H
-#define ARM_COMPUTE_NEDEPTHWISECONVOLUTIONASSEMBLYDISPATCH_H
-
-#include "arm_compute/runtime/IFunction.h"
-
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/Tensor.h"
-
-namespace arm_compute
-{
-/** Depthwise convolution assembly kernel glue */
-class NEDepthwiseConvolutionAssemblyDispatch : public IFunction
-{
-public:
-    /** Default constructor
-     *
-     * @param[in,out] memory_manager Memory manager to use
-     */
-    NEDepthwiseConvolutionAssemblyDispatch(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEDepthwiseConvolutionAssemblyDispatch(const NEDepthwiseConvolutionAssemblyDispatch &) = delete;
-    /** Default move constructor */
-    NEDepthwiseConvolutionAssemblyDispatch(NEDepthwiseConvolutionAssemblyDispatch &&) = default;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEDepthwiseConvolutionAssemblyDispatch &operator=(const NEDepthwiseConvolutionAssemblyDispatch &) = delete;
-    /** Default move assignment operator */
-    NEDepthwiseConvolutionAssemblyDispatch &operator=(NEDepthwiseConvolutionAssemblyDispatch &&) = default;
-    /** Default destructor */
-    ~NEDepthwiseConvolutionAssemblyDispatch();
-    /** Initialize the function's source, destination, kernels and border_size.
-     *
-     * @note Supports only NHWC format
-     *
-     * @param[in]  input            Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling).
-     * @param[in]  weights          Weights tensor. These are 3D tensors with shape [W, H, IFM]. Data type supported: Same as @p input.
-     * @param[in]  bias             (Optional) Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
-     *                              Data type supported: Same as @p input.
-     * @param[out] output           Destination tensor. Data type supported: same as @p input.
-     * @param[in]  conv_info        Padding and stride information to use for the convolution.
-     * @param[in]  depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
-     * @param[in]  act_info         (Optional) Activation layer information in case of a fused activation.
-     * @param[in]  dilation         (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
-     */
-    void configure(const ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output,
-                   const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(),
-                   const Size2D &dilation = Size2D(1, 1));
-    /** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseConvolutionAssemblyDispatch
-     *
-     * @note Supports only NHWC format
-     *
-     * @param[in]  input            Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling).
-     * @param[in]  weights          Weights tensor. These are 3D tensors with shape [W, H, IFM]. Data type supported: Same as @p input.
-     * @param[in]  bias             (Optional) Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
-     *                              Data type supported: Same as @p input.
-     * @param[out] output           Destination tensor. Data type supported: same as @p input.
-     * @param[in]  conv_info        Padding and stride information to use for the convolution.
-     * @param[in]  depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
-     * @param[in]  act_info         (Optional) Activation layer information in case of a fused activation.
-     * @param[in]  dilation         (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
-     *
-     * @return An error status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output,
-                           const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(),
-                           const Size2D &dilation = Size2D(1, 1));
-    /** Check if the optimized kernel can be used for the given kernel sizes and strides
-     *
-     * @warning Even if this return true the inputs and outputs might need to get permuted as the only layout supported is NHWC
-     *
-     * @param[in] input            Input tensor info.
-     * @param[in] weights          Weights tensor info.
-     * @param[in] conv_info        Convolution layer metadata.
-     * @param[in] depth_multiplier (Optional) Depth multiplier to be used.
-     * @param[in] dilation         (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
-     *
-     * @return True if the assembly kernel could be used else false. Note that transformations of input/output could be needed.
-     */
-    static bool is_optimized_supported(const ITensorInfo *input, const ITensorInfo *weights, PadStrideInfo conv_info, unsigned int depth_multiplier = 1, const Size2D &dilation = Size2D(1, 1));
-
-    // Inherited methods overridden:
-    void run() override;
-    void prepare() override;
-
-private:
-    struct LocalImpl;
-
-private:
-    MemoryGroup    _memory_group;
-    const ITensor *_input;
-    const ITensor *_weights;
-    const ITensor *_bias;
-    ITensor       *_output;
-    Tensor         _packed_weights;
-    Tensor         _workspace;
-    bool           _is_prepared;
-    std::unique_ptr<LocalImpl> _pImpl;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEDEPTHWISECONVOLUTIONASSEMBLYDISPATCH_H */
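With this header deleted, NEDepthwiseConvolutionAssemblyDispatch is no longer part of the public API; the assembly path lives in the internal cpu:: backend and is selected automatically. User code goes through the public NEDepthwiseConvolutionLayer instead. A minimal sketch of that route, assuming its configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation) signature and the TensorInfo constructor that takes a DataLayout; the NHWC shapes are illustrative only.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor src, weights, biases, dst;

    // Illustrative NHWC shapes: 16 channels over a 32x32 image, 3x3 depthwise kernels.
    // In NHWC the channel dimension comes first in TensorShape.
    src.allocator()->init(TensorInfo(TensorShape(16U, 32U, 32U), 1, DataType::F32, DataLayout::NHWC));
    weights.allocator()->init(TensorInfo(TensorShape(16U, 3U, 3U), 1, DataType::F32, DataLayout::NHWC));
    biases.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32, DataLayout::NHWC));
    dst.allocator()->init(TensorInfo(TensorShape(16U, 32U, 32U), 1, DataType::F32, DataLayout::NHWC));

    const PadStrideInfo conv_info(1, 1, 1, 1); // unit strides, 1-pixel padding

    NEDepthwiseConvolutionLayer depthwise;
    depthwise.configure(&src, &weights, &biases, &dst, conv_info);

    src.allocator()->allocate();
    weights.allocator()->allocate();
    biases.allocator()->allocate();
    dst.allocator()->allocate();

    depthwise.run(); // dispatches to the optimized assembly kernels when supported
    return 0;
}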