diff options
Diffstat (limited to 'arm_compute/runtime/NEON/functions')
71 files changed, 1778 insertions, 1339 deletions
diff --git a/arm_compute/runtime/NEON/functions/NEActivationLayer.h b/arm_compute/runtime/NEON/functions/NEActivationLayer.h index b39a8d7701..5584fdc783 100644 --- a/arm_compute/runtime/NEON/functions/NEActivationLayer.h +++ b/arm_compute/runtime/NEON/functions/NEActivationLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,9 +24,9 @@ #ifndef ARM_COMPUTE_NEACTIVATIONLAYER_H #define ARM_COMPUTE_NEACTIVATIONLAYER_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/function_info/ActivationLayerInfo.h" +#include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IRuntimeContext.h" #include <memory> @@ -101,5 +101,5 @@ private: struct Impl; std::unique_ptr<Impl> _impl; }; -} // namespace arm_computes +} // namespace arm_compute #endif /* ARM_COMPUTE_NEACTIVATIONLAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NEAddMulAdd.h b/arm_compute/runtime/NEON/functions/NEAddMulAdd.h new file mode 100644 index 0000000000..6c65c055dd --- /dev/null +++ b/arm_compute/runtime/NEON/functions/NEAddMulAdd.h @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEADDMULADD +#define ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEADDMULADD + +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/IMemoryManager.h" + +#include <memory> + +namespace arm_compute +{ +class ITensor; +class ITensorInfo; +class ActivationLayerInfo; + +/** Function to compute Add+Mul+Add fused operation */ +class NEAddMulAdd : public IFunction +{ +public: + /** Constructor */ + NEAddMulAdd(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEAddMulAdd(const NEAddMulAdd &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEAddMulAdd(NEAddMulAdd &&) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEAddMulAdd &operator=(const NEAddMulAdd &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEAddMulAdd &operator=(NEAddMulAdd &&) = delete; + /** Destructor */ + ~NEAddMulAdd(); + /** Initialize the function's inputs and outputs. 
+ * + * Valid data layouts: + * - Any + * + * Valid data type configurations: + * |input1 |input2 |bn_mul |bn_add |add_output |final_output | + * |:--------------|:--------------|:--------------|:--------------|:--------------|:--------------| + * |QASYMM8 |QASYMM8 |QASYMM8 |QASYMM8 |QASYMM8 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED | + * |F16 |F16 |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 |F32 |F32 | + * + * This is what this composite function (tailored for add followed by a batch norm operation) does: + * add_output <- input1 + input2 (add) + * final_output <- add_output * bn_mul + bn_add (batch norm = mul+add) + * + * @param[in] input1 First tensor input. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] input2 Second tensor input. Data types supported: Same as @p input1. + * @param[in] bn_mul The multiplication coefficient on the feature dimension. Data types supported: Same as @p input1. + * It's one dimensional tensor with size equal to the feature maps [FM] + * @param[in] bn_add The addition coefficient on the feature dimension. Data types supported: Same as @p input1. + * It's one dimensional tensor with size equal to the feature maps [FM] + * @param[out] add_output Output of the first add. Data type supported: Same as @p input1. + * @param[out] final_output Output of the add+mul+add+act composite operation. Data type supported: Same as @p input1. + * @param[in] policy Policy to handle overflow + * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ * + */ + void configure(ITensor *input1, + ITensor *input2, + ITensor *bn_mul, + ITensor *bn_add, + ITensor *add_output, + ITensor *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info); + /** Static function to check if given info will lead to a valid configuration of @ref NEAddMulAdd + * + * Similar to @ref NEAddMulAdd::configure() except the arguments are @ref ITensorInfo * instead of @ref ITensor * + * + * @return a status + */ + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *bn_mul, + const ITensorInfo *bn_add, + const ITensorInfo *add_output, + const ITensorInfo *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info); + + // Inherited methods overridden: + void run() override; + +private: + struct Impl; + std::unique_ptr<Impl> _impl; +}; +} // namespace arm_compute +#endif /* ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEADDMULADD */ diff --git a/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h b/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h index 4392de7b28..3bb50a0f90 100644 --- a/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h +++ b/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,8 +24,6 @@ #ifndef ARM_COMPUTE_NEARGMINMAXLAYER_H #define ARM_COMPUTE_NEARGMINMAXLAYER_H -#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h" - #include "arm_compute/core/Types.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/NEON/INESimpleFunction.h" @@ -33,7 +31,6 @@ namespace arm_compute { class ITensor; - /** Function to calculate the index of the minimum or maximum values in a * tensor based on an axis. 
* @@ -68,13 +65,13 @@ public: * - All * * Valid data type configurations: - * |src |dst | - * |:--------------|:----------| - * |QASYMM8 |U32, S32 | - * |QASYMM8_SIGNED |U32, S32 | - * |S32 |U32, S32 | - * |F16 |U32, S32 | - * |F32 |U32, S32 | + * |src |dst | + * |:--------------|:-------------| + * |QASYMM8 |U32, S32 | + * |QASYMM8_SIGNED |U32, S32 | + * |S32 |U32, S32, S64 | + * |F16 |U32, S32 | + * |F32 |U32, S32 | * * @param[in] input Input source tensor. Data types supported: QASYMM8_SIGNED/QASYMM8/S32/F16/F32. * @param[in] axis Axis to find max/min index. @@ -86,7 +83,7 @@ public: * * @param[in] input Input source tensor info. Data types supported: QASYMM8_SIGNED/QASYMM8/S32/F16/F32. * @param[in] axis Axis to find max/min index. - * @param[in] output Output source tensor info. Data types supported: U32/S32. + * @param[in] output Output source tensor info. Data types supported: U32/S32/S64. * @param[in] op Operation to perform: min or max * * @return a status @@ -97,7 +94,8 @@ public: void run() override; private: - std::unique_ptr<NEReductionOperation> _reduction_function; + struct Impl; + std::unique_ptr<Impl> _impl; }; } // namespace arm_compute #endif /* ARM_COMPUTE_NEARGMINMAXLAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h b/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h index b8e46ff36e..73a43dbc44 100644 --- a/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h +++ b/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2021 Arm Limited. + * Copyright (c) 2016-2021, 2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -25,7 +25,9 @@ #define ARM_COMPUTE_NEARITHMETICADDITION_H #include "arm_compute/core/Types.h" +#include "arm_compute/function_info/ActivationLayerInfo.h" #include "arm_compute/runtime/IFunction.h" + #include <memory> namespace arm_compute @@ -62,9 +64,6 @@ public: * |QSYMM16 |QSYMM16 |QASYMM16 | * |QSYMM16 |QSYMM16 |S32 | * |U8 |U8 |U8 | - * |U8 |U8 |S16 | - * |U8 |S16 |S16 | - * |S16 |U8 |S16 | * |S16 |S16 |S16 | * |S32 |S32 |S32 | * |F16 |F16 |F16 | @@ -76,7 +75,11 @@ public: * @param[in] policy Policy to use to handle overflow. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. */ - void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const ITensor *input1, + const ITensor *input2, + ITensor *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticAddition * * @param[in] input1 First tensor input info. 
Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32 @@ -87,7 +90,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h b/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h index 0c72e946f6..3e4f6356c5 100644 --- a/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h +++ b/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2021 Arm Limited. + * Copyright (c) 2016-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,6 +25,7 @@ #define ARM_COMPUTE_NEARITHMETICSUBTRACTION_H #include "arm_compute/core/Types.h" +#include "arm_compute/function_info/ActivationLayerInfo.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/NEON/INEOperator.h" @@ -68,9 +69,6 @@ public: * |QSYMM16 |QSYMM16 |QASYMM16 | * |QSYMM16 |QSYMM16 |S32 | * |U8 |U8 |U8 | - * |U8 |U8 |S16 | - * |U8 |S16 |S16 | - * |S16 |U8 |S16 | * |S16 |S16 |S16 | * |S32 |S32 |S32 | * |F16 |F16 |F16 | @@ -82,7 +80,11 @@ public: * @param[in] policy Policy to use to handle overflow. Convert policy cannot be WRAP if datatype is quantized. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. 
*/ - void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const ITensor *input1, + const ITensor *input2, + ITensor *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticSubtraction * * @param[in] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/F16/F32 @@ -93,7 +95,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h b/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h index ec00fbdbf2..99e2dcadbb 100644 --- a/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h +++ b/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h @@ -81,7 +81,13 @@ public: * @param[in] epsilon (Optional) Small value to avoid division with zero. Default value is 0.001f. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. 
*/ - void configure(ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta = nullptr, const ITensor *gamma = nullptr, float epsilon = 0.001f, + void configure(ITensor *input, + ITensor *output, + const ITensor *mean, + const ITensor *var, + const ITensor *beta = nullptr, + const ITensor *gamma = nullptr, + float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEBatchNormalizationLayer * @@ -98,10 +104,14 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - const ITensorInfo *mean, const ITensorInfo *var, - const ITensorInfo *beta = nullptr, const ITensorInfo *gamma = nullptr, - float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *mean, + const ITensorInfo *var, + const ITensorInfo *beta = nullptr, + const ITensorInfo *gamma = nullptr, + float epsilon = 0.001f, + ActivationLayerInfo act_info = ActivationLayerInfo()); // Inherited methods overridden: void run() override; @@ -109,5 +119,5 @@ public: private: std::unique_ptr<NEBatchNormalizationLayerKernel> _norm_kernel; /**< Batch normalization layer kernel */ }; -} +} // namespace arm_compute #endif /* ARM_COMPUTE_NEBATCHNORMALIZATIONLAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NEBatchToSpaceLayer.h b/arm_compute/runtime/NEON/functions/NEBatchToSpaceLayer.h index 810bf81a22..ebed0bea29 100644 --- a/arm_compute/runtime/NEON/functions/NEBatchToSpaceLayer.h +++ b/arm_compute/runtime/NEON/functions/NEBatchToSpaceLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_NEBATCHTOSPACELAYER_H #define ARM_COMPUTE_NEBATCHTOSPACELAYER_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" namespace arm_compute @@ -64,7 +63,10 @@ public: * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All. * @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32 * @param[out] output Tensor output. Data types supported: same as @p input + * + * @deprecated This method for dynamic block shape is not fully mature and will be removed in 23.08 release */ + ARM_COMPUTE_DEPRECATED_REL(23.05) void configure(const ITensor *input, const ITensor *block_shape, ITensor *output); /** Set the input and output tensors. (Static block shape). * @@ -72,8 +74,13 @@ public: * @param[in] block_shape_x Block shape x value. * @param[in] block_shape_y Block shape y value. * @param[out] output Tensor output. Data types supported: same as @p input + * @param[in] crop_info Specifies how the output shape is cropped after batch to space is performed */ - void configure(const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output); + void configure(const ITensor *input, + int32_t block_shape_x, + int32_t block_shape_y, + ITensor *output, + const CropInfo &crop_info = CropInfo{}); /** Static function to check if given info will lead to a valid configuration of @ref CLBatchToSpaceLayer * * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All. @@ -81,7 +88,9 @@ public: * @param[out] output Tensor output info. 
Data types supported: same as @p input * * @return a status + * @deprecated This method for dynamic block shape is not fully mature and will be removed in 23.08 release */ + ARM_COMPUTE_DEPRECATED_REL(23.05) static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output); /** Static function to check if given info will lead to a valid configuration of @ref CLBatchToSpaceLayer (Static block shape). * @@ -89,10 +98,15 @@ public: * @param[in] block_shape_x Block shape x value. * @param[in] block_shape_y Block shape y value. * @param[out] output Tensor output info. Data types supported: same as @p input + * @param[in] crop_info Specifies how the output shape is cropped after batch to space is performed * * @return a status */ - static Status validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output); + static Status validate(const ITensorInfo *input, + int32_t block_shape_x, + int32_t block_shape_y, + const ITensorInfo *output, + const CropInfo &crop_info = CropInfo{}); }; } // namespace arm_compute #endif /* ARM_COMPUTE_NEBATCHTOSPACELAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h b/arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h index 2a196a2de5..aa41fc0df2 100644 --- a/arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h +++ b/arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h @@ -57,7 +57,8 @@ public: * * @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct. 
*/ - void configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info); + void + configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info); /** Static function to check if given info will lead to a valid configuration of @ref NEBoundingBoxTransform * @@ -71,7 +72,10 @@ public: * * @return a Status */ - static Status validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info); + static Status validate(const ITensorInfo *boxes, + const ITensorInfo *pred_boxes, + const ITensorInfo *deltas, + const BoundingBoxTransformInfo &info); }; } // namespace arm_compute #endif /* ARM_COMPUTE_NEBOUNDINGBOXTRANSFORM_H */ diff --git a/arm_compute/runtime/NEON/functions/NECast.h b/arm_compute/runtime/NEON/functions/NECast.h index 30499f5ecf..43cae777f6 100644 --- a/arm_compute/runtime/NEON/functions/NECast.h +++ b/arm_compute/runtime/NEON/functions/NECast.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_NECAST_H #define ARM_COMPUTE_NECAST_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" #include <memory> @@ -85,7 +84,7 @@ public: * * @return a status */ - static Status validate(ITensorInfo *input, ITensorInfo *output, ConvertPolicy policy); + static Status validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy); // Inherited methods overridden void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h b/arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h index 8888efec4f..bc19e1a4af 100644 --- a/arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h +++ b/arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h @@ -46,6 +46,7 @@ public: * * Valid data layouts: * - NCHW + * - NHWC * * Valid data type configurations: * |src |dst | diff --git a/arm_compute/runtime/NEON/functions/NEConcatenateLayer.h b/arm_compute/runtime/NEON/functions/NEConcatenateLayer.h index dd1c709d76..1600f85488 100644 --- a/arm_compute/runtime/NEON/functions/NEConcatenateLayer.h +++ b/arm_compute/runtime/NEON/functions/NEConcatenateLayer.h @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_NECONCATENATELAYER_H #define ARM_COMPUTE_NECONCATENATELAYER_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" #include <memory> @@ -87,7 +86,8 @@ public: * * @return a status */ - static Status validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis); + static Status + validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEConv3D.h b/arm_compute/runtime/NEON/functions/NEConv3D.h new file mode 100644 index 
0000000000..525f37f3e7 --- /dev/null +++ b/arm_compute/runtime/NEON/functions/NEConv3D.h @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_NECONV3D_H +#define ARM_COMPUTE_NECONV3D_H + +#include "arm_compute/core/ITensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/FunctionDescriptors.h" +#include "arm_compute/runtime/IFunction.h" + +#include <memory> + +namespace arm_compute +{ +// Forward declarations +class ITensor; + +/** Basic function to simulate a 3d convolution. 
This function calls one of the following functions: + * -# @ref cpu::CpuDirectConv3d + * + */ +class NEConv3D : public IFunction +{ +public: + /** Constructor */ + NEConv3D(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEConv3D(const NEConv3D &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEConv3D &operator=(const NEConv3D &) = delete; + /** Default move constructor */ + NEConv3D(NEConv3D &&) = default; + /** Default move assignment operator */ + NEConv3D &operator=(NEConv3D &&) = default; + /** Default destructor */ + ~NEConv3D(); + /** Set the input and output tensors. + * + * Valid data layouts: + * - NDHWC + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:------------------|:------|:--------------| + * |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 | + * |QASYMM8 |QASYMM8 |S32 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | + * + * @param[in] input Source tensor. 4 lower dimensions represent a single input [IFM, width, height, depth], + * while every optional dimension from 5 and above represent a batch of inputs. + * @param[in] weights Weights tensor. Weights are 5D tensor with dimensions [OFM, IFM, kernel_x, kernel_y, kernel_z]. + * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. + * @param[out] output Destination tensor. 4 lower dimensions represent a single output [OFM, width, height, depth], while the rest represent batch of outputs. + * @param[in] conv_info Contains padding, stride, activation information described in @ref Conv3dInfo.
+ */ + void configure( + ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv3dInfo &conv_info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to NEConv3D::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const Conv3dInfo &conv_info); + + // Inherited methods overridden: + void run() override; + +private: + struct Impl; + std::unique_ptr<Impl> _impl; +}; +} // namespace arm_compute +#endif /* ARM_COMPUTE_NECONV3D_H */ diff --git a/arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h b/arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h index 218877d421..dc6b22d717 100644 --- a/arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h +++ b/arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h @@ -24,14 +24,14 @@ #ifndef ARM_COMPUTE_NECONVERTFULLYCONNECTEDWEIGHTS_H #define ARM_COMPUTE_NECONVERTFULLYCONNECTEDWEIGHTS_H +#include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/runtime/ITransformWeights.h" -#include "arm_compute/runtime/Tensor.h" namespace arm_compute { // Forward declarations class ITensor; +class ITensorInfo; /** Basic function to run @ref cpu::kernels::CpuConvertFullyConnectedWeightsKernel. */ class NEConvertFullyConnectedWeights : public IFunction @@ -65,7 +65,8 @@ public: * @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer). * @param[in] data_layout The data layout the weights have been trained in. 
*/ - void configure(const ITensor *input, ITensor *output, const TensorShape &original_input_shape, DataLayout data_layout); + void + configure(const ITensor *input, ITensor *output, const TensorShape &original_input_shape, DataLayout data_layout); /** Static function to check if given info will lead to a valid configuration of @ref NEConvertFullyConnectedWeights * * @param[in] input Source weights tensor info to convert. Must be 2 dimensional. Data types supported: All. @@ -75,7 +76,10 @@ public: * * @return A Status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape, DataLayout data_layout); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const TensorShape &original_input_shape, + DataLayout data_layout); // Inherited methods overriden: void run() override; @@ -84,45 +88,5 @@ private: struct Impl; std::unique_ptr<Impl> _impl; }; - -namespace weights_transformations -{ -/** Basic function to manage @ref NEConvertFullyConnectedWeights. 
*/ -class NEConvertFullyConnectedWeightsManaged : public ITransformWeights -{ -public: - void run() override - { - _output.allocator()->allocate(); - _func.run(); - _reshape_run = true; - } - - void release() override - { - _output.allocator()->free(); - } - - ITensor *get_weights() override - { - return &_output; - } - - uint32_t uid() override - { - return _uid; - } - - void configure(const ITensor *input, const TensorShape &original_input_shape, DataLayout data_layout) - { - _func.configure(input, &_output, original_input_shape, data_layout); - } - -private: - static constexpr uint32_t _uid = 0x4; - Tensor _output{}; - NEConvertFullyConnectedWeights _func{}; -}; -} // namespace weights_transformations } // namespace arm_compute #endif /* ARM_COMPUTE_NECONVERTFULLYCONNECTEDWEIGHTS_H */ diff --git a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h index f19aa8008b..2d07980ade 100644 --- a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,13 +21,13 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_NECONVOLUTIONLAYER_H -#define ARM_COMPUTE_NECONVOLUTIONLAYER_H - -#include "arm_compute/runtime/IFunction.h" +#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NECONVOLUTIONLAYER_H +#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NECONVOLUTIONLAYER_H #include "arm_compute/core/ITensorInfo.h" #include "arm_compute/core/Types.h" +#include "arm_compute/function_info/ActivationLayerInfo.h" +#include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/MemoryGroup.h" #include <memory> @@ -38,9 +38,9 @@ namespace arm_compute class ITensor; /** Basic function to simulate a convolution layer. 
This function calls one of the following functions: - * -# @ref NEGEMMConvolutionLayer (executed only in case GEMM is required for the operation) - * -# @ref NEWinogradConvolutionLayer (executed only in case Winograd is required for the operation) - * -# @ref NEDirectConvolutionLayer (executed only in case Direct Convolution is required for the operation) + * -# @ref cpu::CpuGemmConv2d (executed only in case GEMM is required for the operation) + * -# @ref cpu::CpuWinogradConv2d (executed only in case Winograd is required for the operation) + * -# @ref cpu::CpuDirectConv2d (executed only in case Direct Convolution is required for the operation) * -# @ref NEFFTConvolutionLayer (executed only in case FFT is required for the operation) * * @@ -78,12 +78,12 @@ public: NEConvolutionLayer(const NEConvolutionLayer &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ NEConvolutionLayer &operator=(const NEConvolutionLayer &) = delete; + /** Default move constructor */ + NEConvolutionLayer(NEConvolutionLayer &&) = default; /** Prevent instances of this class from being moved (As this class contains non movable objects) */ - NEConvolutionLayer(NEConvolutionLayer &&) = delete; - /** Prevent instances of this class from being moved (As this class contains non movable objects) */ - NEConvolutionLayer &operator=(NEConvolutionLayer &&) = delete; + NEConvolutionLayer &operator=(NEConvolutionLayer &&) = default; /** Default destructor */ - ~NEConvolutionLayer() = default; + ~NEConvolutionLayer(); /** Set the input and output tensors. * * Valid data layouts: @@ -111,15 +111,23 @@ public: * Data types supported: Same as @p input. * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. * @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. 
If this is not part of the fully connected layer the weights - * tensor has also been transposed with NEGEMMTranspose1xWKernel. Data type supported: Same as @p input. + * tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p input. * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation * available which may introduce a drop of accuracy as well. Default is false * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported */ - void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(), - const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, unsigned int num_groups = 1); + void configure(ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false, + unsigned int num_groups = 1); /** Static function to check if given info will lead to a valid configuration of @ref NEConvolutionLayer * * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], @@ -133,7 +141,7 @@ public: * Data types supported: Same as @p input. * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. 
* @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights - * tensor has also been transposed with NEGEMMTranspose1xWKernel. Data type supported: Same as @p input. + * tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p input. * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). * @param[in] act_info (Optional) Activation layer information in case of a fused activation. * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation @@ -142,9 +150,16 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, - unsigned int num_groups = 1); + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false, + unsigned int num_groups = 1); /** Static function to check if given info will return the convolution called by @ref NEConvolutionLayer * * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], @@ -156,7 +171,7 @@ public: * Data types supported: Same as @p input. * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. 
* @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights - * tensor has also been transposed with NEGEMMTranspose1xWKernel. Data type supported: Same as @p input. + * tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p input. * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). * @param[in] act_info (Optional) Activation layer information in case of a fused activation. * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation @@ -164,15 +179,21 @@ public: * * @return the Convolution Method Hint */ - static ConvolutionMethod get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); + static ConvolutionMethod get_convolution_method(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false); // Inherited methods overridden: void run() override; void prepare() override; private: - std::shared_ptr<IMemoryManager> _memory_manager; - std::unique_ptr<IFunction> _function; /**< Function to run */ + struct Impl; + std::unique_ptr<Impl> _impl; }; } // namespace arm_compute -#endif /* ARM_COMPUTE_NECONVOLUTIONLAYER_H */ +#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NECONVOLUTIONLAYER_H diff --git a/arm_compute/runtime/NEON/functions/NECopy.h 
b/arm_compute/runtime/NEON/functions/NECopy.h index ee02c259f4..840c03e968 100644 --- a/arm_compute/runtime/NEON/functions/NECopy.h +++ b/arm_compute/runtime/NEON/functions/NECopy.h @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_NECOPY_H #define ARM_COMPUTE_NECOPY_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" #include <memory> diff --git a/arm_compute/runtime/NEON/functions/NECropResize.h b/arm_compute/runtime/NEON/functions/NECropResize.h index 143bbbc6f1..f806762158 100644 --- a/arm_compute/runtime/NEON/functions/NECropResize.h +++ b/arm_compute/runtime/NEON/functions/NECropResize.h @@ -75,8 +75,13 @@ public: * @param[in] method The policy to be used when resizing image. Default is bilinear. * @param[in] extrapolation_value Value to be used for values outside of the image for cropping and resizing. Default is 0. */ - void configure(const ITensor *input, const ITensor *boxes, const ITensor *box_ind, ITensor *output, Coordinates2D crop_size, - InterpolationPolicy method = InterpolationPolicy::BILINEAR, float extrapolation_value = 0); + void configure(const ITensor *input, + const ITensor *boxes, + const ITensor *box_ind, + ITensor *output, + Coordinates2D crop_size, + InterpolationPolicy method = InterpolationPolicy::BILINEAR, + float extrapolation_value = 0); /** Static function to check if given info will lead to a valid configuration of @ref NESlice * @@ -96,8 +101,13 @@ public: * * @return A status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *boxes, const ITensorInfo *box_ind, const ITensorInfo *output, - Coordinates2D crop_size, InterpolationPolicy method, float extrapolation_value); + static Status validate(const ITensorInfo *input, + const ITensorInfo *boxes, + const ITensorInfo *box_ind, + const ITensorInfo *output, + Coordinates2D crop_size, + InterpolationPolicy method, + float extrapolation_value); void run() override; diff --git 
a/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h index 34ab0707c2..aabe42f928 100644 --- a/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,15 +24,14 @@ #ifndef ARM_COMPUTE_NEDECONVOLUTIONLAYER_H #define ARM_COMPUTE_NEDECONVOLUTIONLAYER_H -#include "arm_compute/runtime/CPP/functions/CPPUpsample.h" -#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h" -#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h" -#include "arm_compute/runtime/NEON/functions/NEReverse.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CPP/functions/CPPUpsample.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h" +#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h" +#include "arm_compute/runtime/NEON/functions/NEReverse.h" #include "arm_compute/runtime/Tensor.h" #include <memory> @@ -76,17 +75,16 @@ class NEDeconvolutionLayer : public IFunction public: /** Constructor */ NEDeconvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr); - /** Prevent instances of this class from being copied (As this class contains pointers) */ NEDeconvolutionLayer(const NEDeconvolutionLayer &) = delete; + /** Default move constructor */ + NEDeconvolutionLayer(NEDeconvolutionLayer &&) = default; /** Prevent instances of this class from being copied (As this class contains pointers) */ NEDeconvolutionLayer &operator=(const NEDeconvolutionLayer &) = delete; - /** Prevent instances of this class from being moved (As this class contains pointers) */ - 
NEDeconvolutionLayer(NEDeconvolutionLayer &&) = delete; - /** Prevent instances of this class from being moved (As this class contains pointers) */ - NEDeconvolutionLayer &operator=(NEDeconvolutionLayer &&) = delete; + /** Default move assignment operator */ + NEDeconvolutionLayer &operator=(NEDeconvolutionLayer &&) = default; /** Default destructor */ - virtual ~NEDeconvolutionLayer() = default; + ~NEDeconvolutionLayer() = default; /** Set the input, weights, biases and output tensors. * @@ -104,25 +102,50 @@ public: * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED | * - * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED. - * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type supported: Same as @p input, also could be QSYMM8_PER_CHANNEL if input is QASYMM8/QASYMM8_SIGNED. - * @param[in] bias Optional, ignored if NULL. The biases have one dimension. Data type supported: Data types supported: S32 for QASYMM8/QASYMM8_SIGNED input, F32 for F32 input, F16 for F16 input. - * @param[out] output Output tensor. The output has the same number of dimensions as the @p input. - * @param[in] info Contains padding and policies to be used in the deconvolution, this is decribed in @ref PadStrideInfo. + * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs. + * Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED. + * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. + * Data type supported: Same as @p input, also could be QSYMM8_PER_CHANNEL if input is QASYMM8/QASYMM8_SIGNED. + * @param[in] bias Optional, ignored if NULL. The biases have one dimension. 
+ * Data type supported: Data types supported: S32 for QASYMM8/QASYMM8_SIGNED input, F32 for F32 input, F16 for F16 input. + * @param[out] output Output tensor. The output has the same number of dimensions as the @p input. + * @param[in] info Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo. + * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation + * available which may introduce a drop of accuracy as well. Default is false + * @param[in] weights_info (Optional) Specifies the weight format. Default is unspecified. This parameter can be used to specify the weight format that is optimal for + * the GEMM convolution. * */ - void configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &info); + void configure(ITensor *input, + const ITensor *weights, + const ITensor *bias, + ITensor *output, + const PadStrideInfo &info, + bool enable_fast_math = false, + const WeightsInfo &weights_info = WeightsInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEDeconvolutionLayer * - * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED. - * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data type supported: Same as @p input, also could be QSYMM8_PER_CHANNEL if input is QASYMM8/QASYMM8_SIGNED. - * @param[in] bias (Optional) The biases have one dimension. Data type supported: Data types supported: S32 for QASYMM8/QASYMM8_SIGNED input, F32 for F32 input, F16 for F16 input. - * @param[in] output Output tensor info. The output has the same number of dimensions as the @p input. 
- * @param[in] info Contains padding and policies to be used in the deconvolution, this is decribed in @ref PadStrideInfo. + * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs. + * Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED. + * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. + * Data type supported: Same as @p input, also could be QSYMM8_PER_CHANNEL if input is QASYMM8/QASYMM8_SIGNED. + * @param[in] bias (Optional) The biases have one dimension. Data type supported: Data types supported: S32 for QASYMM8/QASYMM8_SIGNED input, F32 for F32 input, F16 for F16 input. + * @param[in] output Output tensor info. The output has the same number of dimensions as the @p input. + * @param[in] info Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo. + * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation + * available which may introduce a drop of accuracy as well. Default is false + * @param[in] weights_info (Optional) Specifies the weight format. Default is unspecified. This parameter can be used to specify the weight format that is optimal for + * the GEMM convolution. 
* * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *output, + const PadStrideInfo &info, + bool enable_fast_math = false, + const WeightsInfo &weights_info = WeightsInfo()); // Inherited methods overridden: void run() override; @@ -140,6 +163,7 @@ private: ITensor *_input; PadStrideInfo _info; bool _is_prepared; + bool _do_upsampling; }; -} // arm_compute +} // namespace arm_compute #endif /* ARM_COMPUTE_NEDECONVOLUTIONLAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h b/arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h index eb0724ae12..7bfdfbd13d 100644 --- a/arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h +++ b/arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_NEDEPTHCONVERT_H #define ARM_COMPUTE_NEDEPTHCONVERT_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" #include <memory> @@ -84,7 +83,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift = 0); + static Status + validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift = 0); // Inherited methods overridden void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h b/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h index b9bdcd1f11..d27369670e 100644 --- a/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h +++ b/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -21,26 +21,27 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_NEDEPTHTOSPACELAYER_H -#define ARM_COMPUTE_NEDEPTHTOSPACELAYER_H +#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEDEPTHTOSPACELAYER_H +#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEDEPTHTOSPACELAYER_H +#include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" +#include <memory> namespace arm_compute { // Forward declarations class ITensor; class ITensorInfo; +class NEDepthToSpaceLayerKernel; /** Basic function to run @ref NEDepthToSpaceLayerKernel. */ -class NEDepthToSpaceLayer : public INESimpleFunctionNoBorder +class NEDepthToSpaceLayer : public IFunction { public: /** Constructor */ - NEDepthToSpaceLayer() = default; + NEDepthToSpaceLayer(); /** Prevent instances of this class from being copied (As this class contains pointers) */ NEDepthToSpaceLayer(const NEDepthToSpaceLayer &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ @@ -50,7 +51,7 @@ public: /** Prevent instances of this class from being moved (As this class contains non movable objects) */ NEDepthToSpaceLayer &operator=(NEDepthToSpaceLayer &&) = delete; /** Default destructor */ - ~NEDepthToSpaceLayer() = default; + ~NEDepthToSpaceLayer(); /** Set the input and output tensors. 
* * Valid data layouts: @@ -76,6 +77,11 @@ public: * @return a status */ static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape); + + void run() override; + +private: + std::unique_ptr<NEDepthToSpaceLayerKernel> _kernel; }; } // namespace arm_compute -#endif /* ARM_COMPUTE_NEDEPTHTOSPACELAYER_H */ +#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEDEPTHTOSPACELAYER_H diff --git a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h index 2f541758f4..6ad5aa7bfa 100644 --- a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h @@ -28,6 +28,7 @@ #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" #include "arm_compute/runtime/NEON/functions/NEPermute.h" + #include <memory> namespace arm_compute @@ -80,8 +81,14 @@ public: * @param[in] act_info (Optional) Activation layer information in case of a fused activation. * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). 
*/ - void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U)); + void configure(ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier = 1, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + const Size2D &dilation = Size2D(1U, 1U)); /** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseConvolutionLayer * @@ -98,8 +105,14 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U)); + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier = 1, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + const Size2D &dilation = Size2D(1U, 1U)); // Inherited methods overriden: void run() override; @@ -112,7 +125,7 @@ private: * * -# @ref NEFillBorderKernel (if pad_x or pad_y > 0) and no assembly kernel implementation is present * -# @ref NEDepthwiseConvolutionLayer3x3Kernel if 3x3 and no assembly kernel implementation is present - * -# @ref NEDepthwiseConvolutionAssemblyDispatch if assembly kernel implementation is present + * -# @ref cpu::CpuDepthwiseConvolutionAssemblyDispatch if assembly kernel implementation is present * -# @ref NEDirectConvolutionLayerOutputStageKernel if re-quantization of output is required * -# @ref NEActivationLayer if fused activation is required * @@ -127,9 
+140,11 @@ private: /** Default move constructor */ NEDepthwiseConvolutionLayerOptimizedInternal(NEDepthwiseConvolutionLayerOptimizedInternal &&) = default; /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEDepthwiseConvolutionLayerOptimizedInternal &operator=(const NEDepthwiseConvolutionLayerOptimizedInternal &) = delete; + NEDepthwiseConvolutionLayerOptimizedInternal & + operator=(const NEDepthwiseConvolutionLayerOptimizedInternal &) = delete; /** Default move assignment operator */ - NEDepthwiseConvolutionLayerOptimizedInternal &operator=(NEDepthwiseConvolutionLayerOptimizedInternal &&) = default; + NEDepthwiseConvolutionLayerOptimizedInternal & + operator=(NEDepthwiseConvolutionLayerOptimizedInternal &&) = default; /** Default destructor */ ~NEDepthwiseConvolutionLayerOptimizedInternal() = default; /** Initialize the function's source, destination, kernels and border_size. @@ -144,8 +159,14 @@ private: * @param[in] act_info (Optional) Activation layer information in case of a fused activation. * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). 
*/ - void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U)); + void configure(ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier = 1, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + const Size2D &dilation = Size2D(1U, 1U)); /** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseConvolutionLayer3x3 * @@ -161,8 +182,14 @@ private: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U)); + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier = 1, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + const Size2D &dilation = Size2D(1U, 1U)); // Inherited methods overriden: void run() override; @@ -207,8 +234,14 @@ private: * @param[in] act_info (Optional) Activation layer information in case of a fused activation. * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). 
*/ - void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U)); + void configure(ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier = 1, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + const Size2D &dilation = Size2D(1U, 1U)); /** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseConvolutionLayerGeneric * @@ -225,8 +258,14 @@ private: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U)); + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + unsigned int depth_multiplier = 1, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + const Size2D &dilation = Size2D(1U, 1U)); // Inherited methods overriden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h b/arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h index 2affa8d49e..7a94833d10 100644 --- a/arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h +++ b/arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h @@ -24,13 +24,12 @@ #ifndef ARM_COMPUTE_NE_DETECTION_POSTPROCESS_H #define ARM_COMPUTE_NE_DETECTION_POSTPROCESS_H -#include "arm_compute/runtime/NEON/INESimpleFunction.h" - #include "arm_compute/core/Types.h" #include 
"arm_compute/runtime/CPP/functions/CPPDetectionPostProcessLayer.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h" +#include "arm_compute/runtime/NEON/INESimpleFunction.h" #include "arm_compute/runtime/Tensor.h" #include <map> @@ -78,8 +77,14 @@ public: * * @note Output contains all the detections. Of those, only the ones selected by the valid region are valid. */ - void configure(const ITensor *input_box_encoding, const ITensor *input_score, const ITensor *input_anchors, - ITensor *output_boxes, ITensor *output_classes, ITensor *output_scores, ITensor *num_detection, DetectionPostProcessLayerInfo info = DetectionPostProcessLayerInfo()); + void configure(const ITensor *input_box_encoding, + const ITensor *input_score, + const ITensor *input_anchors, + ITensor *output_boxes, + ITensor *output_classes, + ITensor *output_scores, + ITensor *num_detection, + DetectionPostProcessLayerInfo info = DetectionPostProcessLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEDetectionPostProcessLayer * * @param[in] input_box_encoding The bounding box input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F32. 
@@ -93,8 +98,13 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input_box_encoding, const ITensorInfo *input_class_score, const ITensorInfo *input_anchors, - ITensorInfo *output_boxes, ITensorInfo *output_classes, ITensorInfo *output_scores, ITensorInfo *num_detection, + static Status validate(const ITensorInfo *input_box_encoding, + const ITensorInfo *input_class_score, + const ITensorInfo *input_anchors, + ITensorInfo *output_boxes, + ITensorInfo *output_classes, + ITensorInfo *output_scores, + ITensorInfo *num_detection, DetectionPostProcessLayerInfo info = DetectionPostProcessLayerInfo()); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h index 70352fdfaa..3ae3b2a15c 100644 --- a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,6 +25,7 @@ #define ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYER_H #include "arm_compute/core/Types.h" +#include "arm_compute/function_info/ActivationLayerInfo.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/MemoryGroup.h" @@ -84,7 +85,12 @@ public: * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
*/ - void configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(ITensor *input, + const ITensor *weights, + const ITensor *bias, + ITensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEDirectConvolutionLayer * * @note: DirectConvolution only works in the following configurations: @@ -105,7 +111,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &conv_info, + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *output, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: diff --git a/arm_compute/runtime/NEON/functions/NEElementwiseOperations.h b/arm_compute/runtime/NEON/functions/NEElementwiseOperations.h index 95274bdb0c..ebf2277d1f 100644 --- a/arm_compute/runtime/NEON/functions/NEElementwiseOperations.h +++ b/arm_compute/runtime/NEON/functions/NEElementwiseOperations.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,6 +25,7 @@ #define ARM_COMPUTE_NEELEMENTWISEOPERATIONS_H #include "arm_compute/core/Types.h" +#include "arm_compute/function_info/ActivationLayerInfo.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/NEON/INEOperator.h" @@ -72,7 +73,10 @@ public: * @param[out] output Output tensor. Data types supported: Same as @p input1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. 
*/ - void configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(ITensor *input1, + ITensor *input2, + ITensor *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for max * * @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. @@ -82,7 +86,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run() override; @@ -132,7 +139,10 @@ public: * @param[out] output Output tensor. Data types supported: Same as @p input1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. */ - void configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(ITensor *input1, + ITensor *input2, + ITensor *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for min * * @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. 
@@ -142,7 +152,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run() override; @@ -192,7 +205,10 @@ public: * @param[out] output Output tensor. Data types supported: Same as @p input1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. */ - void configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(ITensor *input1, + ITensor *input2, + ITensor *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for squared difference * * @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. @@ -202,7 +218,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run() override; @@ -248,7 +267,10 @@ public: * @param[out] output Output tensor. Data types supported: Same as @p input1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. 
*/ - void configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(ITensor *input1, + ITensor *input2, + ITensor *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for division * * @param[in] input1 First tensor input info. Data types supported: F16/F32. @@ -258,7 +280,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run() override; @@ -305,7 +330,10 @@ public: * @param[out] output Output tensor. Data types supported: Same as @p input1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. */ - void configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(ITensor *input1, + ITensor *input2, + ITensor *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for power * * @param[in] input1 First tensor input info. Data types supported: F16/F32. 
@@ -315,7 +343,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run() override; @@ -376,7 +407,8 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation op); + static Status + validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation op); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEFFT1D.h b/arm_compute/runtime/NEON/functions/NEFFT1D.h index 9654b1e604..99c6fd4eb4 100644 --- a/arm_compute/runtime/NEON/functions/NEFFT1D.h +++ b/arm_compute/runtime/NEON/functions/NEFFT1D.h @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_NEFFT1D_H #define ARM_COMPUTE_NEFFT1D_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/runtime/FunctionDescriptors.h" +#include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/Tensor.h" diff --git a/arm_compute/runtime/NEON/functions/NEFFT2D.h b/arm_compute/runtime/NEON/functions/NEFFT2D.h index 57f38d1942..cefd3df17a 100644 --- a/arm_compute/runtime/NEON/functions/NEFFT2D.h +++ b/arm_compute/runtime/NEON/functions/NEFFT2D.h @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_NEFFT2D_H #define ARM_COMPUTE_NEFFT2D_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/runtime/FunctionDescriptors.h" +#include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/NEON/functions/NEFFT1D.h" #include "arm_compute/runtime/Tensor.h" diff --git 
a/arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h index c5f4d45b6b..84bfe6b02f 100644 --- a/arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_NEFFTCONVOLUTIONLAYER_H #define ARM_COMPUTE_NEFFTCONVOLUTIONLAYER_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h" #include "arm_compute/runtime/NEON/functions/NEFFT2D.h" @@ -94,8 +93,13 @@ public: * @param[in] act_info (Optional) Activation layer information in case of a fused activation. * @param[in] enable_fast_math (Optional) Enable fast math computation. Unused for CPU backend. */ - void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); + void configure(ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false); /** Static function to check if given info will lead to a valid configuration of @ref NEFFTConvolutionLayer * * @note: This function only works with any square kernel size and unit strides for both NCHW and NHWC data layout @@ -113,8 +117,13 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); + static Status validate(const ITensorInfo *input, + const ITensorInfo 
*weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEFill.h b/arm_compute/runtime/NEON/functions/NEFill.h index e923ce33e1..1829c71fef 100644 --- a/arm_compute/runtime/NEON/functions/NEFill.h +++ b/arm_compute/runtime/NEON/functions/NEFill.h @@ -24,10 +24,9 @@ #ifndef ARM_COMPUTE_NEFILL_H #define ARM_COMPUTE_NEFILL_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" #include <memory> diff --git a/arm_compute/runtime/NEON/functions/NEFillBorder.h b/arm_compute/runtime/NEON/functions/NEFillBorder.h index ab77c28839..44b1d4a62b 100644 --- a/arm_compute/runtime/NEON/functions/NEFillBorder.h +++ b/arm_compute/runtime/NEON/functions/NEFillBorder.h @@ -27,6 +27,7 @@ #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" + #include <memory> namespace arm_compute @@ -57,7 +58,10 @@ public: * @param[in] border_mode Strategy to use for borders. * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. 
*/ - void configure(ITensor *input, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue()); + void configure(ITensor *input, + unsigned int border_width, + BorderMode border_mode, + const PixelValue &constant_border_value = PixelValue()); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEFloor.h b/arm_compute/runtime/NEON/functions/NEFloor.h index 4d47b068db..77ac484bab 100644 --- a/arm_compute/runtime/NEON/functions/NEFloor.h +++ b/arm_compute/runtime/NEON/functions/NEFloor.h @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_NEFLOOR_H #define ARM_COMPUTE_NEFLOOR_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" #include <memory> diff --git a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h index 9727e108a5..885f8430cf 100644 --- a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h +++ b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -24,16 +24,15 @@ #ifndef ARM_COMPUTE_NEFULLYCONNECTEDLAYER_H #define ARM_COMPUTE_NEFULLYCONNECTEDLAYER_H +#include "arm_compute/function_info/FullyConnectedLayerInfo.h" #include "arm_compute/runtime/IFunction.h" - -#include "arm_compute/runtime/MemoryGroup.h" -#include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h" -#include "arm_compute/runtime/NEON/functions/NEFlattenLayer.h" -#include "arm_compute/runtime/NEON/functions/NEGEMM.h" -#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" +#include "arm_compute/runtime/IMemoryManager.h" +#include "arm_compute/runtime/IWeightsManager.h" #include "arm_compute/runtime/NEON/functions/NETranspose.h" #include "arm_compute/runtime/Tensor.h" +#include <memory> + namespace arm_compute { namespace weights_transformations @@ -77,10 +76,10 @@ private: } // namespace weights_transformations /** Basic function to compute a Fully Connected layer. This function calls the following kernels: - * -# @ref NEIm2ColKernel (called when the input comes from a convolutional layer) + * -# @ref cpu::kernels::CpuIm2ColKernel (called when the input comes from a convolutional layer) * -# @ref NETranspose (if @p are_weights_reshaped is set to false and transpose_weights is set to true ) (called once) - * -# @ref NEGEMMMatrixMultiplyKernel or @ref NEGEMMLowpMatrixMultiplyCore (if quantized asymmetric) - * -# @ref NEGEMMMatrixAdditionKernel or @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint (if quantized asymmetric) (if @p biases is not equal to nullptr) + * -# @ref NEGEMM or @ref NEGEMMLowpMatrixMultiplyCore (if quantized asymmetric) + * -# @ref cpu::kernels::CpuGemmMatrixAdditionKernel or @ref NEGEMMLowpOutputStage (if quantized asymmetric) (if @p biases is not equal to nullptr) * * @note The fully connected layer accepts "weights" tensors only with 2 dimensions. 
*/ @@ -88,7 +87,8 @@ class NEFullyConnectedLayer : public IFunction { public: /** Constructor */ - NEFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr, IWeightsManager *weights_manager = nullptr); + NEFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr, + IWeightsManager *weights_manager = nullptr); /** Prevent instances of this class from being copied (As this class contains pointers) */ NEFullyConnectedLayer(const NEFullyConnectedLayer &) = delete; /** Prevent instances of this class from being moved (As this class contains pointers) */ @@ -113,66 +113,65 @@ public: * |QASYMM8 |QASYMM8 |S32 |QASYMM8 | * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | * - * @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[in] weights Weights tensor. The weights must be 2 dimensional. - * If this function is called after a Convolution Layer, the (transposed) weights will have as many rows as the product of the first 3 input's dimensions. - * If it is called after another FullyConnected Layer, the (transposed) weights will have as many rows as the input's first dimension. - * Data type supported: Same as @p input. - * @param[in] biases Bias tensor. Can be nullptr. Data type supported: Same as @p weights, S32 if @p weights is QASYMM8/QASYMM8_SIGNED. - * @param[out] output Destination tensor. Its shape should be equal to the output of a matrix multiplication between: - * - The output of im2col on the input and the (transposed) 2D weights, if the function is called after a Convolution Layer - * - The input tensor and the (transposed) 2D weights, if the function is called after another FullyConnected Layer. - * Data type supported: Same as @p input. - * @param[in] fc_info (Optional) Fully connected layer additional info + * @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] weights Weights tensor. The weights must be 2 dimensional. 
+ * If this function is called after a Convolution Layer, the (transposed) weights will have as many rows as the product of the first 3 input's dimensions. + * If it is called after another FullyConnected Layer, the (transposed) weights will have as many rows as the input's first dimension. + * Data type supported: Same as @p input. + * @param[in] biases Bias tensor. Can be nullptr. Data type supported: Same as @p weights, S32 if @p weights is QASYMM8/QASYMM8_SIGNED. + * @param[out] output Destination tensor. Its shape should be equal to the output of a matrix multiplication between: + * - The output of im2col on the input and the (transposed) 2D weights, if the function is called after a Convolution Layer + * - The input tensor and the (transposed) 2D weights, if the function is called after another FullyConnected Layer. + * Data type supported: Same as @p input. + * @param[in] fc_info (Optional) Fully connected layer additional info + * @param[in] weights_info (Optional) Stores neccessary compute information when weights are already reshaped */ - void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, - FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); + void configure(const ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(), + const WeightsInfo &weights_info = WeightsInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEFullyConnectedLayer * - * @param[in] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[in] weights Weights tensor info. The weights must be 2 dimensional. - * If this function is called after a Convolution Layer, the (transposed) weights will have as many rows as the product of the first 3 input's dimensions. 
- * If it is called after another FullyConnected Layer, the (transposed) weights will have as many rows as the input's first dimension. - * Data type supported: Same as @p input. - * @param[in] biases Bias tensor. Can be nullptr. Data type supported: Same as @p weights, S32 if @p weights is QASYMM8/QASYMM8_SIGNED. - * @param[in] output Destination tensor info. Its shape should be equal to the output of a matrix multiplication between: - * - The output of im2col on the input and the (transposed) 2D weights, if the function is called after a Convolution Layer - * - The input tensor and the (transposed) 2D weights, if the function is called after another FullyConnected Layer. - * Data type supported: Same as @p input. - * @param[in] fc_info (Optional) Fully connected layer additional info + * Similar to @ref NEFullyConnectedLayer::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(), + const WeightsInfo &weights_info = WeightsInfo()); + + /** Static function that queries whether fixed-format kernel exists for a given problem description + * + * @param[out] expected_weight_format Format in which weights should be for found fixed format kernel + * @param[in] input Source tensor + * @param[in] weights Weights tensor. + * @param[in] biases Bias tensor. Can be nullptr. Data type supported: Same as @p weights, S32 if @p weights is QASYMM8/QASYMM8_SIGNED. 
+ * @param[in] output Destination tensor + * @param[in] fc_info Fully connected layer additional info + * @param[in] weights_info Describes weights shape * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, - FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); + static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const FullyConnectedLayerInfo &fc_info, + const WeightsInfo &weights_info); //Inherited methods override void run() override; void prepare() override; private: - void configure_fc_fc(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const ActivationLayerInfo &act); - void configure_conv_fc(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const ActivationLayerInfo &act); - void configure_mm(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const ActivationLayerInfo &act); - - MemoryGroup _memory_group; - IWeightsManager *_weights_manager; - NEFlattenLayer _flatten; - NEConvertFullyConnectedWeights _convert_weights; - weights_transformations::NEConvertFullyConnectedWeightsManaged _convert_weights_managed; - NETranspose _reshape_weights_function; - weights_transformations::NEFullyConnectedLayerReshapeWeightsManaged _reshape_weights_managed_function; - NEGEMM _mm_gemm; - NEGEMMLowpMatrixMultiplyCore _mm_gemmlowp; - Tensor _flatten_output; - Tensor _converted_weights_output; - Tensor _reshape_weights_output; - const ITensor *_original_weights; - bool _are_weights_converted; - bool _are_weights_reshaped; - bool _is_fc_after_conv; - bool _is_quantized_asymmetric; - bool _is_prepared; + struct Impl; + std::unique_ptr<Impl> _impl; }; } // namespace arm_compute #endif /* ARM_COMPUTE_NEFULLYCONNECTEDLAYER_H 
*/ diff --git a/arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h b/arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h index 3dd7f49044..f53b3de7f6 100644 --- a/arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h +++ b/arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h @@ -75,9 +75,16 @@ public: * @param[in] epsilon (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f. * @param[in] fbn_type (Optional) Fused batch normalization type. Defaults to Convolution. */ - void configure(const ITensor *input_weights, const ITensor *bn_mean, const ITensor *bn_var, ITensor *fused_weights, ITensor *fused_bias, - const ITensor *input_bias = nullptr, const ITensor *bn_beta = nullptr, const ITensor *bn_gamma = nullptr, - float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); + void configure(const ITensor *input_weights, + const ITensor *bn_mean, + const ITensor *bn_var, + ITensor *fused_weights, + ITensor *fused_bias, + const ITensor *input_bias = nullptr, + const ITensor *bn_beta = nullptr, + const ITensor *bn_gamma = nullptr, + float epsilon = 0.001f, + FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); /** Static function to check if given info will lead to a valid configuration of @ref NEFuseBatchNormalization * * @param[in] input_weights Input weights tensor info for convolution or depthwise convolution layer. Data type supported: F16/F32. 
Data layout supported: NCHW, NHWC @@ -95,10 +102,16 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var, - const ITensorInfo *fused_weights, const ITensorInfo *fused_bias, - const ITensorInfo *input_bias = nullptr, const ITensorInfo *bn_beta = nullptr, const ITensorInfo *bn_gamma = nullptr, - float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); + static Status validate(const ITensorInfo *input_weights, + const ITensorInfo *bn_mean, + const ITensorInfo *bn_var, + const ITensorInfo *fused_weights, + const ITensorInfo *fused_bias, + const ITensorInfo *input_bias = nullptr, + const ITensorInfo *bn_beta = nullptr, + const ITensorInfo *bn_gamma = nullptr, + float epsilon = 0.001f, + FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEGEMM.h b/arm_compute/runtime/NEON/functions/NEGEMM.h index 6fa30bd545..29650a5eca 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMM.h +++ b/arm_compute/runtime/NEON/functions/NEGEMM.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -24,44 +24,18 @@ #ifndef ARM_COMPUTE_NEGEMM_H #define ARM_COMPUTE_NEGEMM_H -#include "arm_compute/core/ITensorPack.h" +#include "arm_compute/function_info/GEMMInfo.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/IWeightsManager.h" -#include "arm_compute/runtime/MemoryGroup.h" -#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" -#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h" -#include "arm_compute/runtime/Tensor.h" #include <memory> namespace arm_compute { -// Forward declarations -class NEGEMMInterleave4x4Kernel; -class NEGEMMMatrixAdditionKernel; -class NEGEMMMatrixMultiplyKernel; -class NEGEMMTranspose1xWKernel; -namespace cpu -{ -class CpuGemmAssemblyDispatch; -} - /** Basic function to execute GEMM. This function calls the following kernels: * - * If optimized assembly is available: - * -# @ref cpu::CpuGemmAssemblyDispatch - * -# @ref NEActivationLayer (if alpha != 1.0) - * Else: - * -# @ref NEGEMMInterleave4x4Kernel (if the output tensor is a matrix) - * -# @ref NEGEMMTranspose1xWKernel (if the output tensor is a matrix) - * -# @ref NEGEMMMatrixMultiplyKernel - * In both cases: - * -# @ref NEGEMMMatrixAdditionKernel (if c != nullptr and beta != 0.0 and is not reshaped once) - * Else: - * -# @ref NEArithmeticAddition (if c != nullptr and is reshaped once and not optimized assembly in place) - * - * -# @ref NEActivationLayer (if activation is specified in GEMMInfo) + * -# @ref cpu::CpuGemm */ class NEGEMM : public IFunction { @@ -93,6 +67,8 @@ public: * @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C]. * @note GEMM: The tensors a, b, c, d must have the same data type. You should not mix data types when calling this function. * + * @note Batched GEMM only supports broadcasting cases where RHS rank < LHS rank but not the other way around + * * @param[in] a First input tensor (Matrix A or Vector A). 
Data type supported: BFLOAT16/F16/F32 * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a * @param[in] c Third input tensor (Matrix C). It can be a nullptr if just the multiplication between @p a and @p b is needed. Data type supported: same as @p a @@ -102,51 +78,49 @@ public: * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and * if the reshape of matrix B should happen only for the first run */ - void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo()); + void configure(const ITensor *a, + const ITensor *b, + const ITensor *c, + ITensor *d, + float alpha, + float beta, + const GEMMInfo &gemm_info = GEMMInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEGEMM. * - * @param[in] a First input tensor info (Matrix or Vector A). Data types supported: BFLOAT16/F16/F32 - * @param[in] b Second input tensor info (Matrix B). Data type supported: same as @p a. - * @param[in] c Third input tensor info (Matrix C). It can be a nullptr if just the multiplication between @p a and @p b is needed. Data type supported: same as @p a. - * @param[out] output Output tensor info. 
Data type supported: same as @p a - * @param[in] alpha Weight of the matrix product - * @param[in] beta Weight of matrix C - * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and - * if the reshape of matrix B should happen only for the first run + * Similar to @ref NEGEMM::configure() * * @return a status */ - static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo()); + static Status validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info = GEMMInfo()); + + /** Static function that queries whether there exists fixed-format kernel and if it exists it will return in the first argument in what format + * weights are expected to be reshaped as defined by WeightFormat class. Apart from the first argument the rest of the arguments are the same + * as in @ref NEGEMM::validate() except that all arguments are required. 
+ * + * @return a status + */ + static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info = GEMMInfo()); // Inherited methods overridden: void run() override; void prepare() override; private: - MemoryGroup _memory_group; - IWeightsManager *_weights_manager; - std::unique_ptr<NEGEMMInterleave4x4Kernel> _interleave_kernel; - std::unique_ptr<NEGEMMTranspose1xWKernel> _transpose_kernel; - std::unique_ptr<NEGEMMMatrixMultiplyKernel> _mm_kernel; - std::unique_ptr<cpu::CpuGemmAssemblyDispatch> _asm_glue; - std::unique_ptr<NEGEMMMatrixAdditionKernel> _ma_kernel; - NEActivationLayer _alpha_scale_func; - NEArithmeticAddition _add_bias; - NEActivationLayer _activation_func; - - Tensor _tmp_a; - Tensor _tmp_b; - Tensor _tmp_d; - const ITensor *_original_b; - bool _run_vector_matrix_multiplication; - bool _run_alpha_scale; - bool _run_addition; - bool _run_bias_addition; - bool _run_activation; - bool _reshape_b_only_on_first_run; - bool _is_prepared; - - ITensorPack _asm_glue_tensors{}; + struct Impl; + std::unique_ptr<Impl> _impl; }; } // namespace arm_compute #endif /*ARM_COMPUTE_NEGEMM_H */ diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h b/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h index f39ce4dfa3..d1c5a1c9b3 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h +++ b/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h @@ -29,15 +29,12 @@ #include "arm_compute/runtime/IMemoryManager.h" #include <memory> + namespace arm_compute { // Forward declarations class ITensor; class ITensorInfo; -namespace cpu -{ -class CpuGemmAssemblyDispatch; -} /** Basic function to compute the convolution layer. This function calls the following kernels/functions: * @@ -89,7 +86,8 @@ public: * Data types supported: Same as @p input. 
* @param[in] info Convolution layer descriptor */ - void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv2dInfo &info); + void + configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv2dInfo &info); /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConv2d * * @param[in] input Source tensor info. 3 lower dimensions represent a single input [width, height, IFM], @@ -105,7 +103,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const Conv2dInfo &info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const Conv2dInfo &info); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h index e89eae1d31..3e84c3e2cf 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2023 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -24,147 +24,31 @@ #ifndef ARM_COMPUTE_NEGEMMCONVOLUTIONLAYER_H #define ARM_COMPUTE_NEGEMMCONVOLUTIONLAYER_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/function_info/ActivationLayerInfo.h" +#include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/IWeightsManager.h" #include "arm_compute/runtime/MemoryGroup.h" -#include "arm_compute/runtime/NEON/functions/NEGEMM.h" -#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" -#include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h" -#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h" -#include "arm_compute/runtime/Tensor.h" #include <memory> namespace arm_compute { class ITensor; -class NECol2ImKernel; -class NEIm2ColKernel; -class NEWeightsReshapeKernel; - -/** Function to reshape the weights. This function calls the following kernel: - * -# @ref NEWeightsReshapeKernel - */ -class NEConvolutionLayerReshapeWeights : public IFunction -{ -public: - /** Constructor */ - NEConvolutionLayerReshapeWeights() noexcept; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEConvolutionLayerReshapeWeights(const NEConvolutionLayerReshapeWeights &) = delete; - /** Prevent instances of this class from being moved (As this class contains non movable objects) */ - NEConvolutionLayerReshapeWeights(NEConvolutionLayerReshapeWeights &&) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEConvolutionLayerReshapeWeights &operator=(const NEConvolutionLayerReshapeWeights &) = delete; - /** Prevent instances of this class from being moved (As this class contains non movable objects) */ - NEConvolutionLayerReshapeWeights &operator=(NEConvolutionLayerReshapeWeights &&) = delete; - /** Default destructor */ - ~NEConvolutionLayerReshapeWeights(); 
- /** Set the input and output tensors. - * - * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. - * Data type supported: All. - * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. - * Data type supported: same as @p weights. - * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types. - * @param[out] output Destination tensor. Data types supported: same as @p weights. - */ - void configure(const ITensor *weights, const ITensor *biases, ITensor *output); - /** Static function to check if given info will lead to a valid configuration of @ref NEConvolutionLayerReshapeWeights - * - * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. - * Data type supported: All. - * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. - * Data type supported: same as @p weights. - * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types. - * @param[in] output Destination tensor. Data types supported: same as @p weights. 
- * - * @return an error status - */ - static Status validate(const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output); - - // Inherited methods overridden: - void run() override; - -private: - std::unique_ptr<NEWeightsReshapeKernel> _weights_reshape_kernel; -}; - -namespace weights_transformations -{ -/** Basic function to manage the reshape weights generated from @ref NEConvolutionLayerReshapeWeights */ -class NEConvolutionLayerReshapeWeightsTransform : public ITransformWeights -{ -public: - /** Constructor */ - NEConvolutionLayerReshapeWeightsTransform() = default; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEConvolutionLayerReshapeWeightsTransform(const NEConvolutionLayerReshapeWeightsTransform &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEConvolutionLayerReshapeWeightsTransform &operator=(const NEConvolutionLayerReshapeWeightsTransform &) = delete; - /** Prevent instances of this class from being moved (As this class contains non movable objects) */ - NEConvolutionLayerReshapeWeightsTransform(NEConvolutionLayerReshapeWeightsTransform &&) = delete; - /** Prevent instances of this class from being moved (As this class contains non movable objects) */ - NEConvolutionLayerReshapeWeightsTransform &operator=(NEConvolutionLayerReshapeWeightsTransform &&) = delete; - /** Default destructor */ - ~NEConvolutionLayerReshapeWeightsTransform() = default; - void configure(const ITensor *input, const ITensor *biases) - { - _bias_bit = (biases != nullptr) ? 
1 : 0; - _func.configure(input, biases, &_output); - } - - void run() override - { - _output.allocator()->allocate(); - _func.run(); - _reshape_run = true; - } - - ITensor *get_weights() override - { - return &_output; - } - - void release() override - { - _output.allocator()->free(); - } - - uint32_t uid() override - { - return ((0x8) | (_bias_bit << 7)); - } - - bool is_reshape_run() - { - return _reshape_run; - } - -private: - Tensor _output{}; - NEConvolutionLayerReshapeWeights _func{}; - int32_t _bias_bit{ 0 }; -}; -} // namespace weights_transformations +class ITensorInfo; /** Basic function to compute the convolution layer. This function calls the following kernels/functions: * - * -# @ref NEIm2ColKernel - * -# @ref NEGEMM (if the data type is BFLOAT16/FP16/FP32) - * -# @ref NEGEMMLowpMatrixMultiplyCore (if the data type is QASYMM8/QASYMM8_SIGNED) - * -# @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint (if the data type is QASYMM8/QASYMM8_SIGNED) - * -# @ref NEArithmeticAddition (if biases != nullptr and we have a 1x1 convolution with the NHWC data layout) - * -# @ref NECol2ImKernel (if NCHW data layout) + * -# @ref cpu::CpuGemmConv2d * */ class NEGEMMConvolutionLayer : public IFunction { public: /** Constructor */ - NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager = nullptr, IWeightsManager *weights_manager = nullptr); + NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager = nullptr, + IWeightsManager *weights_manager = nullptr); /** Prevent instances of this class from being copied (As this class contains pointers) */ NEGEMMConvolutionLayer(const NEGEMMConvolutionLayer &) = delete; /** Prevent instances of this class from being moved (As this class contains non movable objects) */ @@ -192,118 +76,139 @@ public: * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED | * - * @param[in] input Source tensor. 
3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. - * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32. - * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. - * Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32. - * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. - * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type. - * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. - * Data types supported: Same as @p input. - * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. - * @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights - * tensor has also been transposed with NEGEMMTranspose1xWKernel. Data type supported: Same as @p input. - * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. - * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported + * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. + * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32. + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. 
+ * Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32. + * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. + * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type. + * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. + * Data types supported: Same as @p input. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + * @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights + * tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p input. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. + * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation + * available which may introduce a drop of accuracy as well. Default is false + * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. 
num_groups != 1 is not supported */ - void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(), - const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), unsigned int num_groups = 1); + void configure(const ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false, + unsigned int num_groups = 1); /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConvolutionLayer * - * @param[in] input Source tensor info. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. - * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32. - * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. - * Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32. - * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. - * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type. - * @param[in] output Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. - * Data types supported: Same as @p input. - * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. - * @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. 
If this is not part of the fully connected layer the weights - * tensor has also been transposed with NEGEMMTranspose1xWKernel. Data type supported: Same as @p input. - * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. - * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported + * @param[in] input Source tensor info. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. + * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32. + * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. + * Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32. + * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. + * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type. + * @param[in] output Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. + * Data types supported: Same as @p input. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + * @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights + * tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p input. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). 
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. + * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation + * available which may introduce a drop of accuracy as well. Default is false + * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), unsigned int num_groups = 1); - - // Inherited methods overridden: - void run() override; - void prepare() override; - -private: - /** Configures the appropriate matrix multiply routine + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false, + unsigned int num_groups = 1); + + /** Static function to check if there is an optimized version of + * GEMM available for the input parameters. * - * @param[in] input Input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32. - * @param[in] weights Weights tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32. - * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. 
- * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type. - * @param[out] output Output tensor. Data types supported: Same as @p input, - * except for input of QASYMM8/QASYMM8_SIGNED type where output should be of S32 type. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. - * @param[in] gemm_3d_depth (Optional) Depth of GEMM 3D (Defaults to 1) - */ - void configure_mm(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo(), int gemm_3d_depth = 1); - /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConvolutionLayer matrix multiply routines + * The method is intended to be used to find out the optimal + * memory layout to be used for the weights tensor when running + * variable weights execution. * - * @param[in] input Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32. - * @param[in] weights Weights tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32. - * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. - * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type. - * @param[in] output Output tensor info. Data types supported: Same as @p input, - * except for input of QASYMM8/QASYMM8_SIGNED type where output should be of S32 type. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. - * @param[in] gemm_3d_depth (Optional) Depth of GEMM 3D (Defaults to 1) - * @param[in] skip_im2col (Optional) Flag which specifies if im2col has to be skipped. i.e. 
1x1 convolution with NHWC data layout. (Default to false) + * The user can query the database of optimised kernels in + * arm_gemm by specifying one of the enumerations of + * arm_compute::WeightFormat in the weight_format field of the input + * parameter weights_info. In case of success, the method + * writes the expected format in the output parameter + * expected_weight_format. The expected_weight_format can then be + * used in the configure method of the class for retrieving the + * optimal kernel. * - * @return a status - */ - static Status validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo(), - int gemm_3d_depth = 1, bool skip_im2col = false); - /** Static function to check if GEMM3D is supported in @ref NEGEMM or in @ref NEGEMMLowpMatrixMultiplyCore + * Use case one - query for a specific format: * - * @param[in] input_info Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32. - * @param[in] weights_info Weights tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32. - * @param[in] act_info Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. - * @param[in] gemm_3d_depth Depth of GEMM 3D - * @param[in] skip_im2col Flag which specifies if im2col has to be skipped. i.e. 1x1 convolution with NHWC data layout + * WeightsInfo weights_info(..., arm_compute::WeightFormat::OHWIo4, ...); // Set the value of the input query. + * if (NEGEMMConvolutionLayer::has_opt_impl(WeightFormat(), ...., weights_info, ...)) + * { + * auto conv = std::make_unique<NEGEMMConvolutionLayer>(); + * conv->configure(..., weights_info, ...); // uses the same WeightFormat the user wanted originally, OHWIo4. 
+ * conv->run(...); + * } * - * @return a status + * Use case two - query for any format that would be optimal for the GEMM to execute: + * + * WeightsInfo weights_info(..., arm_compute::WeightFormat::ANY, ...); // Set the value of the input query. + * arm_compute::WeightFormat expected_wf; + * if (NEGEMMConvolutionLayer::has_opt_impl(expected_wf, ...., weights_info, ...)) + * { + * auto conv = std::make_unique<NEGEMMConvolutionLayer>(); + * // ... code to convert the layout of the weights tensor to the layout returned by has_opt_impl + * WeightsInfo new_weights_info(..., expected_wf, ...); // Set the value of the WeightFormat returned by has_opt_impl. + * conv->configure(..., new_weights_info, ...); + * conv->run(...); + * } + * + * Notice that a GEMM configured with a WeightFormat other than + * UNSPECIFIED will run GEMM in variable weights mode. + * + * @param[out] expected_weight_format The arm_compute::WeightFormat expected by the kernel. + * @param[in] src Source tensor info. + * @param[in] weights Weights tensor info. + * @param[in] biases Biases tensor info. Shared biases supported. + * @param[in] dst Destination tensor info. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + * @param[in] weights_info (Optional) Specifies additional configuration parameters for the weights of the GEMM computation. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported, as well as no activation (i.e. Linear), which is the default value. + * @param[in] enable_fast_math (Optional) Enable fast math computation. 
In case this flag were set, the function could dispatch the fastest implementation + * + * @return a Status */ - static Status validate_gemm3d(const ITensorInfo *input_info, const ITensorInfo *weights_info, const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col); + static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false); + // Inherited methods overridden: + void run() override; + void prepare() override; private: - MemoryGroup _memory_group; - IWeightsManager *_weights_manager; - NEConvolutionLayerReshapeWeights _reshape_weights; - weights_transformations::NEConvolutionLayerReshapeWeightsTransform _reshape_weights_managed; - std::unique_ptr<NEIm2ColKernel> _im2col_kernel; - NEGEMM _mm_gemm; - NEGEMMLowpMatrixMultiplyCore _mm_gemmlowp; - std::unique_ptr<NECol2ImKernel> _col2im_kernel; - NEReshapeLayer _reshape_layer; - - const ITensor *_original_weights; - const ITensor *_original_output; - - Tensor _im2col_output; - Tensor _weights_reshaped; - Tensor _gemm_output; - Tensor _gemm_output_3d; - Tensor _tmp_output; - - DataLayout _data_layout; - - bool _skip_im2col; - bool _skip_col2im; - bool _is_quantized; - bool _is_prepared; + struct Impl; + std::unique_ptr<Impl> _impl; }; } // namespace arm_compute -#endif /* ARM_COMPUTE_NECONVOLUTIONGEMMLAYER_H */ +#endif /* ARM_COMPUTE_NEGEMMCONVOLUTIONLAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h index dc9783f9eb..6d07675d3d 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h +++ 
b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2023-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,53 +21,34 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H -#define ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H +#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEGEMMLOWPMATRIXMULTIPLYCORE_H +#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEGEMMLOWPMATRIXMULTIPLYCORE_H -#include "NEActivationLayer.h" -#include "arm_compute/core/ITensorPack.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/function_info/GEMMInfo.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/IWeightsManager.h" -#include "arm_compute/runtime/MemoryGroup.h" -#include "arm_compute/runtime/Tensor.h" #include <memory> namespace arm_compute { class ITensor; -class NEConvertQuantizedSignednessKernel; -class NEGEMMInterleave4x4Kernel; -class NEGEMMLowpMatrixMultiplyKernel; -class NEGEMMLowpOffsetContributionKernel; -class NEGEMMLowpOffsetContributionOutputStageKernel; -class NEGEMMLowpMatrixAReductionKernel; -class NEGEMMLowpMatrixBReductionKernel; -class NEGEMMTranspose1xWKernel; -namespace cpu -{ -class CpuGemmAssemblyDispatch; -} +class ITensorInfo; -/** Basic function to execute GEMMLowpMatrixMultiplyCore. This function calls the following kernels if the DOT product instruction is not available: - * - * -# @ref NEGEMMInterleave4x4Kernel - * -# @ref NEGEMMTranspose1xWKernel - * -# @ref NEGEMMLowpMatrixMultiplyKernel - * -# @ref NEGEMMLowpOffsetContributionKernel - * -# @ref NEActivationLayer +/** Function to run Gemm on quantized types. 
* - * otherwise if the DOT product instruction is available: + * This function calls the following: * - * -# @ref NEGEMMLowpOffsetContributionKernel - * -*/ + * -# @ref cpu::CpuGemmLowpMatrixMultiplyCore + */ class NEGEMMLowpMatrixMultiplyCore : public IFunction { public: /** Constructor */ - NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager = nullptr, IWeightsManager *weights_manager = nullptr); + NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager = nullptr, + IWeightsManager *weights_manager = nullptr); /** Prevent instances of this class from being copied (As this class contains pointers) */ NEGEMMLowpMatrixMultiplyCore(const NEGEMMLowpMatrixMultiplyCore &) = delete; /** Default move constructor */ @@ -99,6 +80,7 @@ public: * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |S32 | * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |S32 | * |QASYMM8_SIGNED |QSYMM8 |S32 |S32 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |F32 |F32 | * * @note GEMM_LOWP: low precision GEMM kernel * This kernel performs the following computations: @@ -107,71 +89,36 @@ public: * -# Convert b values from QASYMM8 to int32 add b_offset to each of them. * -# Compute the matrix product of the resulting a * b in int32. * - * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is QASYMM8/QASYMM8_SIGNED otherwise + * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is QASYMM8/QASYMM8_SIGNED/F32 otherwise * * @param[in] a First input tensor (Matrix A). Data type supported: QASYMM8/QASYMM8_SIGNED. * @param[in] b Second input tensor (Matrix B). Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL. - * @param[in] c Third input tensor (Matrix C). It can be a nullptr. Data type supported: S32 - * @param[out] output Output tensor. Data type supported: Data type supported: S32/QASYMM8/QASYMM8_SIGNED + * @param[in] c Third input tensor (Matrix C). It can be a nullptr. 
Data type supported: S32/F32 + * @param[out] output Output tensor. Data type supported: S32/QASYMM8/QASYMM8_SIGNED/F32 * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and * if the reshape of matrix B should be executed only for the first run */ - void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output, const GEMMInfo &gemm_info = GEMMInfo()); + void configure( + const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output, const GEMMInfo &gemm_info = GEMMInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixMultiplyCore * - * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is QASYMM8/QASYMM8_SIGNED otherwise - * - * @param[in] a First input tensor info (Matrix A). Data type supported: QASYMM8/QASYMM8_SIGNED. - * @param[in] b Second input tensor info (Matrix B). Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL. - * @param[in] c Third input tensor info (Matrix C). It can be a nullptr. Data type supported: S32 - * @param[in] output Output tensor info. 
Data type supported: Data type supported: S32/QASYMM8/QASYMM8_SIGNED - * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and - * if the reshape of matrix B should be executed only for the first run + * Similar to @ref NEGEMMLowpMatrixMultiplyCore::configure() * * @return a status */ - static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo()); + static Status validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + const GEMMInfo &gemm_info = GEMMInfo()); // Inherited methods overridden void run() override; void prepare() override; private: - MemoryGroup _memory_group; - IWeightsManager *_weights_manager; - std::unique_ptr<cpu::CpuGemmAssemblyDispatch> _asm_glue; - std::unique_ptr<NEGEMMLowpMatrixMultiplyKernel> _mm_kernel; - std::unique_ptr<NEGEMMInterleave4x4Kernel> _mtx_a_reshape_kernel; - std::unique_ptr<NEGEMMTranspose1xWKernel> _mtx_b_reshape_kernel; - std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _mtx_a_reduction_kernel; - std::unique_ptr<NEGEMMLowpMatrixBReductionKernel> _mtx_b_reduction_kernel; - std::unique_ptr<NEGEMMLowpOffsetContributionKernel> _offset_contribution_kernel; - std::unique_ptr<NEGEMMLowpOffsetContributionOutputStageKernel> _offset_contribution_output_stage_kernel; - NEActivationLayer _activation_func; - std::unique_ptr<NEConvertQuantizedSignednessKernel> _convert_to_signed_asymm; - std::unique_ptr<NEConvertQuantizedSignednessKernel> _convert_from_signed_asymm; - - Tensor _vector_sum_col; - Tensor _vector_sum_row; - Tensor _tmp_a; - Tensor _tmp_b; - Tensor _mm_result_s32; - Tensor _signed_a; - Tensor _signed_output; - const ITensor *_original_b; - int32_t _a_offset; - int32_t _b_offset; - - bool _run_vector_matrix_multiplication; - bool _assembly_path; - bool _fused_assembly_path; - bool _reshape_b_only_on_first_run; - bool _is_prepared; - bool 
_fuse_output_stage; - bool _run_activation; - bool _flip_signedness; - - ITensorPack _asm_glue_tensors{}; + struct Impl; + std::unique_ptr<Impl> _impl; }; } // namespace arm_compute -#endif /*ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H */ +#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEGEMMLOWPMATRIXMULTIPLYCORE_H diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h index fa5f5e3826..0d932bb4af 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h +++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h @@ -25,7 +25,7 @@ #define ARM_COMPUTE_NEGEMMLOWPOUTPUTSTAGE_H #include "arm_compute/core/Types.h" -#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" +#include "arm_compute/runtime/IFunction.h" /** This file contains all available output stages for GEMMLowp. * @@ -39,237 +39,17 @@ namespace arm_compute { class ITensor; class ITensorInfo; - -/** Basic function to execute NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint. 
- * - * NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint depends on 3 parameters: - * - * result_fixedpoint_multiplier, result_shift, result_offset_after_shift - * - * The final result is: - * - * (FixedPointMul(input[i][k], result_fixedpoint_multiplier) >> result_shift) + result_offset_after_shift - * - * where FixedPointMul(x, y) is the nearest integer to the following - * mathematical expression, evaluated without overflow or intermediate rounding: - * - * (x * y) / 2^31 - * - * For more information: https://github.com/google/gemmlowp/blob/master/public/output_stages.h#L68 - * - * In case the bias tensor is provided, the final result is: - * - * ((FixedPointMul(input[i][k] + bias[k], result_fixedpoint_multiplier)) >> result_shift) + result_offset_after_shift - * - * This function calls the following kernels: - * - * -# @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel - * - * @note The function accepts also 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions - * after the result is shifted right by result_shift -*/ -class NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint : public INESimpleFunctionNoBorder -{ -public: - /** Constructor */ - NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint() = default; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint(const NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint &operator=(const NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint &) = delete; - /** Prevent instances of this class from being moved (As this class contains non movable objects) */ - NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint(NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint &&) = delete; - /** Prevent 
instances of this class from being moved (As this class contains non movable objects) */ - NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint &operator=(NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint &&) = delete; - /** Default destructor */ - ~NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint(); - /** Initialise the kernel's inputs, output - * - * @param[in] input Input tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] output Output tensor. Data type supported: Data type supported: QASYMM8 - * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add - * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication - * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8 - * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8. Defaults to the minimum possible 32-bit signed integer. - * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8, - * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer. - */ - void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, - int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max()); - /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint - * - * @param[in] input Input tensor. 
It is the output of @ref NEGEMMLowpMatrixMultiplyCore function. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[in] output Output tensor. Data type supported: Data type supported: QASYMM8 - * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8. Defaults to the minimum possible 32-bit signed integer. - * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8, - * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max()); -}; -/** Basic function to execute NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint. 
- * - * NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint depends on 3 parameters: - * - * result_fixedpoint_multiplier, result_shift, result_offset_after_shift - * - * The final result is: - * - * (FixedPointMul(input[i][k], result_fixedpoint_multiplier) >> result_shift) + result_offset_after_shift - * - * where FixedPointMul(x, y) is the nearest integer to the following - * mathematical expression, evaluated without overflow or intermediate rounding: - * - * (x * y) / 2^31 - * - * For more information: https://github.com/google/gemmlowp/blob/master/public/output_stages.h#L68 - * - * In case the bias tensor is provided, the final result is: - * - * ((FixedPointMul(input[i][k] + bias[k], result_fixedpoint_multiplier)) >> result_shift) + result_offset_after_shift - * - * This function calls the following kernels: - * - * -# @ref NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel - * - * @note The function accepts also 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions - * after the result is shifted right by result_shift -*/ -class NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint : public INESimpleFunctionNoBorder -{ -public: - /** Constructor */ - NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint() = default; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint(const NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint &operator=(const NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint &) = delete; - /** Prevent instances of this class from being moved (As this class contains non movable objects) */ - NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint(NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint &&) = delete; - /** Prevent instances of 
this class from being moved (As this class contains non movable objects) */ - NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint &operator=(NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint &&) = delete; - /** Default destructor */ - ~NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint(); - /** Initialise the kernel's inputs, output - * - * @param[in] input Input tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] output Output tensor. Data type supported: Data type supported: QASYMM8_SIGNED - * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add - * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication - * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8_SIGNED - * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED. Defaults to the minimum possible 32-bit signed integer. - * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED, - * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer. - */ - void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, - int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max()); - /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint - * - * @param[in] input Input tensor. 
It is the output of @ref NEGEMMLowpMatrixMultiplyCore function. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[in] output Output tensor. Data type supported: Data type supported: QASYMM8_SIGNED - * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED. Defaults to the minimum possible 32-bit signed integer. - * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED, - * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max()); -}; -/** Basic function to execute NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint. 
- * - * NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint depends on 2 parameters: - * - * result_fixedpoint_multiplier, result_shift - * - * The final result is: - * - * (FixedPointMul(input[i][k], result_fixedpoint_multiplier) >> result_shift) - * - * where FixedPointMul(x, y) is the nearest integer to the following - * mathematical expression, evaluated without overflow or intermediate rounding: - * - * (x * y) / 2^31 - * - * For more information: https://github.com/google/gemmlowp/blob/master/public/output_stages.h#L68 - * - * In case the bias tensor is provided, the final result is: - * - * ((FixedPointMul(input[i][k] + bias[k], result_fixedpoint_multiplier)) >> result_shift) + result_offset_after_shift - * - * This function calls the following kernels: - * - * -# @ref NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel - * - * @note The function accepts also 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions - * after the result is shifted right by result_shift -*/ -class NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint : public INESimpleFunctionNoBorder -{ -public: - /** Constructor */ - NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint() = default; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint(const NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint &operator=(const NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint &) = delete; - /** Prevent instances of this class from being moved (As this class contains non movable objects) */ - NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint(NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint &&) = delete; - /** Prevent instances of this class from being moved (As this class 
contains non movable objects) */ - NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint &operator=(NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint &&) = delete; - /** Default destructor */ - ~NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint(); - /** Initialise the kernel's inputs, output - * - * @param[in] input Input tensor. Data type supported: S32 - * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] output Output tensor. Data type supported: Data type supported: QSYMM16 - * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add - * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication - * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to the minimum possible 32-bit signed integer. - * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QSYMM16. - * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer. - */ - void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int min = std::numeric_limits<int32_t>::lowest(), - int max = std::numeric_limits<int32_t>::max()); - /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint - * - * @param[in] input Input tensor info. It is the output of @ref NEGEMMLowpMatrixMultiplyCore function. Data type supported: S32 - * @param[in] bias Biases tensor info. 
Only shared biases supported and it can be a nullptr if the addition of biases is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[in] output Output tensor info. Data type supported: Data type supported: QSYMM16 - * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to the minimum possible 32-bit signed integer. - * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QSYMM16, - * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer. - * - * @return a status - */ - static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max()); -}; - /** Basic function to execute GEMMLowpQuantizeDown kernels. 
* - * This function calls the following kernels: + * This function calls the following operators: * - * -# @ref NEGEMMLowpQuantizeDownInt32ScaleKernel - * -# @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel - * -# @ref NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel - * -# @ref NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel + * -# @ref cpu::CpuGemmLowpOutputStage */ -class NEGEMMLowpOutputStage : public INESimpleFunctionNoBorder +class NEGEMMLowpOutputStage : public IFunction { public: /** Constructor */ - NEGEMMLowpOutputStage() = default; + NEGEMMLowpOutputStage(); /** Prevent instances of this class from being copied (As this class contains pointers) */ NEGEMMLowpOutputStage(const NEGEMMLowpOutputStage &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ @@ -309,7 +89,17 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo &info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *bias, + const ITensorInfo *output, + const GEMMLowpOutputStageInfo &info); + + // Inherited methods overridden: + void run() override; + +private: + struct Impl; + std::unique_ptr<Impl> _impl; }; } // namespace arm_compute #endif /*ARM_COMPUTE_NEGEMMLOWPOUTPUTSTAGE_H */ diff --git a/arm_compute/runtime/NEON/functions/NEGather.h b/arm_compute/runtime/NEON/functions/NEGather.h index 393a38ee4d..9c7ae0134d 100644 --- a/arm_compute/runtime/NEON/functions/NEGather.h +++ b/arm_compute/runtime/NEON/functions/NEGather.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -49,18 +49,17 @@ public: * |All |All | * * @param[in] input Source tensor. Supported tensor rank: up to 4. Data type supported: All - * @param[in] indices Indices tensor. Supported tensor rank: up to 1. 
Must be one of the following type: U32/S32. Each value Must be in range [0, input.shape[@p axis]) + * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the following type: U32/S32. Each value must be in range [0, input.shape[@p axis]), otherwise the result will become unpredictable. + * @note The "axis" must be in the range [0, input.rank -1] when indices is a vector, and must be 1 when indices is a 2D or 3D tensor. * @param[out] output Destination tensor. Data type supported: Same as @p input * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Defaults to 0 + * */ void configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis = 0); - /** Static function to check if given info will lead to a valid configuration of @ref NEGatherKernel + /** Static function to check if given info will lead to a valid configuration * - * @param[in] input Source tensor info. Supported tensor rank: up to 4. Data type supported: All - * @param[in] indices Indices tensor info. Supported tensor rank: up to 1. Must be one of the following types: U32/S32. Each value Must be in range [0, input.shape[@p axis]) - * @param[in] output Destination tensor info. Data type supported: Same as @p input - * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Defaults to 0 + * Similar to @ref NEGather::configure() * * @return a status */ diff --git a/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h b/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h index 3b683382ec..0f294fde22 100644 --- a/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h +++ b/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h @@ -95,7 +95,12 @@ public: * @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the @ref GenerateProposalsInfo struct. * @note Proposals contains all the proposals. 
Of those, only the first num_valid_proposals are valid. */ - void configure(const ITensor *scores, const ITensor *deltas, const ITensor *anchors, ITensor *proposals, ITensor *scores_out, ITensor *num_valid_proposals, + void configure(const ITensor *scores, + const ITensor *deltas, + const ITensor *anchors, + ITensor *proposals, + ITensor *scores_out, + ITensor *num_valid_proposals, const GenerateProposalsInfo &info); /** Static function to check if given info will lead to a valid configuration of @ref NEGenerateProposalsLayer @@ -112,7 +117,11 @@ public: * * @return a Status */ - static Status validate(const ITensorInfo *scores, const ITensorInfo *deltas, const ITensorInfo *anchors, const ITensorInfo *proposals, const ITensorInfo *scores_out, + static Status validate(const ITensorInfo *scores, + const ITensorInfo *deltas, + const ITensorInfo *anchors, + const ITensorInfo *proposals, + const ITensorInfo *scores_out, const ITensorInfo *num_valid_proposals, const GenerateProposalsInfo &info); diff --git a/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h b/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h index bb0697072b..0bc57be09e 100644 --- a/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h +++ b/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h @@ -89,7 +89,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, float gamma = 1.0f, float beta = 0.0f, float epsilon = 1e-12f); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + float gamma = 1.0f, + float beta = 0.0f, + float epsilon = 1e-12f); // Inherited methods overridden: void run() override; @@ -103,5 +107,5 @@ private: Tensor _permuted_input; Tensor _permuted_output; }; -} +} // namespace arm_compute #endif /* ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h 
b/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h index 7f1a5e785e..8502cee5d2 100644 --- a/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h +++ b/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h @@ -97,5 +97,5 @@ private: std::unique_ptr<NEL2NormalizeLayerKernel> _normalize_kernel; Tensor _sumsq; }; -} +} // namespace arm_compute #endif /* ARM_COMPUTE_NEL2NORMALIZELAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NELSTMLayer.h b/arm_compute/runtime/NEON/functions/NELSTMLayer.h index 075fb4530a..629c5d10a0 100644 --- a/arm_compute/runtime/NEON/functions/NELSTMLayer.h +++ b/arm_compute/runtime/NEON/functions/NELSTMLayer.h @@ -25,7 +25,8 @@ #define ARM_COMPUTE_NELSTMLAYER_H #include "arm_compute/core/Types.h" -#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h" +#include "arm_compute/runtime/common/LSTMParams.h" +#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h" #include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h" #include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h" @@ -35,7 +36,6 @@ #include "arm_compute/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.h" #include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h" #include "arm_compute/runtime/NEON/functions/NETranspose.h" -#include "arm_compute/runtime/common/LSTMParams.h" namespace arm_compute { @@ -104,13 +104,26 @@ public: * @param[in] projection_threshold The clipping threshold for the output from the projection layer, such that values are bound within [-proj_clip, proj_clip]. * If set to 0.0 then clipping is disabled. 
*/ - void configure(const ITensor *input, - const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights, - const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights, - const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias, - const ITensor *output_state_in, const ITensor *cell_state_in, - ITensor *scratch_buffer, ITensor *output_state_out, ITensor *cell_state_out, ITensor *output, - const LSTMParams<ITensor> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold = 0.f, float projection_threshold = 0.f); + void configure(const ITensor *input, + const ITensor *input_to_forget_weights, + const ITensor *input_to_cell_weights, + const ITensor *input_to_output_weights, + const ITensor *recurrent_to_forget_weights, + const ITensor *recurrent_to_cell_weights, + const ITensor *recurrent_to_output_weights, + const ITensor *forget_gate_bias, + const ITensor *cell_bias, + const ITensor *output_gate_bias, + const ITensor *output_state_in, + const ITensor *cell_state_in, + ITensor *scratch_buffer, + ITensor *output_state_out, + ITensor *cell_state_out, + ITensor *output, + const LSTMParams<ITensor> &lstm_params, + const ActivationLayerInfo &activation_info, + float cell_threshold = 0.f, + float projection_threshold = 0.f); /** Static function to check if given info will lead to a valid configuration of @ref NELSTMLayer * @@ -151,13 +164,26 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, - const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights, - const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights, - const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo 
*output_gate_bias, - const ITensorInfo *output_state_in, const ITensorInfo *cell_state_in, - const ITensorInfo *scratch_buffer, const ITensorInfo *output_state_out, const ITensorInfo *cell_state_out, const ITensorInfo *output, - const LSTMParams<ITensorInfo> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold = 0.f, float projection_threshold = 0.f); + static Status validate(const ITensorInfo *input, + const ITensorInfo *input_to_forget_weights, + const ITensorInfo *input_to_cell_weights, + const ITensorInfo *input_to_output_weights, + const ITensorInfo *recurrent_to_forget_weights, + const ITensorInfo *recurrent_to_cell_weights, + const ITensorInfo *recurrent_to_output_weights, + const ITensorInfo *forget_gate_bias, + const ITensorInfo *cell_bias, + const ITensorInfo *output_gate_bias, + const ITensorInfo *output_state_in, + const ITensorInfo *cell_state_in, + const ITensorInfo *scratch_buffer, + const ITensorInfo *output_state_out, + const ITensorInfo *cell_state_out, + const ITensorInfo *output, + const LSTMParams<ITensorInfo> &lstm_params, + const ActivationLayerInfo &activation_info, + float cell_threshold = 0.f, + float projection_threshold = 0.f); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h b/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h index 2f0c753691..ae951669b3 100644 --- a/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h +++ b/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_NELSTMLAYERQUANTIZED_H #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/common/LSTMParams.h" #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h" #include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h" @@ -38,8 +39,6 @@ #include "arm_compute/runtime/NEON/functions/NESlice.h" #include 
"arm_compute/runtime/NEON/functions/NETranspose.h" -#include "arm_compute/runtime/common/LSTMParams.h" - namespace arm_compute { // Forward declarations @@ -50,7 +49,7 @@ class ITensor; * This function calls the following functions/kernels: * * -# @ref NEGEMMLowpMatrixMultiplyCore Quantized matrix multiplication core. Accumulators are 32-bit integers - * -# @ref NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint Convert 32-bit integers into QSYMM16 + * -# @ref NEGEMMLowpOutputStage Convert 32-bit integers into QSYMM16 * -# @ref NETranspose Matrix transpose * -# @ref NEConcatenateLayer Tensor concatenation * -# @ref NEActivationLayer Activation functions (tanh and logistic) @@ -104,11 +103,22 @@ public: * @param[out] output_state_out Destination tensor. Output is a 2D tensor with dimensions [output_size, batch_size].Data types supported: Same as @p input. */ void configure(const ITensor *input, - const ITensor *input_to_input_weights, const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights, - const ITensor *recurrent_to_input_weights, const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights, - const ITensor *input_gate_bias, const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias, - ITensor *cell_state_in, const ITensor *output_state_in, - ITensor *cell_state_out, ITensor *output_state_out); + const ITensor *input_to_input_weights, + const ITensor *input_to_forget_weights, + const ITensor *input_to_cell_weights, + const ITensor *input_to_output_weights, + const ITensor *recurrent_to_input_weights, + const ITensor *recurrent_to_forget_weights, + const ITensor *recurrent_to_cell_weights, + const ITensor *recurrent_to_output_weights, + const ITensor *input_gate_bias, + const ITensor *forget_gate_bias, + const ITensor *cell_bias, + const ITensor *output_gate_bias, + ITensor *cell_state_in, + const 
ITensor *output_state_in, + ITensor *cell_state_out, + ITensor *output_state_out); /** Static function to check if given info will lead to a valid configuration of @ref NELSTMLayer * @@ -133,11 +143,22 @@ public: * @return a status */ static Status validate(const ITensorInfo *input, - const ITensorInfo *input_to_input_weights, const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights, - const ITensorInfo *recurrent_to_input_weights, const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights, - const ITensorInfo *input_gate_bias, const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias, - const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in, - const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out); + const ITensorInfo *input_to_input_weights, + const ITensorInfo *input_to_forget_weights, + const ITensorInfo *input_to_cell_weights, + const ITensorInfo *input_to_output_weights, + const ITensorInfo *recurrent_to_input_weights, + const ITensorInfo *recurrent_to_forget_weights, + const ITensorInfo *recurrent_to_cell_weights, + const ITensorInfo *recurrent_to_output_weights, + const ITensorInfo *input_gate_bias, + const ITensorInfo *forget_gate_bias, + const ITensorInfo *cell_bias, + const ITensorInfo *output_gate_bias, + const ITensorInfo *cell_state_in, + const ITensorInfo *output_state_in, + const ITensorInfo *cell_state_out, + const ITensorInfo *output_state_out); // Inherited methods overridden: void run() override; @@ -147,30 +168,30 @@ private: MemoryGroup _memory_group; // Functions used - NEGEMMLowpMatrixMultiplyCore _gemmlowp; - NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint _output_stage; - NETranspose _transpose_weights; - NEConcatenateLayer _concat_input_weights; - NEConcatenateLayer _concat_recurrent_weights; - 
NEConcatenateLayer _concat_weights; - NEConcatenateLayer _concat_inputs; - NEConcatenateLayer _concat_bias; - NEActivationLayer _sigmoid_forget_gate; - NEActivationLayer _sigmoid_input_gate; - NEActivationLayer _sigmoid_output_gate; - NEActivationLayer _tanh_modulation_gate; - NEActivationLayer _tanh_output_state; - NEArithmeticAddition _add1; - NEArithmeticAddition _add2; - NEPixelWiseMultiplication _mul1; - NEPixelWiseMultiplication _mul2; - NEPixelWiseMultiplication _mul3; - NESlice _slice_input_tensor; - NESlice _slice_forget_tensor; - NESlice _slice_cell_tensor; - NESlice _slice_output_tensor; - NEDequantizationLayer _dequantize; - NEQuantizationLayer _quantize; + NEGEMMLowpMatrixMultiplyCore _gemmlowp; + NEGEMMLowpOutputStage _output_stage; + NETranspose _transpose_weights; + NEConcatenateLayer _concat_input_weights; + NEConcatenateLayer _concat_recurrent_weights; + NEConcatenateLayer _concat_weights; + NEConcatenateLayer _concat_inputs; + NEConcatenateLayer _concat_bias; + NEActivationLayer _sigmoid_forget_gate; + NEActivationLayer _sigmoid_input_gate; + NEActivationLayer _sigmoid_output_gate; + NEActivationLayer _tanh_modulation_gate; + NEActivationLayer _tanh_output_state; + NEArithmeticAddition _add1; + NEArithmeticAddition _add2; + NEPixelWiseMultiplication _mul1; + NEPixelWiseMultiplication _mul2; + NEPixelWiseMultiplication _mul3; + NESlice _slice_input_tensor; + NESlice _slice_forget_tensor; + NESlice _slice_cell_tensor; + NESlice _slice_output_tensor; + NEDequantizationLayer _dequantize; + NEQuantizationLayer _quantize; // Tensor pointers const ITensor *_input_to_input_weights; diff --git a/arm_compute/runtime/NEON/functions/NEMatMul.h b/arm_compute/runtime/NEON/functions/NEMatMul.h new file mode 100644 index 0000000000..58dd7a6f6b --- /dev/null +++ b/arm_compute/runtime/NEON/functions/NEMatMul.h @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2023-2024 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEMATMUL_H +#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEMATMUL_H + +#include "arm_compute/core/Types.h" +#include "arm_compute/function_info/ActivationLayerInfo.h" +#include "arm_compute/runtime/IFunction.h" + +#include <memory> + +namespace arm_compute +{ +/** Settings for MatMul Cpu implementation*/ +class CpuMatMulSettings +{ +public: + // get fast math flag + bool fast_math() const + { + return _fast_math; + } + // get fixed format flag + bool fixed_format() const + { + return _fixed_format; + } + // Set fast math flag + CpuMatMulSettings &fast_math(bool fmath) + { + _fast_math = fmath; + return *this; + } + // Set fixed format flag + CpuMatMulSettings &fixed_format(bool fixed_format) + { + _fixed_format = fixed_format; + return *this; + } + +private: + bool _fast_math{false}; + bool _fixed_format{false}; +}; + +// Forward declarations +class ITensor; +class ITensorInfo; +class MatMulInfo; +class Status; + +/** Basic function to run the following operators: + * + * -# @ref cpu::CpuMatMul + */ +class NEMatMul : public IFunction +{ +public: + /** Constructor */ + NEMatMul(); + /** Destructor */ + ~NEMatMul(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEMatMul(const NEMatMul &) = delete; + /** Default move constructor */ + NEMatMul(NEMatMul &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEMatMul &operator=(const NEMatMul &) = delete; + /** Default move assignment operator */ + NEMatMul &operator=(NEMatMul &&) = default; + /** Initialize + * + * Valid data layouts: + * - Any + * + * Valid data type configurations: + * |lhs |rhs |dst | + * |:--------------|:------------------|:--------------| + * |F32 |F32 |F32 | + * |F16 |F16 |F16 | + * |BFLOAT16 |BFLOAT16 |BFLOAT16 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED | + * |QASYMM8 |QASYMM8 |QASYMM8 | + * + * @param[in] lhs Left-hand side 
tensor info. Data types supported: F16/F32/QASYMM8_SIGNED/QASYMM8. + * @param[in] rhs Right-hand side tensor info. Data types supported: same as @p lhs. + * @param[out] dst Output tensor to store the result of the batched matrix multiplication. Data types supported: same as @p lhs / @p rhs. + * @param[in] info Contains MatMul operation information described in @ref MatMulInfo. + * @param[in] settings Contains flags for function level settings i.e fast math + * @param[in] act_info (Optional) Contains activation function and lower and upper bound values for bounded activation functions. + */ + void configure(ITensor *lhs, + ITensor *rhs, + ITensor *dst, + const MatMulInfo &info, + const CpuMatMulSettings &settings, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref NEMatMul + * + * @param[in] lhs Left-hand side tensor info. Data types supported: F16/F32/QASYMM8_SIGNED/QASYMM8. + * @param[in] rhs Right-hand side tensor info. Data types supported: same as @p lhs. + * @param[out] dst Output tensor info to store the result of the batched matrix multiplication. Data types supported: same as @p lhs / @p rhs. + * @param[in] info Contains MatMul operation information described in @ref MatMulInfo. + * @param[in] settings Contains flags for function level settings i.e fast math + * @param[in] act_info (Optional) Contains activation function and lower and upper bound values for bounded activation functions. 
+ * + * @return Status + */ + static Status validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + const MatMulInfo &info, + const CpuMatMulSettings &settings, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden + void run() override; + +private: + struct Impl; + std::unique_ptr<Impl> _impl; +}; +} // namespace arm_compute +#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEMATMUL_H diff --git a/arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h b/arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h index 41ea040457..e00fc4544f 100644 --- a/arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h +++ b/arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021 Arm Limited. + * Copyright (c) 2020-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -26,6 +26,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" + #include <memory> namespace arm_compute @@ -33,12 +34,10 @@ namespace arm_compute class ITensor; class ITensorInfo; class NEFill; -class NEMaxUnpoolingLayerKernel; /** Function to perform MaxUnpooling. 
This function calls the following kernels: * * -# @ref NEFill - * -# @ref NEMaxUnpoolingLayerKernel */ class NEMaxUnpoolingLayer : public IFunction { @@ -88,14 +87,18 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *indices, + const ITensorInfo *output, + const PoolingLayerInfo &pool_info); // Inherited methods overridden: void run() override; private: - std::unique_ptr<NEFill> _fill_func; - std::unique_ptr<NEMaxUnpoolingLayerKernel> _unpooling_layer_kernel; + std::unique_ptr<NEFill> _fill_func; + struct Impl; + std::unique_ptr<Impl> _impl; }; -} +} // namespace arm_compute #endif /* ARM_COMPUTE_NEMAXUNPOOLINGLAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NENormalizationLayer.h b/arm_compute/runtime/NEON/functions/NENormalizationLayer.h index fbe000445c..27e3fa674e 100644 --- a/arm_compute/runtime/NEON/functions/NENormalizationLayer.h +++ b/arm_compute/runtime/NEON/functions/NENormalizationLayer.h @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_NENORMALIZATIONLAYER_H #define ARM_COMPUTE_NENORMALIZATIONLAYER_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/MemoryGroup.h" #include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h" @@ -88,16 +87,17 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info); + static Status + validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info); // Inherited methods overridden: void run() override; private: - MemoryGroup _memory_group; /**< Function memory group */ - std::unique_ptr<NENormalizationLayerKernel> _norm_kernel; 
/**< Normalization layer kernel */ - NEPixelWiseMultiplication _multiply_f; /**< Pixel multiplication function */ - Tensor _input_squared; /**< The intermediate buffer which stores results of squaring input */ + MemoryGroup _memory_group; /**< Function memory group */ + std::unique_ptr<NENormalizationLayerKernel> _norm_kernel; /**< Normalization layer kernel */ + NEPixelWiseMultiplication _multiply_f; /**< Pixel multiplication function */ + Tensor _input_squared; /**< The intermediate buffer which stores results of squaring input */ }; -} +} // namespace arm_compute #endif /* ARM_COMPUTE_NENORMALIZATIONLAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NEPadLayer.h b/arm_compute/runtime/NEON/functions/NEPadLayer.h index 4aa6725496..494b1c0641 100644 --- a/arm_compute/runtime/NEON/functions/NEPadLayer.h +++ b/arm_compute/runtime/NEON/functions/NEPadLayer.h @@ -24,14 +24,14 @@ #ifndef ARM_COMPUTE_NEPADLAYER_H #define ARM_COMPUTE_NEPADLAYER_H +#include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h" #include "arm_compute/runtime/NEON/functions/NECopy.h" #include "arm_compute/runtime/NEON/functions/NEStridedSlice.h" #include "arm_compute/runtime/SubTensor.h" - -#include "arm_compute/core/Types.h" #include "arm_compute/runtime/Tensor.h" + #include <memory> namespace arm_compute @@ -82,7 +82,11 @@ public: * @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT, * or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT). 
*/ - void configure(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value = PixelValue(), const PaddingMode mode = PaddingMode::CONSTANT); + void configure(ITensor *input, + ITensor *output, + const PaddingList &padding, + const PixelValue constant_value = PixelValue(), + const PaddingMode mode = PaddingMode::CONSTANT); /** Static function to check if given info will lead to a valid configuration of @ref NEPadLayer. * * @param[in] input Source tensor info. Data types supported: All. @@ -95,7 +99,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, const PixelValue constant_value = PixelValue(), const PaddingMode mode = PaddingMode::CONSTANT); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const PaddingList &padding, + const PixelValue constant_value = PixelValue(), + const PaddingMode mode = PaddingMode::CONSTANT); // Inherited methods overridden: void run() override; @@ -109,7 +117,10 @@ private: * specifies the front and the end padding in the i-th dimension. * @param[in] constant_value Constant value to be used for the padding */ - void configure_constant_mode(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value); + void configure_constant_mode(ITensor *input, + ITensor *output, + const PaddingList &padding, + const PixelValue constant_value); /** Configure functions for when reflect or symmetric padding is used. * * @param[in] input Source tensor. Data types supported: All. 
diff --git a/arm_compute/runtime/NEON/functions/NEPermute.h b/arm_compute/runtime/NEON/functions/NEPermute.h index c863fde0ac..2cef64764d 100644 --- a/arm_compute/runtime/NEON/functions/NEPermute.h +++ b/arm_compute/runtime/NEON/functions/NEPermute.h @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_NEPERMUTE_H #define ARM_COMPUTE_NEPERMUTE_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" #include <memory> diff --git a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h index 4684c2d4b8..3d81bf6087 100644 --- a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h +++ b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2021 Arm Limited. + * Copyright (c) 2016-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,9 @@ #ifndef ARM_COMPUTE_NEPIXELWISEMULTIPLICATION_H #define ARM_COMPUTE_NEPIXELWISEMULTIPLICATION_H +#include "arm_compute/core/Rounding.h" #include "arm_compute/core/Types.h" +#include "arm_compute/function_info/ActivationLayerInfo.h" #include "arm_compute/runtime/IFunction.h" #include <memory> @@ -93,7 +95,12 @@ public: * @param[in] rounding_policy Rounding policy. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. 
*/ - void configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, + void configure(const ITensor *input1, + const ITensor *input2, + ITensor *output, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEPixelWiseMultiplication * @@ -120,7 +127,12 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: @@ -156,7 +168,10 @@ public: * @param[out] output The output tensor. Data types supported: same as @p input1. Number of channels: same as @p input1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. */ - void configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(ITensor *input1, + ITensor *input2, + ITensor *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration of @ref NEComplexPixelWiseMultiplication * * @param[in] input1 An input tensor info. Data types supported: F32. Number of channels supported: 2 (complex tensor). @@ -164,7 +179,10 @@ public: * @param[in] output The output tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1. 
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEPooling3dLayer.h b/arm_compute/runtime/NEON/functions/NEPooling3dLayer.h new file mode 100644 index 0000000000..09251f2a5f --- /dev/null +++ b/arm_compute/runtime/NEON/functions/NEPooling3dLayer.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ARM_COMPUTE_NEPOOLING3DLAYER_H +#define ARM_COMPUTE_NEPOOLING3DLAYER_H + +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" + +#include <memory> + +namespace arm_compute +{ +// Forward declarations +class ITensor; +class ITensorInfo; +class IMemoryManager; +/** Basic function to simulate a pooling 3d layer with the specified pooling operation. This function calls the following kernels: + * + * -# @ref cpu::CpuPool3d + */ +class NEPooling3dLayer : public IFunction +{ +public: + /** Constructor */ + NEPooling3dLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEPooling3dLayer(const NEPooling3dLayer &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEPooling3dLayer &operator=(const NEPooling3dLayer &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEPooling3dLayer(NEPooling3dLayer &&) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEPooling3dLayer &operator=(NEPooling3dLayer &&) = delete; + /** Default destructor */ + ~NEPooling3dLayer(); + /** Set the input and output tensors. + * + * Valid data layouts: + * - NDHWC + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |F16 |F16 | + * |F32 |F32 | + * |QASYMM8 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED | + * + * @note Source tensor is padded with -inf for MAX pooling and 0 otherwise + * + * @param[in] input Source tensor. Data types supported: F16/F32/QASYMM8/QASYMM8_SIGNED. + * @param[out] output Destination tensor. + * @param[in] pool_info Contains pooling operation information described in @ref Pooling3dLayerInfo. 
+ */ + void configure(const ITensor *input, ITensor *output, const Pooling3dLayerInfo &pool_info); + /** Static function to check if given info will lead to a valid configuration of @ref NEPooling3dLayer + * + * + * @param[in] input Source tensor info. Data types supported: F16/F32/QASYMM8/QASYMM8_SIGNED. + * @param[in] output Destination tensor info. + * @param[in] pool_info Contains pooling operation information described in @ref Pooling3dLayerInfo. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Pooling3dLayerInfo &pool_info); + + // Inherited methods overridden: + void run() override; + +private: + struct Impl; + std::unique_ptr<Impl> _impl; +}; +} // namespace arm_compute +#endif /* ARM_COMPUTE_NEPOOLING3DLAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h index 9398e1fce9..768ad0d818 100644 --- a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h +++ b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h @@ -71,6 +71,8 @@ public: * |F32 |F32 | * * @note F16 is supported for pool sizes 2 and 3 only + * @note Source tensor is padded with -inf for MAX pooling and 0 otherwise + * Cases where pooling region is completely outside input tensor are only supported for floating point data type * * @param[in, out] input Source tensor. (Written to only when padding != 0) Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. * @param[out] output Destination tensor. Data types supported: Same as @p input. 
@@ -89,7 +91,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices = nullptr); // Inherited methods overridden: void run() override; @@ -98,5 +103,5 @@ private: struct Impl; std::unique_ptr<Impl> _impl; }; -} +} // namespace arm_compute #endif /* ARM_COMPUTE_NEPOOLINGLAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NEPriorBoxLayer.h b/arm_compute/runtime/NEON/functions/NEPriorBoxLayer.h index 38e0c9f3ad..858e3299af 100644 --- a/arm_compute/runtime/NEON/functions/NEPriorBoxLayer.h +++ b/arm_compute/runtime/NEON/functions/NEPriorBoxLayer.h @@ -62,7 +62,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info); + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const PriorBoxLayerInfo &info); }; } // namespace arm_compute #endif /* ARM_COMPUTE_NEPRIORBOXLAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h b/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h index 7c2e9bc5a1..009a4e0911 100644 --- a/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h +++ b/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021 Arm Limited. + * Copyright (c) 2020-2022 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -25,15 +25,17 @@ #define ARM_COMPUTE_NEQLSTMLAYER_H #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/common/LSTMParams.h" #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h" #include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h" #include "arm_compute/runtime/NEON/functions/NECopy.h" +#include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h" #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" #include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h" #include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h" +#include "arm_compute/runtime/NEON/functions/NEQuantizationLayer.h" #include "arm_compute/runtime/NEON/functions/NETranspose.h" -#include "arm_compute/runtime/common/LSTMParams.h" #include <memory> @@ -43,8 +45,13 @@ namespace arm_compute class ITensor; class ITensorInfo; class NEQLSTMLayerNormalizationKernel; -class NEGEMMLowpMatrixAReductionKernel; - +namespace cpu +{ +namespace kernels +{ +class CpuGemmLowpMatrixAReductionKernel; +} // namespace kernels +} // namespace cpu /** Basic function to run @ref NEQLSTMLayer * * This function calls the following kernels: @@ -54,8 +61,8 @@ class NEGEMMLowpMatrixAReductionKernel; * -# @ref NEArithmeticSubtraction Elementwise subtraction * -# @ref NECopy Copy kernel for copying output_state_out to output * -# @ref NEGEMMLowpMatrixMultiplyCore Quantized matrix multiplication core. 
Accumulators are 32-bit integers - * -# @ref NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint Convert 32-bit integers into QSYMM16 - * -# @ref NEGEMMLowpMatrixAReductionKernel For precomputing effective biases to use + * -# @ref NEGEMMLowpOutputStage Convert 32-bit integers into QSYMM16 + * -# @ref cpu::kernels::CpuGemmLowpMatrixAReductionKernel For precomputing effective biases to use * -# @ref NEPixelWiseMultiplication Elementwise multiplication * -# @ref NETranspose Transpose function for reshaping the weights * */ @@ -123,12 +130,21 @@ public: * projection_threshold (Optional) The clipping threshold for the output from the projection layer, such that values are bound within * [-proj_clip, proj_clip]. If set to 0.0 then clipping is disabled. */ - void configure(const ITensor *input, - const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights, - const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights, - const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias, - const ITensor *cell_state_in, ITensor *output_state_in, - ITensor *cell_state_out, ITensor *output_state_out, ITensor *output, + void configure(const ITensor *input, + const ITensor *input_to_forget_weights, + const ITensor *input_to_cell_weights, + const ITensor *input_to_output_weights, + const ITensor *recurrent_to_forget_weights, + const ITensor *recurrent_to_cell_weights, + const ITensor *recurrent_to_output_weights, + const ITensor *forget_gate_bias, + const ITensor *cell_bias, + const ITensor *output_gate_bias, + const ITensor *cell_state_in, + ITensor *output_state_in, + ITensor *cell_state_out, + ITensor *output_state_out, + ITensor *output, const LSTMParams<ITensor> &lstm_params); /** Static function to check if given info will lead to a valid configuration of @ref NEQLSTMLayer @@ -173,12 +189,21 @@ public: * [-proj_clip, 
proj_clip]. If set to 0.0 then clipping is disabled. * @return a status */ - static Status validate(const ITensorInfo *input, - const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights, - const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights, - const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias, - const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in, - const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out, const ITensorInfo *output, + static Status validate(const ITensorInfo *input, + const ITensorInfo *input_to_forget_weights, + const ITensorInfo *input_to_cell_weights, + const ITensorInfo *input_to_output_weights, + const ITensorInfo *recurrent_to_forget_weights, + const ITensorInfo *recurrent_to_cell_weights, + const ITensorInfo *recurrent_to_output_weights, + const ITensorInfo *forget_gate_bias, + const ITensorInfo *cell_bias, + const ITensorInfo *output_gate_bias, + const ITensorInfo *cell_state_in, + const ITensorInfo *output_state_in, + const ITensorInfo *cell_state_out, + const ITensorInfo *output_state_out, + const ITensorInfo *output, const LSTMParams<ITensorInfo> &lstm_params); // Inherited methods overridden: @@ -211,10 +236,17 @@ private: * @param[in] mm_res_info Tensor info to be used to initialize output stage result tensor. 
* */ - void configure_mm(NEGEMMLowpMatrixMultiplyCore &mm, NEGEMMLowpOutputStage &outstage, GEMMLowpOutputStageInfo &gemmlowp_info, - const ITensor *mm_input, const ITensor *mm_weights, const ITensor *bias, Tensor *mm_res, - Tensor *outstage_res, float gemmlowp_scale, - const TensorInfo &mm_res_info, const TensorInfo &outstage_tensor_info); + void configure_mm(NEGEMMLowpMatrixMultiplyCore &mm, + NEGEMMLowpOutputStage &outstage, + GEMMLowpOutputStageInfo &gemmlowp_info, + const ITensor *mm_input, + const ITensor *mm_weights, + const ITensor *bias, + Tensor *mm_res, + Tensor *outstage_res, + float gemmlowp_scale, + const TensorInfo &mm_res_info, + const TensorInfo &outstage_tensor_info); MemoryGroup _memory_group; @@ -223,8 +255,8 @@ private: { static constexpr uint32_t max_dimension_supported = 2; - ITensor *_src{ nullptr }; - ITensor *_dst{ nullptr }; + ITensor *_src{nullptr}; + ITensor *_dst{nullptr}; size_t _row_size{}; Window _window{}; @@ -250,70 +282,73 @@ private: }; // Functions used - NETranspose _transpose_input_to_forget_weights; - NETranspose _transpose_input_to_cell_weights; - NETranspose _transpose_input_to_output_weights; - NETranspose _transpose_input_to_input_weights; - NETranspose _transpose_recurrent_to_forget_weights; - NETranspose _transpose_recurrent_to_cell_weights; - NETranspose _transpose_recurrent_to_output_weights; - NETranspose _transpose_recurrent_to_input_weights; - NETranspose _transpose_projection_weights; - std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _input_to_input_reduction; - std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _recurrent_to_input_reduction; - std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _input_to_forget_reduction; - std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _recurrent_to_forget_reduction; - std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _input_to_cell_reduction; - std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _recurrent_to_cell_reduction; - std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> 
_input_to_output_reduction; - std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _recurrent_to_output_reduction; - std::unique_ptr<NEGEMMLowpMatrixAReductionKernel> _projection_reduction; - NEArithmeticAddition _projection_bias_add; - NEGEMMLowpMatrixMultiplyCore _mm_input_to_forget; - NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_forget; - NEPixelWiseMultiplication _pixelwise_mul_cell_to_forget; - NEGEMMLowpOutputStage _input_to_forget_outstage; - NEGEMMLowpOutputStage _recurrent_to_forget_outstage; - NEGEMMLowpOutputStage _cell_to_forget_outstage; - NEArithmeticAddition _accumulate_input_recurrent_forget; - NEArithmeticAddition _accumulate_cell_forget; - NEActivationLayer _forget_gate_sigmoid; - NEGEMMLowpMatrixMultiplyCore _mm_input_to_cell; - NEGEMMLowpOutputStage _input_to_cell_outstage; - NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_cell; - NEGEMMLowpOutputStage _recurrent_to_cell_outstage; - NEArithmeticAddition _accumulate_input_recurrent_modulation; - NEActivationLayer _cell_gate_tanh; - NEArithmeticSubtraction _input_gate_sub; - NEGEMMLowpMatrixMultiplyCore _mm_input_to_input; - NEGEMMLowpOutputStage _input_to_input_outstage; - NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_input; - NEGEMMLowpOutputStage _recurrent_to_input_outstage; - NEArithmeticAddition _accumulate_input_recurrent_input; - NEPixelWiseMultiplication _pixelwise_mul_cell_to_input; - NEGEMMLowpOutputStage _cell_to_input_outstage; - NEArithmeticAddition _accumulate_cell_input; - NEActivationLayer _input_gate_sigmoid; - NEPixelWiseMultiplication _pixelwise_mul_forget_cell; - NEPixelWiseMultiplication _pixelwise_mul_input_cell; - NEArithmeticAddition _add_forget_cell; - NEActivationLayer _cell_clip; - NEGEMMLowpMatrixMultiplyCore _mm_input_to_output; - NEGEMMLowpOutputStage _input_to_output_outstage; - NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_output; - NEGEMMLowpOutputStage _recurrent_to_output_outstage; - NEArithmeticAddition _accumulate_input_recurrent_output; - 
NEPixelWiseMultiplication _pixelwise_mul_cell_to_output; - NEGEMMLowpOutputStage _cell_to_output_outstage; - NEArithmeticAddition _accumulate_cell_to_output; - NEActivationLayer _output_gate_sigmoid; - NEActivationLayer _hidden_tanh; - NEPixelWiseMultiplication _pixelwise_mul_hidden; - NEGEMMLowpOutputStage _hidden_outstage; - NEGEMMLowpMatrixMultiplyCore _mm_projection; - NEGEMMLowpOutputStage _projection_outstage; - NEArithmeticAddition _accumulate_projection; - NEActivationLayer _projection_clip; + + NEDequantizationLayer _dequantize_input_to_forget_weights; + NEQuantizationLayer _quantize_input_to_forget_weights; + NETranspose _transpose_input_to_forget_weights; + NETranspose _transpose_input_to_cell_weights; + NETranspose _transpose_input_to_output_weights; + NETranspose _transpose_input_to_input_weights; + NETranspose _transpose_recurrent_to_forget_weights; + NETranspose _transpose_recurrent_to_cell_weights; + NETranspose _transpose_recurrent_to_output_weights; + NETranspose _transpose_recurrent_to_input_weights; + NETranspose _transpose_projection_weights; + std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _input_to_input_reduction; + std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _recurrent_to_input_reduction; + std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _input_to_forget_reduction; + std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _recurrent_to_forget_reduction; + std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _input_to_cell_reduction; + std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _recurrent_to_cell_reduction; + std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _input_to_output_reduction; + std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _recurrent_to_output_reduction; + std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _projection_reduction; + NEArithmeticAddition _projection_bias_add; + 
NEGEMMLowpMatrixMultiplyCore _mm_input_to_forget; + NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_forget; + NEPixelWiseMultiplication _pixelwise_mul_cell_to_forget; + NEGEMMLowpOutputStage _input_to_forget_outstage; + NEGEMMLowpOutputStage _recurrent_to_forget_outstage; + NEGEMMLowpOutputStage _cell_to_forget_outstage; + NEArithmeticAddition _accumulate_input_recurrent_forget; + NEArithmeticAddition _accumulate_cell_forget; + NEActivationLayer _forget_gate_sigmoid; + NEGEMMLowpMatrixMultiplyCore _mm_input_to_cell; + NEGEMMLowpOutputStage _input_to_cell_outstage; + NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_cell; + NEGEMMLowpOutputStage _recurrent_to_cell_outstage; + NEArithmeticAddition _accumulate_input_recurrent_modulation; + NEActivationLayer _cell_gate_tanh; + NEArithmeticSubtraction _input_gate_sub; + NEGEMMLowpMatrixMultiplyCore _mm_input_to_input; + NEGEMMLowpOutputStage _input_to_input_outstage; + NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_input; + NEGEMMLowpOutputStage _recurrent_to_input_outstage; + NEArithmeticAddition _accumulate_input_recurrent_input; + NEPixelWiseMultiplication _pixelwise_mul_cell_to_input; + NEGEMMLowpOutputStage _cell_to_input_outstage; + NEArithmeticAddition _accumulate_cell_input; + NEActivationLayer _input_gate_sigmoid; + NEPixelWiseMultiplication _pixelwise_mul_forget_cell; + NEPixelWiseMultiplication _pixelwise_mul_input_cell; + NEArithmeticAddition _add_forget_cell; + NEActivationLayer _cell_clip; + NEGEMMLowpMatrixMultiplyCore _mm_input_to_output; + NEGEMMLowpOutputStage _input_to_output_outstage; + NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_output; + NEGEMMLowpOutputStage _recurrent_to_output_outstage; + NEArithmeticAddition _accumulate_input_recurrent_output; + NEPixelWiseMultiplication _pixelwise_mul_cell_to_output; + NEGEMMLowpOutputStage _cell_to_output_outstage; + NEArithmeticAddition _accumulate_cell_to_output; + NEActivationLayer _output_gate_sigmoid; + NEActivationLayer _hidden_tanh; + 
NEPixelWiseMultiplication _pixelwise_mul_hidden; + NEGEMMLowpOutputStage _hidden_outstage; + NEGEMMLowpMatrixMultiplyCore _mm_projection; + NEGEMMLowpOutputStage _projection_outstage; + NEArithmeticAddition _accumulate_projection; + NEActivationLayer _projection_clip; TensorCopyKernel _projection_bias_copy; TensorCopyKernel _projection_output_to_accumulate_copy; @@ -325,19 +360,16 @@ private: NECopy _copy_output; // Tensor pointers - const ITensor *_input_to_input_weights - { - nullptr - }; - const ITensor *_recurrent_to_input_weights{ nullptr }; - const ITensor *_projection_bias{ nullptr }; - const ITensor *_input_to_forget_weights{ nullptr }; - const ITensor *_input_to_cell_weights{ nullptr }; - const ITensor *_input_to_output_weights{ nullptr }; - const ITensor *_recurrent_to_forget_weights{ nullptr }; - const ITensor *_recurrent_to_cell_weights{ nullptr }; - const ITensor *_recurrent_to_output_weights{ nullptr }; - const ITensor *_projection_weights{ nullptr }; + const ITensor *_input_to_input_weights{nullptr}; + const ITensor *_recurrent_to_input_weights{nullptr}; + const ITensor *_projection_bias{nullptr}; + const ITensor *_input_to_forget_weights{nullptr}; + const ITensor *_input_to_cell_weights{nullptr}; + const ITensor *_input_to_output_weights{nullptr}; + const ITensor *_recurrent_to_forget_weights{nullptr}; + const ITensor *_recurrent_to_cell_weights{nullptr}; + const ITensor *_recurrent_to_output_weights{nullptr}; + const ITensor *_projection_weights{nullptr}; std::array<const ITensor *, _layer_norm_count> _layer_norm_weights{}; std::array<const ITensor *, _layer_norm_count> _layer_norm_bias{}; @@ -372,63 +404,66 @@ private: return _layer_norms[getGateIndex(g)]; } - void configure_layer_norm(LayerNormGate g, const ITensor *in); + void configure_layer_norm(LayerNormGate g, const ITensor *in); static Status validate_layer_norm(const ITensorInfo &in, const ITensorInfo &weight, const ITensorInfo &bias); // Temporary tensors - Tensor 
_input_to_forget_weights_transposed{ nullptr }; - Tensor _input_to_cell_weights_transposed{ nullptr }; - Tensor _input_to_output_weights_transposed{ nullptr }; - Tensor _input_to_input_weights_transposed{ nullptr }; - Tensor _recurrent_to_forget_weights_transposed{ nullptr }; - Tensor _recurrent_to_cell_weights_transposed{ nullptr }; - Tensor _recurrent_to_output_weights_transposed{ nullptr }; - Tensor _recurrent_to_input_weights_transposed{ nullptr }; - Tensor _projection_weights_transposed{ nullptr }; - Tensor _input_to_input_eff_bias{ nullptr }; - Tensor _recurrent_to_input_eff_bias{ nullptr }; - Tensor _input_to_forget_eff_bias{ nullptr }; - Tensor _recurrent_to_forget_eff_bias{ nullptr }; - Tensor _input_to_cell_eff_bias{ nullptr }; - Tensor _recurrent_to_cell_eff_bias{ nullptr }; - Tensor _input_to_output_eff_bias{ nullptr }; - Tensor _recurrent_to_output_eff_bias{ nullptr }; - Tensor _projection_reduction_res{ nullptr }; - Tensor _projection_eff_bias{ nullptr }; - Tensor _mm_input_to_forget_res{ nullptr }; - Tensor _mm_recurrent_to_forget_res{ nullptr }; - Tensor _mul_cell_to_forget_res{ nullptr }; - Tensor _input_to_forget_outstage_res{ nullptr }; - Tensor _cell_to_forget_outstage_res{ nullptr }; - Tensor _recurrent_to_forget_outstage_res{ nullptr }; - Tensor _forget_gate{ nullptr }; - Tensor _mm_input_to_cell_res{ nullptr }; - Tensor _input_to_cell_outstage_res{ nullptr }; - Tensor _mm_recurrent_to_cell_res{ nullptr }; - Tensor _recurrent_to_cell_outstage_res{ nullptr }; - Tensor _cell_gate{ nullptr }; - Tensor _mul_input_cell_res{ nullptr }; - Tensor _mm_input_to_input_res{ nullptr }; - Tensor _input_to_input_outstage_res{ nullptr }; - Tensor _mm_recurrent_to_input_res{ nullptr }; - Tensor _mul_cell_to_input_res{ nullptr }; - Tensor _cell_to_input_outstage_res{ nullptr }; - Tensor _recurrent_to_input_outstage_res{ nullptr }; - Tensor _input_gate{ nullptr }; - Tensor _mm_input_to_output_res{ nullptr }; - Tensor _input_to_output_outstage_res{ nullptr }; - 
Tensor _mm_recurrent_to_output_res{ nullptr }; - Tensor _mul_cell_to_output_res{ nullptr }; - Tensor _cell_to_output_outstage_res{ nullptr }; - Tensor _recurrent_to_output_outstage_res{ nullptr }; - Tensor _output_gate{ nullptr }; - Tensor _hidden_mul_res{ nullptr }; - Tensor _hidden_gate{ nullptr }; - Tensor _mm_projection_res{ nullptr }; - Tensor _projection_outstage_res{ nullptr }; - Tensor _projection_out_res{ nullptr }; - Tensor _projection_accumulate_res{ nullptr }; - Tensor _ones{ nullptr }; + Tensor _input_to_forget_weights_f32{nullptr}; + Tensor _input_to_forget_weights_symm8{nullptr}; + + Tensor _input_to_forget_weights_transposed{nullptr}; + Tensor _input_to_cell_weights_transposed{nullptr}; + Tensor _input_to_output_weights_transposed{nullptr}; + Tensor _input_to_input_weights_transposed{nullptr}; + Tensor _recurrent_to_forget_weights_transposed{nullptr}; + Tensor _recurrent_to_cell_weights_transposed{nullptr}; + Tensor _recurrent_to_output_weights_transposed{nullptr}; + Tensor _recurrent_to_input_weights_transposed{nullptr}; + Tensor _projection_weights_transposed{nullptr}; + Tensor _input_to_input_eff_bias{nullptr}; + Tensor _recurrent_to_input_eff_bias{nullptr}; + Tensor _input_to_forget_eff_bias{nullptr}; + Tensor _recurrent_to_forget_eff_bias{nullptr}; + Tensor _input_to_cell_eff_bias{nullptr}; + Tensor _recurrent_to_cell_eff_bias{nullptr}; + Tensor _input_to_output_eff_bias{nullptr}; + Tensor _recurrent_to_output_eff_bias{nullptr}; + Tensor _projection_reduction_res{nullptr}; + Tensor _projection_eff_bias{nullptr}; + Tensor _mm_input_to_forget_res{nullptr}; + Tensor _mm_recurrent_to_forget_res{nullptr}; + Tensor _mul_cell_to_forget_res{nullptr}; + Tensor _input_to_forget_outstage_res{nullptr}; + Tensor _cell_to_forget_outstage_res{nullptr}; + Tensor _recurrent_to_forget_outstage_res{nullptr}; + Tensor _forget_gate{nullptr}; + Tensor _mm_input_to_cell_res{nullptr}; + Tensor _input_to_cell_outstage_res{nullptr}; + Tensor 
_mm_recurrent_to_cell_res{nullptr}; + Tensor _recurrent_to_cell_outstage_res{nullptr}; + Tensor _cell_gate{nullptr}; + Tensor _mul_input_cell_res{nullptr}; + Tensor _mm_input_to_input_res{nullptr}; + Tensor _input_to_input_outstage_res{nullptr}; + Tensor _mm_recurrent_to_input_res{nullptr}; + Tensor _mul_cell_to_input_res{nullptr}; + Tensor _cell_to_input_outstage_res{nullptr}; + Tensor _recurrent_to_input_outstage_res{nullptr}; + Tensor _input_gate{nullptr}; + Tensor _mm_input_to_output_res{nullptr}; + Tensor _input_to_output_outstage_res{nullptr}; + Tensor _mm_recurrent_to_output_res{nullptr}; + Tensor _mul_cell_to_output_res{nullptr}; + Tensor _cell_to_output_outstage_res{nullptr}; + Tensor _recurrent_to_output_outstage_res{nullptr}; + Tensor _output_gate{nullptr}; + Tensor _hidden_mul_res{nullptr}; + Tensor _hidden_gate{nullptr}; + Tensor _mm_projection_res{nullptr}; + Tensor _projection_outstage_res{nullptr}; + Tensor _projection_out_res{nullptr}; + Tensor _projection_accumulate_res{nullptr}; + Tensor _ones{nullptr}; std::array<Tensor, _layer_norm_count> _layer_norm_output{}; inline Tensor &get_layer_norm_output(LayerNormGate g) @@ -436,14 +471,15 @@ private: return _layer_norm_output[getGateIndex(g)]; } - bool _is_prepared{ false }; - bool _has_cifg{ false }; - bool _has_cell_clipping{ false }; - bool _has_projection{ false }; - bool _has_projection_clipping{ false }; - bool _has_peephole{ false }; - bool _has_layer_norm{ false }; - bool _projection_tensor_copy_required{ false }; + bool _is_prepared{false}; + bool _has_cifg{false}; + bool _has_cell_clipping{false}; + bool _has_projection{false}; + bool _has_projection_clipping{false}; + bool _has_peephole{false}; + bool _has_layer_norm{false}; + bool _projection_tensor_copy_required{false}; + bool _convert_input_to_forget_weights_to_qsymm8{false}; }; } // namespace arm_compute #endif /* ARM_COMPUTE_NEQLSTMLAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NERNNLayer.h 
b/arm_compute/runtime/NEON/functions/NERNNLayer.h index 667d3144ac..af7f464ac9 100644 --- a/arm_compute/runtime/NEON/functions/NERNNLayer.h +++ b/arm_compute/runtime/NEON/functions/NERNNLayer.h @@ -72,7 +72,13 @@ public: * @param[in,out] hidden_state Output tensor of shape [num_units, batch_size]. Data types supported: Same as @p input * @param[in] info Activation layer parameter. */ - void configure(const ITensor *input, const ITensor *weights, const ITensor *recurrent_weights, const ITensor *bias, ITensor *hidden_state, ITensor *output, ActivationLayerInfo &info); + void configure(const ITensor *input, + const ITensor *weights, + const ITensor *recurrent_weights, + const ITensor *bias, + ITensor *hidden_state, + ITensor *output, + ActivationLayerInfo &info); /** Initialize the function * * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data types supported: F16/F32 @@ -85,7 +91,12 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *recurrent_weights, const ITensorInfo *bias, const ITensorInfo *hidden_state, const ITensorInfo *output, + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *recurrent_weights, + const ITensorInfo *bias, + const ITensorInfo *hidden_state, + const ITensorInfo *output, const ActivationLayerInfo &info); // Inherited methods overridden: diff --git a/arm_compute/runtime/NEON/functions/NEROIAlignLayer.h b/arm_compute/runtime/NEON/functions/NEROIAlignLayer.h index ea1af4daea..b06ebe899d 100644 --- a/arm_compute/runtime/NEON/functions/NEROIAlignLayer.h +++ b/arm_compute/runtime/NEON/functions/NEROIAlignLayer.h @@ -77,7 +77,10 @@ public: * * @return a Status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *rois, + ITensorInfo 
*output, + const ROIPoolingLayerInfo &pool_info); }; } // namespace arm_compute #endif /* ARM_COMPUTE_NEROIALIGNLAYER_H */ diff --git a/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h b/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h index 2992b3eb95..929111ad4b 100644 --- a/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h +++ b/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h @@ -26,6 +26,7 @@ #include "arm_compute/core/IArray.h" #include "arm_compute/runtime/IFunction.h" + #include <memory> namespace arm_compute @@ -73,7 +74,8 @@ public: * @note The z dimensions of @p output tensor and @p input tensor must be the same. * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array. */ - void configure(const ITensor *input, const ITensor *rois, const ITensor *output, const ROIPoolingLayerInfo &pool_info); + void + configure(const ITensor *input, const ITensor *rois, const ITensor *output, const ROIPoolingLayerInfo &pool_info); // Inherited methods overridden: void run() override; @@ -91,7 +93,10 @@ public: * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array. 
* @return a Status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *rois, + const ITensorInfo *output, + const ROIPoolingLayerInfo &pool_info); private: std::unique_ptr<NEROIPoolingLayerKernel> _roi_kernel; diff --git a/arm_compute/runtime/NEON/functions/NERange.h b/arm_compute/runtime/NEON/functions/NERange.h index cb14c8fdde..609456a4ef 100644 --- a/arm_compute/runtime/NEON/functions/NERange.h +++ b/arm_compute/runtime/NEON/functions/NERange.h @@ -26,6 +26,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" + #include <memory> namespace arm_compute diff --git a/arm_compute/runtime/NEON/functions/NEReduceMean.h b/arm_compute/runtime/NEON/functions/NEReduceMean.h index 7512115a3f..5b8d8cdf2b 100644 --- a/arm_compute/runtime/NEON/functions/NEReduceMean.h +++ b/arm_compute/runtime/NEON/functions/NEReduceMean.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2022 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -24,12 +24,9 @@ #ifndef ARM_COMPUTE_NEON_REDUCE_MEAN_H #define ARM_COMPUTE_NEON_REDUCE_MEAN_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/MemoryGroup.h" -#include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h" -#include "arm_compute/runtime/NEON/functions/NEQuantizationLayer.h" #include "arm_compute/runtime/NEON/functions/NEReductionOperation.h" #include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h" #include "arm_compute/runtime/Tensor.h" @@ -83,7 +80,8 @@ public: * * @return A status */ - static Status validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output); + static Status + validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output); // Inherited methods overridden: void run() override; @@ -93,13 +91,8 @@ private: std::vector<NEReductionOperation> _reduction_kernels; std::vector<Tensor> _reduced_outs; NEReshapeLayer _reshape; - NEDequantizationLayer _dequant; - NEQuantizationLayer _requant; int _reduction_ops; bool _keep_dims; - bool _do_requant; - Tensor _input_no_quant; - Tensor _output_no_quant; }; } // namespace arm_compute #endif /* ARM_COMPUTE_NEON_REDUCE_MEAN_H */ diff --git a/arm_compute/runtime/NEON/functions/NEReductionOperation.h b/arm_compute/runtime/NEON/functions/NEReductionOperation.h index 533c10adcf..f5391a6d0e 100644 --- a/arm_compute/runtime/NEON/functions/NEReductionOperation.h +++ b/arm_compute/runtime/NEON/functions/NEReductionOperation.h @@ -25,9 +25,9 @@ #define ARM_COMPUTE_NEREDUCTIONOPERATION_H #include "arm_compute/runtime/IFunction.h" - #include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h" #include "arm_compute/runtime/Tensor.h" + #include <memory> namespace arm_compute @@ -88,7 +88,11 @@ public: * * @return a status */ - static Status 
validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims = true); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + unsigned int axis, + ReductionOperation op, + bool keep_dims = true); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NERemap.h b/arm_compute/runtime/NEON/functions/NERemap.h deleted file mode 100644 index 271ac9739b..0000000000 --- a/arm_compute/runtime/NEON/functions/NERemap.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_NEREMAP_H -#define ARM_COMPUTE_NEREMAP_H - -#include "arm_compute/core/Types.h" -#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" -#include "arm_compute/runtime/Tensor.h" - -#include <cstdint> - -namespace arm_compute -{ -class ITensor; - -/** Basic function to execute remap. This function calls the following kernels: - * - * -# @ref NERemapKernel - */ -class NERemap : public INESimpleFunctionNoBorder -{ -public: - /** Initialise the function's sources, destination, interpolation policy and border mode. - * - * Valid data layouts: - * - All - * - * Valid data type configurations: - * |src0 |src1 |src2 |dst | - * |:------|:------|:------|:------| - * |U8 |F32 |F32 |U 8 | - * - * @param[in, out] input Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED) - * @param[in] map_x Map for X coordinates. Data type supported: F32. - * @param[in] map_y Map for Y coordinates. Data type supported: F32. - * @param[out] output Output tensor. Data type supported: U8. - * @param[in] policy Interpolation policy to use. Only NEAREST and BILINEAR are supported. - * @param[in] border_mode Border mode to use on the input tensor. - * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. Defaults to 0. - * - */ - void configure(ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output, - InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value = 0); -}; -} -#endif /*ARM_COMPUTE_NEREMAP_H */ diff --git a/arm_compute/runtime/NEON/functions/NEReorderLayer.h b/arm_compute/runtime/NEON/functions/NEReorderLayer.h new file mode 100644 index 0000000000..e3fa7b9c16 --- /dev/null +++ b/arm_compute/runtime/NEON/functions/NEReorderLayer.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(__aarch64__) + +#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEREORDERLAYER +#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEREORDERLAYER + +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" + +namespace arm_compute +{ +class ITensor; +class ITensorInfo; +class NEReorderKernel; +/** Function to compute blocked reorder. 
*/ +class NEReorderLayer : public IFunction +{ +public: + /** Default constructor */ + NEReorderLayer(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEReorderLayer(const NEReorderLayer &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEReorderLayer &operator=(const NEReorderLayer &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEReorderLayer(NEReorderLayer &&) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + NEReorderLayer &operator=(NEReorderLayer &&) = delete; + /** Default destructor */ + ~NEReorderLayer(); + /** Set the input and output tensors. + * + * Valid data layouts: + * - NCHW + * + * Valid data type configurations: + * |src |dst | + * |:--------|:---------| + * |F32 |F32 | + * + * @param[in] input Source tensor. Data type supported: F32. Data layouts supported: NCHW. + * @param[out] output Destination with the same dimensions, data type, data layout as @p input + * except last dimension of data layout which needs to be multiple of blocking parameter ksize + * @param[in] input_wf WeightFormat of input. + * @param[in] output_wf WeightFormat of output. 
+ */ + void configure(const ITensor *input, + ITensor *output, + arm_compute::WeightFormat input_wf, + arm_compute::WeightFormat output_wf); + + /** Static function to check if given info will lead to a valid configuration of @ref NEReorderLayer + * + * Similar to @ref NEReorderLayer::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + arm_compute::WeightFormat input_wf, + arm_compute::WeightFormat output_wf); + + // Inherited methods overridden: + void run() override; + +private: + std::unique_ptr<NEReorderKernel> _reorder_kernel; /**< Reorder layer kernel */ +}; +} // namespace arm_compute +#endif /* ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEREORDERLAYER */ + +#endif // defined(__aarch64__) diff --git a/arm_compute/runtime/NEON/functions/NEReverse.h b/arm_compute/runtime/NEON/functions/NEReverse.h index c02fff54a5..e03e415068 100644 --- a/arm_compute/runtime/NEON/functions/NEReverse.h +++ b/arm_compute/runtime/NEON/functions/NEReverse.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,12 +21,11 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_NEREVERSE_H -#define ARM_COMPUTE_NEREVERSE_H - -#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" +#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEREVERSE_H +#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEREVERSE_H #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" namespace arm_compute { @@ -45,22 +44,33 @@ public: * Valid data type configurations: * |src0 |src1 |dst | * |:--------------|:--------------|:--------------| - * |All |U32 |All | + * |All |U32, S32 |All | + * + * @param[in] input Input tensor. Data types supported: All + * @param[out] output Output tensor. 
Data type supported: Same as @p input + * @param[in] axis Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32/S32 + * @param[in] use_inverted_axis Reverse ACL axis indices convention, if true, (inverted)axis = (tensor_rank - 1) - axis + * + * @note The value of each axis should be between [-rank, rank) + * @note If there are duplicate values in the tensor, the subsequent axis values are ignored. e.g. an array of [2, 2] has the same effects as [2]. + * + * @deprecated Support for U32 in axis tensor will be removed in 24.02 release * - * @param[in] input Input tensor. Data types supported: All - * @param[out] output Output tensor. Data type supported: Same as @p input - * @param[in] axis Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32 */ - void configure(const ITensor *input, ITensor *output, const ITensor *axis); + void configure(const ITensor *input, ITensor *output, const ITensor *axis, const bool use_inverted_axis = false); /** Static function to check if given info will lead to a valid configuration of @ref NEReverseKernel * - * @param[in] input Input tensor info. Data types supported: All - * @param[in] output Output tensor info. Data type supported: Same as @p input - * @param[in] axis Axis tensor info. Contains the indices of the dimensions to reverse. Data type supported: U32 + * @param[in] input Input tensor info. Data types supported: All + * @param[in] output Output tensor info. Data type supported: Same as @p input + * @param[in] axis Axis tensor info. Contains the indices of the dimensions to reverse. 
Data type supported: U32/S32 + * @param[in] use_inverted_axis Reverse ACL axis indices convention, if true, (inverted)axis = (tensor_rank - 1) - axis * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const ITensorInfo *axis, + const bool use_inverted_axis = false); }; } // namespace arm_compute -#endif /* ARM_COMPUTE_NEREVERSE_H */ +#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEREVERSE_H diff --git a/arm_compute/runtime/NEON/functions/NEScale.h b/arm_compute/runtime/NEON/functions/NEScale.h index 0b7dddacb2..72dfa3bda4 100644 --- a/arm_compute/runtime/NEON/functions/NEScale.h +++ b/arm_compute/runtime/NEON/functions/NEScale.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2021 Arm Limited. + * Copyright (c) 2016-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,10 +24,9 @@ #ifndef ARM_COMPUTE_NESCALEIMAGE_H #define ARM_COMPUTE_NESCALEIMAGE_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" #include <memory> @@ -66,16 +65,19 @@ public: * |F16 |F16 | * |F32 |F32 | * |U8 |U8 | + * |S8 |S8 | * |S16 |S16 | * - * @param[in, out] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED) + * @param[in, out] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/U8/S8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED) * @param[out] output Destination tensor. Data type supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane. 
* @param[in] info @ref ScaleKernelInfo to be used for configuration + * + * @note Using S8 data type only supports NHWC, @p border_mode Replicate, and @p policy Bilinear */ void configure(ITensor *input, ITensor *output, const ScaleKernelInfo &info); /** Static function to check if given info will lead to a valid configuration of @ref NEScale * - * @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED) + * @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/U8/S8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED) * @param[in] output Destination tensor. Data type supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane. * @param[in] info @ref ScaleKernelInfo to be used for validation * diff --git a/arm_compute/runtime/NEON/functions/NESlice.h b/arm_compute/runtime/NEON/functions/NESlice.h index ac79a5c633..70a688d3b0 100644 --- a/arm_compute/runtime/NEON/functions/NESlice.h +++ b/arm_compute/runtime/NEON/functions/NESlice.h @@ -85,7 +85,8 @@ public: * * @return A status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends); + static Status + validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends); // Inherited methods overridden: void run() override; @@ -129,7 +130,8 @@ public: * * @return A status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends); + static Status + validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends); }; } // namespace experimental } // namespace arm_compute diff --git a/arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h 
b/arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h index ad8c1467d0..5dee61a4a8 100644 --- a/arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h +++ b/arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h @@ -24,9 +24,9 @@ #ifndef ARM_COMPUTE_NESPACETOBATCHLAYER_H #define ARM_COMPUTE_NESPACETOBATCHLAYER_H +#include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/core/Types.h" #include <memory> namespace arm_compute @@ -82,7 +82,12 @@ public: * @param[in] padding_right The padding at the end of every dimension of the output tensor. * @param[out] output Tensor output. Data types supported: same as @p input */ - void configure(const ITensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ITensor *output); + void configure(const ITensor *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + ITensor *output); /** Static function to check if given info will lead to a valid configuration of @ref NESpaceToBatchLayer * * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All. @@ -92,7 +97,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output); + static Status validate(const ITensorInfo *input, + const ITensorInfo *block_shape, + const ITensorInfo *paddings, + const ITensorInfo *output); /** Static function to check if given info will lead to a valid configuration of @ref NESpaceToBatchLayer (Static block shape and paddings) * * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All. 
@@ -104,7 +112,12 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, const ITensorInfo *output); + static Status validate(const ITensorInfo *input, + const int block_shape_x, + const int block_shape_y, + const Size2D &padding_left, + const Size2D &padding_right, + const ITensorInfo *output); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NESplit.h b/arm_compute/runtime/NEON/functions/NESplit.h index 206f299c06..36358a7094 100644 --- a/arm_compute/runtime/NEON/functions/NESplit.h +++ b/arm_compute/runtime/NEON/functions/NESplit.h @@ -26,7 +26,6 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" - #include "arm_compute/runtime/CPP/functions/CPPSplit.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/NEON/functions/NESlice.h" diff --git a/arm_compute/runtime/NEON/functions/NEStackLayer.h b/arm_compute/runtime/NEON/functions/NEStackLayer.h index ae4e468f21..98dacde0c1 100644 --- a/arm_compute/runtime/NEON/functions/NEStackLayer.h +++ b/arm_compute/runtime/NEON/functions/NEStackLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef ARM_COMPUTE_NESTACKLAYER_H -#define ARM_COMPUTE_NESTACKLAYER_H +#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NESTACKLAYER_H +#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NESTACKLAYER_H #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" @@ -91,9 +91,8 @@ public: void run() override; private: - std::vector<ITensor *> _input; - std::vector<std::unique_ptr<NEStackLayerKernel>> _stack_kernels; - unsigned int _num_inputs; + std::unique_ptr<NEStackLayerKernel> _stack_kernel; + bool _is_prepared; }; } // namespace arm_compute -#endif /* ARM_COMPUTE_NESTACKLAYER_H */ +#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NESTACKLAYER_H diff --git a/arm_compute/runtime/NEON/functions/NEStridedSlice.h b/arm_compute/runtime/NEON/functions/NEStridedSlice.h index 4b14d946f6..fa1113ffec 100644 --- a/arm_compute/runtime/NEON/functions/NEStridedSlice.h +++ b/arm_compute/runtime/NEON/functions/NEStridedSlice.h @@ -71,9 +71,14 @@ public: * @param[in] shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1. * A slice of size 1 starting from starts[i] in the dimension must be preserved. */ - void configure(const ITensor *input, ITensor *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0); + void configure(const ITensor *input, + ITensor *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask = 0, + int32_t end_mask = 0, + int32_t shrink_axis_mask = 0); /** Static function to check if given info will lead to a valid configuration of @ref NEStridedSlice * @@ -89,9 +94,14 @@ public: * @param[in] shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1. 
* A slice of size 1 starting from starts[i] in the dimension must be preserved. */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask = 0, + int32_t end_mask = 0, + int32_t shrink_axis_mask = 0); // Inherited methods overridden: void run() override; @@ -121,9 +131,14 @@ public: * @param[in] shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1. * A slice of size 1 starting from starts[i] in the dimension must be preserved. */ - void configure(const ITensorInfo *input, ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0); + void configure(const ITensorInfo *input, + ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask = 0, + int32_t end_mask = 0, + int32_t shrink_axis_mask = 0); /** Static function to check if given info will lead to a valid configuration of @ref NEStridedSlice * @@ -139,9 +154,14 @@ public: * @param[in] shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1. * A slice of size 1 starting from starts[i] in the dimension must be preserved. 
*/ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, - const Coordinates &starts, const Coordinates &ends, const BiStrides &strides, - int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const Coordinates &starts, + const Coordinates &ends, + const BiStrides &strides, + int32_t begin_mask = 0, + int32_t end_mask = 0, + int32_t shrink_axis_mask = 0); }; } // namespace experimental } // namespace arm_compute diff --git a/arm_compute/runtime/NEON/functions/NETile.h b/arm_compute/runtime/NEON/functions/NETile.h index 915e5aa1da..001a0a4128 100644 --- a/arm_compute/runtime/NEON/functions/NETile.h +++ b/arm_compute/runtime/NEON/functions/NETile.h @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_NETILE_H #define ARM_COMPUTE_NETILE_H -#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h" namespace arm_compute { diff --git a/arm_compute/runtime/NEON/functions/NETranspose.h b/arm_compute/runtime/NEON/functions/NETranspose.h index 581fe74309..5d2d1f1b01 100644 --- a/arm_compute/runtime/NEON/functions/NETranspose.h +++ b/arm_compute/runtime/NEON/functions/NETranspose.h @@ -24,9 +24,8 @@ #ifndef ARM_COMPUTE_NETRANSPOSE_H #define ARM_COMPUTE_NETRANSPOSE_H -#include "arm_compute/runtime/IFunction.h" - #include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" #include <memory> @@ -83,4 +82,4 @@ private: std::unique_ptr<Impl> _impl; }; } // namespace arm_compute -#endif /* ARM_COMPUTE_NETRANSPOSE_H */
\ No newline at end of file +#endif /* ARM_COMPUTE_NETRANSPOSE_H */ diff --git a/arm_compute/runtime/NEON/functions/NEUnstack.h b/arm_compute/runtime/NEON/functions/NEUnstack.h index 079fee5b9e..e1af96d08d 100644 --- a/arm_compute/runtime/NEON/functions/NEUnstack.h +++ b/arm_compute/runtime/NEON/functions/NEUnstack.h @@ -26,7 +26,6 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" - #include "arm_compute/runtime/NEON/functions/NEStridedSlice.h" #include <memory> diff --git a/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h index f9ebf608cb..6caa2aeb59 100644 --- a/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,17 +21,12 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_NEWINOGRADCONVOLUTIONLAYER_H -#define ARM_COMPUTE_NEWINOGRADCONVOLUTIONLAYER_H - -#include "arm_compute/runtime/IFunction.h" +#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEWINOGRADCONVOLUTIONLAYER_H +#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEWINOGRADCONVOLUTIONLAYER_H #include "arm_compute/core/Types.h" -#include "arm_compute/runtime/CPP/functions/CPPPermute.h" -#include "arm_compute/runtime/MemoryGroup.h" -#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" -#include "arm_compute/runtime/NEON/functions/NEGEMM.h" - +#include "arm_compute/function_info/ActivationLayerInfo.h" +#include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/Tensor.h" #include <memory> @@ -40,13 +35,11 @@ namespace arm_compute { // Forward declarations class ITensor; -class ICPPKernel; /** Basic function to simulate a convolution layer. 
This function calls the following kernels: * - * -# @ref NEWinogradLayerTransformWeightsKernel (executed only once in the first call to the run() method ) - * -# @ref NEWinogradLayerTransformInputKernel - * -# @ref NEWinogradLayerTransformOutputKernel + * -# @ref cpu::CpuWinogradConv2dTransformInputKernel + * -# @ref cpu::CpuWinogradConv2dTransformOutputKernel * -# @ref cpu::CpuGemmAssemblyDispatch * -# @ref CPPPermute (three times: weights, input and output) * @@ -57,12 +50,16 @@ class NEWinogradConvolutionLayer : public IFunction public: /** Constructor */ NEWinogradConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager = nullptr); - /** Prevent instances of this class from being moved (As this class contains non movable objects) */ - NEWinogradConvolutionLayer(NEWinogradConvolutionLayer &&) = delete; - /** Prevent instances of this class from being moved (As this class contains non movable objects) */ - NEWinogradConvolutionLayer &operator=(NEWinogradConvolutionLayer &&) = delete; - /** Default destructor */ - ~NEWinogradConvolutionLayer() = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEWinogradConvolutionLayer(const NEWinogradConvolutionLayer &) = delete; + /** Default move constructor */ + NEWinogradConvolutionLayer(NEWinogradConvolutionLayer &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEWinogradConvolutionLayer &operator=(const NEWinogradConvolutionLayer &) = delete; + /** Default move assignment operator */ + NEWinogradConvolutionLayer &operator=(NEWinogradConvolutionLayer &&) = default; + /** Destructor */ + ~NEWinogradConvolutionLayer(); /** Set the input and output tensors. * @@ -80,7 +77,8 @@ public: * while every optional dimension from 4 and above represent a batch of inputs. * Data types supported: F16/F32. * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. 
Data type supported: Same as @p input. - * Currently only 3x3 and 5x5 kernels are supported. + * Supported kernel sizes: (height, width) -> 3x3, 1x3, 3x1, 5x5, 1x5, 5x1 for Fp32 + * -> 3x3 for Fp16 * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights. * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. * Data types supported: Same as @p input. @@ -89,63 +87,35 @@ public: * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation * available which may introduce a drop of accuracy as well. Default is false */ - void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo(), - bool enable_fast_math = false); + void configure(const ITensor *input, + const ITensor *weights, + const ITensor *biases, + ITensor *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false); // Inherited methods overridden: void run() override; void prepare() override; - /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConvolutionLayer + /** Static function to check if given info will lead to a valid configuration of @ref NEWinogradConvolutionLayer * - * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. - * Data types supported: F16/F32. - * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input. - * Currently only 3x3 and 5x5 kernels are supported. 
- * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights. - * @param[in] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. - * Data types supported: Same as @p input. - * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. Currently only unit strides are supported. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. - * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation - * available which may introduce a drop of accuracy as well. Default is false + * Similar to @ref NEWinogradConvolutionLayer::configure() * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); - - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEWinogradConvolutionLayer(const NEWinogradConvolutionLayer &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEWinogradConvolutionLayer &operator=(const NEWinogradConvolutionLayer &) = delete; + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false); private: - MemoryGroup _memory_group; - NEGEMM _gemm_function; - std::unique_ptr<ICPPKernel> _transform_input_kernel; - std::unique_ptr<ICPPKernel> _transform_output_kernel; - std::unique_ptr<ICPPKernel> 
_transform_weights_kernel; - NEActivationLayer _activationlayer_function; - - CPPPermute _permute_input; - CPPPermute _permute_weights; - CPPPermute _permute_output; - Tensor _input_transformed; - Tensor _output_transformed; - Tensor _input_workspace; - Tensor _output_workspace; - Tensor _kernel_storage; - Tensor _input_nhwc; - Tensor _output_nhwc; - Tensor _weights_hwio; - const ITensor *_input; - const ITensor *_weights; - ITensor *_output; - bool _is_prepared; - bool _is_activationlayer_enabled; - DataLayout _data_layout; + struct Impl; + std::unique_ptr<Impl> _impl; }; } // namespace arm_compute -#endif /* ARM_COMPUTE_NEWINOGRADCONVOLUTIONLAYER_H */ +#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEWINOGRADCONVOLUTIONLAYER_H |