From 7891a73ef36f4ad7b71069b3c57694f85bb79454 Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Fri, 20 Aug 2021 21:39:25 +0100
Subject: Move CPU/GPU files from Core/Runtime to the respective backend folders

The legacy structure contained two libraries, core and runtime, each with
two backends inside. We reduce the core/runtime split to a single library,
merging the backend files into their respective backend folders.

Signed-off-by: Georgios Pinitas
Change-Id: I69545765fe7a730368105cdbd067d3135ec7a174
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6155
Comments-Addressed: Arm Jenkins
Reviewed-by: Michele Di Giorgio
Tested-by: Arm Jenkins
---
 src/runtime/cpu/ICpuOperator.h                     |  36 -
 src/runtime/cpu/operators/CpuActivation.cpp        |  72 --
 src/runtime/cpu/operators/CpuActivation.h          |  54 --
 src/runtime/cpu/operators/CpuAdd.cpp               |  46 --
 src/runtime/cpu/operators/CpuAdd.h                 |  68 --
 src/runtime/cpu/operators/CpuCast.cpp              |  44 --
 src/runtime/cpu/operators/CpuCast.h                |  71 --
 src/runtime/cpu/operators/CpuConcatenate.cpp       | 168 -----
 src/runtime/cpu/operators/CpuConcatenate.h         |  76 --
 src/runtime/cpu/operators/CpuConv2d.cpp            | 253 -------
 src/runtime/cpu/operators/CpuConv2d.h              | 146 ----
 .../operators/CpuConvertFullyConnectedWeights.cpp  |  50 --
 .../operators/CpuConvertFullyConnectedWeights.h    |  57 --
 src/runtime/cpu/operators/CpuCopy.cpp              |  44 --
 src/runtime/cpu/operators/CpuCopy.h                |  53 --
 src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp   | 498 ------------
 src/runtime/cpu/operators/CpuDepthwiseConv2d.h     | 209 -----
 .../CpuDepthwiseConv2dAssemblyDispatch.cpp         | 135 ----
 .../operators/CpuDepthwiseConv2dAssemblyDispatch.h |  80 --
 src/runtime/cpu/operators/CpuDequantize.cpp        |  54 --
 src/runtime/cpu/operators/CpuDequantize.h          |  56 --
 src/runtime/cpu/operators/CpuDirectConv2d.cpp      | 147 ----
 src/runtime/cpu/operators/CpuDirectConv2d.h        | 105 ---
 src/runtime/cpu/operators/CpuElementwise.cpp       | 124 ---
 src/runtime/cpu/operators/CpuElementwise.h         | 185 -----
 src/runtime/cpu/operators/CpuElementwiseUnary.cpp  |  58 --
 src/runtime/cpu/operators/CpuElementwiseUnary.h    |  59 --
 src/runtime/cpu/operators/CpuFill.cpp              |  39 -
 src/runtime/cpu/operators/CpuFill.h                |  46 --
 src/runtime/cpu/operators/CpuFlatten.cpp           |  44 --
 src/runtime/cpu/operators/CpuFlatten.h             |  64 --
 src/runtime/cpu/operators/CpuFloor.cpp             |  44 --
 src/runtime/cpu/operators/CpuFloor.h               |  53 --
 src/runtime/cpu/operators/CpuFullyConnected.cpp    | 496 ------------
 src/runtime/cpu/operators/CpuFullyConnected.h      | 147 ----
 src/runtime/cpu/operators/CpuGemm.cpp              | 367 ---------
 src/runtime/cpu/operators/CpuGemm.h                | 145 ----
 src/runtime/cpu/operators/CpuGemmConv2d.cpp        | 612 ---------------
 src/runtime/cpu/operators/CpuGemmConv2d.h          | 203 -----
 src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp  | 222 ------
 src/runtime/cpu/operators/CpuGemmDirectConv2d.h    | 106 ---
 .../operators/CpuGemmLowpMatrixMultiplyCore.cpp    | 711 -----------------
 .../cpu/operators/CpuGemmLowpMatrixMultiplyCore.h  | 174 -----
 .../cpu/operators/CpuGemmLowpOutputStage.cpp       | 147 ----
 src/runtime/cpu/operators/CpuGemmLowpOutputStage.h |  86 ---
 src/runtime/cpu/operators/CpuMul.cpp               |  77 --
 src/runtime/cpu/operators/CpuMul.h                 | 105 ---
 src/runtime/cpu/operators/CpuPRelu.h               |  38 -
 src/runtime/cpu/operators/CpuPermute.cpp           |  44 --
 src/runtime/cpu/operators/CpuPermute.h             |  56 --
 src/runtime/cpu/operators/CpuPool2d.cpp            | 158 ----
 src/runtime/cpu/operators/CpuPool2d.h              |  85 ---
 src/runtime/cpu/operators/CpuQuantize.cpp          |  58 --
 src/runtime/cpu/operators/CpuQuantize.h            |  56 --
 src/runtime/cpu/operators/CpuReshape.cpp           |  44 --
 src/runtime/cpu/operators/CpuReshape.h             |  53 --
 src/runtime/cpu/operators/CpuScale.cpp             | 250 ------
 src/runtime/cpu/operators/CpuScale.h               |  69 --
 src/runtime/cpu/operators/CpuSoftmax.cpp           | 221 ------
 src/runtime/cpu/operators/CpuSoftmax.h             | 111 ---
 src/runtime/cpu/operators/CpuSub.cpp               |  46 --
 src/runtime/cpu/operators/CpuSub.h                 |  66 --
 src/runtime/cpu/operators/CpuTranspose.cpp         |  44 --
 src/runtime/cpu/operators/CpuTranspose.h           |  53 --
 src/runtime/cpu/operators/CpuWinogradConv2d.cpp    | 839 ---------------------
 src/runtime/cpu/operators/CpuWinogradConv2d.h      | 136 ----
 .../operators/internal/CpuGemmAssemblyDispatch.cpp | 721 ------------------
 .../operators/internal/CpuGemmAssemblyDispatch.h   | 123 ---
 src/runtime/cpu/utils/CpuAuxTensorHandler.h        | 111 ---
 69 files changed, 10218 deletions(-)
 delete mode 100644 src/runtime/cpu/ICpuOperator.h
 delete mode 100644 src/runtime/cpu/operators/CpuActivation.cpp
 delete mode 100644 src/runtime/cpu/operators/CpuActivation.h
 delete mode 100644 src/runtime/cpu/operators/CpuAdd.cpp
 delete mode 100644 src/runtime/cpu/operators/CpuAdd.h
 delete mode 100644 src/runtime/cpu/operators/CpuCast.cpp
 delete mode 100644 src/runtime/cpu/operators/CpuCast.h
 delete mode 100644 src/runtime/cpu/operators/CpuConcatenate.cpp
 delete mode 100644 src/runtime/cpu/operators/CpuConcatenate.h
 delete mode 100644 src/runtime/cpu/operators/CpuConv2d.cpp
 delete mode 100644 src/runtime/cpu/operators/CpuConv2d.h
 delete mode 100644 src/runtime/cpu/operators/CpuConvertFullyConnectedWeights.cpp
 delete mode 100644 src/runtime/cpu/operators/CpuConvertFullyConnectedWeights.h
 delete mode 100644 src/runtime/cpu/operators/CpuCopy.cpp
 delete mode 100644 src/runtime/cpu/operators/CpuCopy.h
 delete mode 100644 src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp
 delete mode 100644 src/runtime/cpu/operators/CpuDepthwiseConv2d.h
 delete mode 100644 src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
 delete mode 100644 src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h
 delete mode 100644 src/runtime/cpu/operators/CpuDequantize.cpp
 delete mode 100644 src/runtime/cpu/operators/CpuDequantize.h
 delete mode 100644 src/runtime/cpu/operators/CpuDirectConv2d.cpp
 delete mode 100644 src/runtime/cpu/operators/CpuDirectConv2d.h
 delete mode 100644 src/runtime/cpu/operators/CpuElementwise.cpp
 delete mode 100644 src/runtime/cpu/operators/CpuElementwise.h
 delete mode 100644 src/runtime/cpu/operators/CpuElementwiseUnary.cpp
 delete mode 100644 src/runtime/cpu/operators/CpuElementwiseUnary.h
 delete mode 100644 src/runtime/cpu/operators/CpuFill.cpp
 delete mode 100644 src/runtime/cpu/operators/CpuFill.h
 delete mode 100644 src/runtime/cpu/operators/CpuFlatten.cpp
 delete mode 100644 src/runtime/cpu/operators/CpuFlatten.h
 delete mode 100644 src/runtime/cpu/operators/CpuFloor.cpp
 delete mode 100644 src/runtime/cpu/operators/CpuFloor.h
 delete mode 100644 src/runtime/cpu/operators/CpuFullyConnected.cpp
 delete mode 100644 src/runtime/cpu/operators/CpuFullyConnected.h
 delete mode 100644 src/runtime/cpu/operators/CpuGemm.cpp
 delete mode 100644 src/runtime/cpu/operators/CpuGemm.h
 delete mode 100644 src/runtime/cpu/operators/CpuGemmConv2d.cpp
 delete mode 100644 src/runtime/cpu/operators/CpuGemmConv2d.h
 delete mode 100644 src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp
 delete mode 100644 src/runtime/cpu/operators/CpuGemmDirectConv2d.h
 delete mode 100644 src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp
 delete mode 100644 src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h
 delete mode 100644 src/runtime/cpu/operators/CpuGemmLowpOutputStage.cpp
 delete mode 100644 src/runtime/cpu/operators/CpuGemmLowpOutputStage.h
 delete mode 100644 src/runtime/cpu/operators/CpuMul.cpp
 delete mode 100644 src/runtime/cpu/operators/CpuMul.h
 delete mode 100644 src/runtime/cpu/operators/CpuPRelu.h
 delete mode 100644 src/runtime/cpu/operators/CpuPermute.cpp
 delete mode 100644 src/runtime/cpu/operators/CpuPermute.h
 delete mode 100644 src/runtime/cpu/operators/CpuPool2d.cpp
 delete mode 100644 src/runtime/cpu/operators/CpuPool2d.h
 delete mode 100644 src/runtime/cpu/operators/CpuQuantize.cpp
 delete mode 100644 src/runtime/cpu/operators/CpuQuantize.h
 delete mode 100644 src/runtime/cpu/operators/CpuReshape.cpp
 delete mode 100644 src/runtime/cpu/operators/CpuReshape.h
 delete mode 100644 src/runtime/cpu/operators/CpuScale.cpp
 delete mode 100644 src/runtime/cpu/operators/CpuScale.h
 delete mode 100644 src/runtime/cpu/operators/CpuSoftmax.cpp
 delete mode 100644 src/runtime/cpu/operators/CpuSoftmax.h
 delete mode 100644 src/runtime/cpu/operators/CpuSub.cpp
 delete mode 100644 src/runtime/cpu/operators/CpuSub.h
 delete mode 100644 src/runtime/cpu/operators/CpuTranspose.cpp
 delete mode 100644 src/runtime/cpu/operators/CpuTranspose.h
 delete mode 100644 src/runtime/cpu/operators/CpuWinogradConv2d.cpp
 delete mode 100644 src/runtime/cpu/operators/CpuWinogradConv2d.h
 delete mode 100644 src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
 delete mode 100644 src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h
 delete mode 100644 src/runtime/cpu/utils/CpuAuxTensorHandler.h

(limited to 'src/runtime/cpu')

diff --git a/src/runtime/cpu/ICpuOperator.h b/src/runtime/cpu/ICpuOperator.h
deleted file mode 100644
index 70ab4364c7..0000000000
--- a/src/runtime/cpu/ICpuOperator.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_ICPUOPERATOR_H
-#define ARM_COMPUTE_ICPUOPERATOR_H
-
-#include "arm_compute/runtime/NEON/INEOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-using ICpuOperator = experimental::INEOperator;
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_ICPUOPERATOR_H */
diff --git a/src/runtime/cpu/operators/CpuActivation.cpp b/src/runtime/cpu/operators/CpuActivation.cpp
deleted file mode 100644
index 0b43b322ad..0000000000
--- a/src/runtime/cpu/operators/CpuActivation.cpp
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuActivation.h"
-
-#include "src/common/IOperator.h"
-#include "src/common/utils/LegacySupport.h"
-#include "src/core/cpu/kernels/CpuActivationKernel.h"
-#include "src/cpu/CpuContext.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void CpuActivation::configure(const ITensorInfo *input, ITensorInfo *output, const ActivationLayerInfo &activation_info)
-{
-    auto k = std::make_unique<kernels::CpuActivationKernel>();
-    k->configure(input, output, activation_info);
-    _kernel = std::move(k);
-}
-
-Status CpuActivation::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &activation_info)
-{
-    return kernels::CpuActivationKernel::validate(input, output, activation_info);
-}
-
-std::tuple<IOperator *, StatusCode> CpuContext::create_activation(const AclTensorDescriptor &src, const AclTensorDescriptor &dst, const AclActivationDescriptor &act, bool is_validate)
-{
-    TensorInfo src_info = detail::convert_to_legacy_tensor_info(src);
-    TensorInfo dst_info = detail::convert_to_legacy_tensor_info(dst);
-    auto       info     = detail::convert_to_activation_info(act);
-
-    if(is_validate && !bool(CpuActivation::validate(&src_info.set_is_resizable(false), &dst_info.set_is_resizable(false), info)))
-    {
-        return std::make_tuple(nullptr, StatusCode::UnsupportedConfig);
-    }
-
-    auto act_op = std::make_unique<cpu::CpuActivation>();
-    act_op->configure(&src_info, &dst_info, info);
-
-    auto op = new arm_compute::IOperator(static_cast<IContext *>(this));
-    if(op == nullptr)
-    {
-        ARM_COMPUTE_LOG_ERROR_ACL("Couldn't allocate internal resources");
-        return std::make_tuple(nullptr, StatusCode::OutOfMemory);
-    }
-    op->set_internal_operator(std::move(act_op));
-
-    return std::make_tuple(op, StatusCode::Success);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuActivation.h b/src/runtime/cpu/operators/CpuActivation.h
deleted file mode 100644
index ded4a37edb..0000000000
--- a/src/runtime/cpu/operators/CpuActivation.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_ACTIVATION_H
-#define ARM_COMPUTE_CPU_ACTIVATION_H
-
-#include "src/runtime/cpu/ICpuOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Basic function to run @ref kernels::CpuActivationKernel */
-class CpuActivation : public ICpuOperator
-{
-public:
-    /** Configure operator for a given list of arguments
-     *
-     * @param[in]  input           Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32.
-     * @param[out] output          Destination tensor info. Data type supported: same as @p src
-     * @param[in]  activation_info Activation layer parameters.
-     */
-    void configure(const ITensorInfo *input, ITensorInfo *output, const ActivationLayerInfo &activation_info);
-    /** Static function to check if given info will lead to a valid configuration
-     *
-     * Similar to @ref CpuActivation::configure()
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info);
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_ACTIVATION_H */
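For orientation, a minimal usage sketch (not part of this patch): the Cpu* operators above are stateless, so configure() and validate() work purely on ITensorInfo metadata, and actual buffers are bound per run through an ITensorPack. The shapes and the helper function name below are illustrative assumptions; the pack-building pattern mirrors CpuConcatenate::run() further down in this patch.

#include "arm_compute/core/Error.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/Tensor.h"
#include "src/runtime/cpu/operators/CpuActivation.h"

using namespace arm_compute;

void activation_sketch()
{
    // configure()/validate() operate on metadata only; no buffers are touched here.
    TensorInfo src_info(TensorShape(16U, 16U), 1, DataType::F32);
    TensorInfo dst_info(TensorShape(16U, 16U), 1, DataType::F32);
    const ActivationLayerInfo act(ActivationLayerInfo::ActivationFunction::RELU);

    ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuActivation::validate(&src_info, &dst_info, act));

    cpu::CpuActivation op;
    op.configure(&src_info, &dst_info, act);

    // Memory is bound late, per run, through a tensor pack.
    Tensor src, dst;
    src.allocator()->init(src_info);
    dst.allocator()->init(dst_info);
    src.allocator()->allocate();
    dst.allocator()->allocate();

    ITensorPack pack;
    pack.add_tensor(TensorType::ACL_SRC, &src);
    pack.add_tensor(TensorType::ACL_DST, &dst);
    op.run(pack);
}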
diff --git a/src/runtime/cpu/operators/CpuAdd.cpp b/src/runtime/cpu/operators/CpuAdd.cpp
deleted file mode 100644
index 23b09aca4f..0000000000
--- a/src/runtime/cpu/operators/CpuAdd.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuAdd.h"
-
-#include "src/core/cpu/kernels/CpuAddKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void CpuAdd::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info)
-{
-    ARM_COMPUTE_UNUSED(act_info);
-    auto k = std::make_unique<kernels::CpuAddKernel>();
-    k->configure(src0, src1, dst, policy);
-    _kernel = std::move(k);
-}
-
-Status CpuAdd::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
-    return kernels::CpuAddKernel::validate(src0, src1, dst, policy);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuAdd.h b/src/runtime/cpu/operators/CpuAdd.h
deleted file mode 100644
index 3ff135fe41..0000000000
--- a/src/runtime/cpu/operators/CpuAdd.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_ADD_H
-#define ARM_COMPUTE_CPU_ADD_H
-
-#include "src/runtime/cpu/ICpuOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Basic function to run @ref kernels::CpuAddKernel */
-class CpuAdd : public ICpuOperator
-{
-public:
-    /** Initialise the kernel's input, dst and border mode.
-     *
-     * Valid configurations (src0,src1) -> dst :
-     *
-     *   - (U8,U8)                         -> U8
-     *   - (S16,S16)                       -> S16
-     *   - (S32,S32)                       -> S32
-     *   - (F16,F16)                       -> F16
-     *   - (F32,F32)                       -> F32
-     *   - (QASYMM8,QASYMM8)               -> QASYMM8
-     *   - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
-     *   - (QSYMM16,QSYMM16)               -> QSYMM16
-     *
-     * @param[in]  src0     First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
-     * @param[in]  src1     Second input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
-     * @param[out] dst      The dst tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32.
-     * @param[in]  policy   Overflow policy.
-     * @param[in]  act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
-     */
-    void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-    /** Static function to check if given info will lead to a valid configuration
-     *
-     * Similar to @ref CpuAdd::configure()
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_ADD_H */
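The same two-phase pattern applies to the binary operators. One detail worth noting from CpuAdd::validate() above: a fused activation is rejected, so act_info must stay at its default. A hedged sketch with illustrative shapes:

// Sketch (not part of the patch): element-wise F32 addition.
// act_info is left at its default because CpuAdd::validate()
// returns an error when act_info.enabled() is true.
TensorInfo a(TensorShape(8U, 8U), 1, DataType::F32);
TensorInfo b(TensorShape(8U, 8U), 1, DataType::F32);
TensorInfo sum(TensorShape(8U, 8U), 1, DataType::F32);

cpu::CpuAdd add_op;
ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuAdd::validate(&a, &b, &sum, ConvertPolicy::SATURATE));
add_op.configure(&a, &b, &sum, ConvertPolicy::SATURATE);
// At run time, pack ACL_SRC_0, ACL_SRC_1 and ACL_DST into an ITensorPack and
// call add_op.run(pack), as in the CpuActivation sketch above.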
diff --git a/src/runtime/cpu/operators/CpuCast.cpp b/src/runtime/cpu/operators/CpuCast.cpp
deleted file mode 100644
index 5a4f6c518e..0000000000
--- a/src/runtime/cpu/operators/CpuCast.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuCast.h"
-
-#include "src/core/cpu/kernels/CpuCastKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void CpuCast::configure(const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy)
-{
-    auto k = std::make_unique<kernels::CpuCastKernel>();
-    k->configure(src, dst, policy);
-    _kernel = std::move(k);
-}
-
-Status CpuCast::validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy)
-{
-    return kernels::CpuCastKernel::validate(src, dst, policy);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuCast.h b/src/runtime/cpu/operators/CpuCast.h
deleted file mode 100644
index 26f5740b86..0000000000
--- a/src/runtime/cpu/operators/CpuCast.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_CAST_H
-#define ARM_COMPUTE_CPU_CAST_H
-
-#include "src/runtime/cpu/ICpuOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Basic function to run @ref kernels::CpuCastKernel */
-class CpuCast : public ICpuOperator
-{
-public:
-    /** Configure operator for a given list of arguments
-     *
-     * Input data type must be different than output data type.
-     *
-     * Valid data layouts:
-     * - All
-     *
-     * Valid data type configurations:
-     * |src            |dst                                             |
-     * |:--------------|:-----------------------------------------------|
-     * |QASYMM8_SIGNED | S16, S32, F32, F16                             |
-     * |QASYMM8        | U16, S16, S32, F32, F16                        |
-     * |U8             | U16, S16, S32, F32, F16                        |
-     * |U16            | U8, U32                                        |
-     * |S16            | QASYMM8_SIGNED, U8, S32                        |
-     * |F16            | QASYMM8_SIGNED, QASYMM8, F32, S32, U8          |
-     * |S32            | QASYMM8_SIGNED, QASYMM8, F16, F32, U8          |
-     * |F32            | QASYMM8_SIGNED, QASYMM8, BFLOAT16, F16, S32, U8|
-     *
-     * @param[in]  src    The source tensor to convert. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
-     * @param[out] dst    The destination tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
-     * @param[in]  policy Conversion policy.
-     */
-    void configure(const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy);
-    /** Static function to check if given info will lead to a valid configuration
-     *
-     * Similar to @ref CpuCast::configure()
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy);
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_CAST_H */
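A sketch of CpuCast (not part of the patch), picking the F32 -> S32 row from the table above; shapes are illustrative and SATURATE clamps out-of-range values:

TensorInfo f32_in(TensorShape(4U, 4U), 1, DataType::F32);
TensorInfo s32_out(TensorShape(4U, 4U), 1, DataType::S32);

cpu::CpuCast cast_op;
ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuCast::validate(&f32_in, &s32_out, ConvertPolicy::SATURATE));
cast_op.configure(&f32_in, &s32_out, ConvertPolicy::SATURATE);
// Bind buffers and run via an ITensorPack exactly as in the sketches above.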
diff --git a/src/runtime/cpu/operators/CpuConcatenate.cpp b/src/runtime/cpu/operators/CpuConcatenate.cpp
deleted file mode 100644
index bb475b790e..0000000000
--- a/src/runtime/cpu/operators/CpuConcatenate.cpp
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuConcatenate.h"
-
-#include "src/core/cpu/kernels/CpuConcatenateBatchKernel.h"
-#include "src/core/cpu/kernels/CpuConcatenateDepthKernel.h"
-#include "src/core/cpu/kernels/CpuConcatenateHeightKernel.h"
-#include "src/core/cpu/kernels/CpuConcatenateWidthKernel.h"
-
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void CpuConcatenate::configure(const std::vector<const ITensorInfo *> &srcs_vector, ITensorInfo *dst, size_t axis)
-{
-    ARM_COMPUTE_ERROR_ON(dst == nullptr);
-
-    _axis     = axis;
-    _num_srcs = srcs_vector.size();
-
-    TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(srcs_vector, axis);
-
-    // Output auto initialization if not yet initialized
-    auto_init_if_empty(*dst, dst_shape, 1, srcs_vector[0]->data_type());
-    ARM_COMPUTE_ERROR_THROW_ON(CpuConcatenate::validate(srcs_vector, dst, axis));
-
-    unsigned int offset = 0;
-
-    for(unsigned int i = 0; i < _num_srcs; ++i)
-    {
-        switch(axis)
-        {
-            case Window::DimX:
-            {
-                auto kernel = std::make_unique<kernels::CpuConcatenateWidthKernel>();
-                kernel->configure(srcs_vector.at(i), offset, dst);
-                _concat_kernels.emplace_back(std::move(kernel));
-                break;
-            }
-            case Window::DimY:
-            {
-                auto kernel = std::make_unique<kernels::CpuConcatenateHeightKernel>();
-                kernel->configure(srcs_vector.at(i), offset, dst);
-                _concat_kernels.emplace_back(std::move(kernel));
-                break;
-            }
-            case Window::DimZ:
-            {
-                auto kernel = std::make_unique<kernels::CpuConcatenateDepthKernel>();
-                kernel->configure(srcs_vector.at(i), offset, dst);
-                _concat_kernels.emplace_back(std::move(kernel));
-                break;
-            }
-            case 3:
-            {
-                auto kernel = std::make_unique<kernels::CpuConcatenateBatchKernel>();
-                kernel->configure(srcs_vector.at(i), offset, dst);
-                _concat_kernels.emplace_back(std::move(kernel));
-                break;
-            }
-            default:
-                ARM_COMPUTE_ERROR("Axis not supported");
-        }
-        offset += srcs_vector.at(i)->dimension(axis);
-    }
-}
-
-Status CpuConcatenate::validate(const std::vector<const ITensorInfo *> &srcs_vector, const ITensorInfo *dst, size_t axis)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst);
-    ARM_COMPUTE_RETURN_ERROR_ON(srcs_vector.size() < 2);
-
-    unsigned int offset = 0;
-    for(const auto &src : srcs_vector)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
-        switch(axis)
-        {
-            case Window::DimX:
-            {
-                ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConcatenateWidthKernel::validate(src, offset, dst));
-                break;
-            }
-            case Window::DimY:
-            {
-                ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConcatenateHeightKernel::validate(src, offset, dst));
-                break;
-            }
-            case Window::DimZ:
-            {
-                ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConcatenateDepthKernel::validate(src, offset, dst));
-                break;
-            }
-            case 3:
-            {
-                ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConcatenateBatchKernel::validate(src, offset, dst));
-                break;
-            }
-            default:
-                ARM_COMPUTE_ERROR("Axis not supported");
-        }
-        offset += src->dimension(axis);
-    }
-
-    if(dst->total_size() != 0)
-    {
-        TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(srcs_vector, axis);
-        ARM_COMPUTE_RETURN_ERROR_ON(dst_shape.total_size() != dst->tensor_shape().total_size());
-    }
-
-    return Status{};
-}
-
-void CpuConcatenate::run(ITensorPack &tensors)
-{
-    if(tensors.empty())
-    {
-        ARM_COMPUTE_ERROR("No inputs provided");
-    }
-
-    if(static_cast<int>(tensors.size() - 1) != static_cast<int>(_num_srcs))
-    {
-        ARM_COMPUTE_ERROR("Configured with different number of inputs");
-    }
-
-    int i = 0;
-    for(auto &k : _concat_kernels)
-    {
-        ITensorPack pack;
-        pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(ACL_SRC_VEC + i));
-        pack.add_tensor(TensorType::ACL_DST, tensors.get_tensor(ACL_DST));
-        NEScheduler::get().schedule_op(k.get(), Window::DimY, k->window(), pack);
-        ++i;
-    }
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuConcatenate.h b/src/runtime/cpu/operators/CpuConcatenate.h
deleted file mode 100644
index 55eab54996..0000000000
--- a/src/runtime/cpu/operators/CpuConcatenate.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_CONCATENATE_H
-#define ARM_COMPUTE_CPU_CONCATENATE_H
-
-#include "src/core/cpu/ICpuKernel.h"
-#include "src/runtime/cpu/ICpuOperator.h"
-
-#include <vector>
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Basic function to execute concatenate tensors along a given axis. This function calls the following kernels:
- *
- * -# @ref kernels::CpuConcatenateWidthKernel  (if underlying concatenation axis is 0).
- * -# @ref kernels::CpuConcatenateHeightKernel (if underlying concatenation axis is 1).
- * -# @ref kernels::CpuConcatenateDepthKernel  (if underlying concatenation axis is 2).
- * -# @ref kernels::CpuConcatenateBatchKernel  (if underlying concatenation axis is 3).
- */
-class CpuConcatenate : public ICpuOperator
-{
-public:
-    CpuConcatenate() = default;
-    /** Configure operator for a given list of arguments
-     *
-     * @note Input and output tensor dimensions preconditions differ depending on the concatenation axis.
-     * @note Preconditions can be found respectively at @ref kernels::CpuConcatenateWidthKernel, @ref kernels::CpuConcatenateHeightKernel,
-     *       @ref kernels::CpuConcatenateDepthKernel and @ref kernels::CpuConcatenateBatchKernel.
-     *
-     * @param[in,out] srcs_vector The vectors containing all the tensors to concatenate. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[out]    dst         Output tensor. Data types supported: Same as @p srcs_vector.
-     * @param[in]     axis        Concatenation axis. Supported underlying concatenation axis are 0, 1, 2 and 3.
-     */
-    void configure(const std::vector<const ITensorInfo *> &srcs_vector, ITensorInfo *dst, size_t axis);
-    /** Static function to check if given info will lead to a valid configuration
-     *
-     * Similar to @ref CpuConcatenate::configure()
-     *
-     * @return a status
-     */
-    static Status validate(const std::vector<const ITensorInfo *> &srcs_vector, const ITensorInfo *dst, size_t axis);
-
-    // Inherited methods overridden:
-    void run(ITensorPack &tensors) override;
-
-private:
-    std::vector<std::unique_ptr<ICpuKernel>> _concat_kernels{};
-    unsigned int                             _num_srcs{ 0 };
-    unsigned int                             _axis{ 0 };
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_CONCATENATE_H */
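CpuConcatenate::run() above looks its sources up at ACL_SRC_VEC + i, so callers build the pack with offset ids. A sketch (not part of the patch) for a two-tensor width-axis concatenation; t0/t1/tout are assumed to be already-allocated tensors matching the infos:

void concat_sketch(ITensor *t0, ITensor *t1, ITensor *tout,
                   const ITensorInfo *i0, const ITensorInfo *i1, ITensorInfo *iout)
{
    cpu::CpuConcatenate concat;
    concat.configure({ i0, i1 }, iout, 0); // axis 0 dispatches CpuConcatenateWidthKernel

    ITensorPack pack;
    pack.add_tensor(ACL_SRC_VEC + 0, t0); // run() reads sources at ACL_SRC_VEC + i
    pack.add_tensor(ACL_SRC_VEC + 1, t1);
    pack.add_tensor(ACL_DST, tout);
    concat.run(pack);
}

If iout is left uninitialized, configure() auto-initializes it from the computed concatenate shape, as the implementation above shows.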
diff --git a/src/runtime/cpu/operators/CpuConv2d.cpp b/src/runtime/cpu/operators/CpuConv2d.cpp
deleted file mode 100644
index cff9238308..0000000000
--- a/src/runtime/cpu/operators/CpuConv2d.cpp
+++ /dev/null
@@ -1,253 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuConv2d.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h"
-#include "src/runtime/cpu/operators/CpuDirectConv2d.h"
-#include "src/runtime/cpu/operators/CpuGemm.h"
-#include "src/runtime/cpu/operators/CpuGemmConv2d.h"
-#include "src/runtime/cpu/operators/CpuGemmDirectConv2d.h"
-#include "src/runtime/cpu/operators/CpuWinogradConv2d.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-CpuConv2d::CpuConv2d()
-    : _function()
-{
-}
-
-CpuConv2d::~CpuConv2d() = default;
-
-void CpuConv2d::configure(ITensorInfo *input, ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
-                          const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
-{
-    // Perform validate step
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_UNUSED(num_groups);
-    ARM_COMPUTE_ERROR_THROW_ON(CpuConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info,
-                                                   enable_fast_math, num_groups));
-
-    const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups);
-    switch(CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, enable_fast_math))
-    {
-        case ConvolutionMethod::WINOGRAD:
-        {
-            auto f = std::make_unique<CpuWinogradConv2d>();
-            f->configure(input, weights, biases, output, conv_info, act_info, enable_fast_math);
-            _function = std::move(f);
-            break;
-        }
-        case ConvolutionMethod::GEMM:
-        {
-            auto f = std::make_unique<CpuGemmConv2d>();
-            f->configure(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math);
-            _function = std::move(f);
-            break;
-        }
-        case ConvolutionMethod::GEMM_CONV2D:
-        {
-            auto f = std::make_unique<CpuGemmDirectConv2d>();
-            f->configure(input, weights, biases, output, info);
-            _function = std::move(f);
-            break;
-        }
-        case ConvolutionMethod::DIRECT:
-        {
-            auto f = std::make_unique<CpuDirectConv2d>();
-            f->configure(input, weights, biases, output, conv_info, act_info);
-            _function = std::move(f);
-            break;
-        }
-        default:
-            ARM_COMPUTE_ERROR("Not supported.");
-            break;
-    }
-
-    _aux_mem = _function->workspace();
-}
-
-Status CpuConv2d::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                           const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1), "Grouping (num_groups != 1) is not supported on Neon");
-
-    const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups);
-    switch(CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, enable_fast_math))
-    {
-        case ConvolutionMethod::WINOGRAD:
-            ARM_COMPUTE_RETURN_ON_ERROR(CpuWinogradConv2d::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math));
-            break;
-        case ConvolutionMethod::GEMM:
-            ARM_COMPUTE_RETURN_ON_ERROR(CpuGemmConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math));
-            break;
-        case ConvolutionMethod::GEMM_CONV2D:
-            ARM_COMPUTE_RETURN_ON_ERROR(CpuGemmDirectConv2d::validate(input, weights, biases, output, info));
-            break;
-        case ConvolutionMethod::DIRECT:
-            ARM_COMPUTE_RETURN_ON_ERROR(CpuDirectConv2d::validate(input, weights, biases, output, conv_info, act_info));
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Not supported.");
-            break;
-    }
-
-    return Status{};
-}
-
-ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights,
-                                                    const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                                    const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, weights);
-    ARM_COMPUTE_UNUSED(weights_info);
-
-    const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
-    const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
-    const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
-
-    const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, 1);
-
-    /* Input spatial dims, kernel size, IFM/OFM, conv info*/
-    using ConvolutionConfiguration = std::tuple<Size2D, Size2D, Size2D, PadStrideInfo>;
-    using ConfigurationMethod      = std::pair<ConvolutionConfiguration, ConvolutionMethod>;
-
-    const std::vector<ConfigurationMethod> known_configs =
-    {
-        // Alexnet
-        ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U), PadStrideInfo(1U, 1U, 2U, 2U)), ConvolutionMethod::GEMM),
-        // VGG16 / VGG19
-        ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U), PadStrideInfo(1U, 1U, 1U, 1U)), ConvolutionMethod::GEMM),
-        // Mobilenet 224
-        ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)), ConvolutionMethod::GEMM),
-        // Mobilenet 160
-        ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)), ConvolutionMethod::GEMM)
-    };
-
-    const auto find_config = [&](ConfigurationMethod c)
-    {
-        const ConvolutionConfiguration config = c.first;
-        const PadStrideInfo            info   = std::get<3>(config);
-
-        return std::get<0>(config) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) && std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h))
-               && std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right()
-               && info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && info.stride() == conv_info.stride();
-    };
-
-    std::vector<ConfigurationMethod>::const_iterator found;
-    if((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end())
-    {
-        return (*found).second;
-    }
-
-    if(dilation != Size2D(1U, 1U))
-    {
-        return ConvolutionMethod::GEMM;
-    }
-    else
-    {
-        // SRGAN
-        // Output might not be initialized when it is an internal tensor of the layer using the convolution
-        if(input->total_size() > 1e7 && (weights->dimension(idx_h) > 7)
-           && (CpuDirectConv2d::validate(input, weights, nullptr, output, conv_info, act_info)))
-        {
-            return ConvolutionMethod::DIRECT;
-        }
-        if((weights->dimension(idx_h) > 7) && (input->dimension(idx_c) > output->dimension(idx_c)) && (NEFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info)))
-        {
-            return ConvolutionMethod::FFT;
-        }
-        if(input->dimension(idx_c) < 16)
-        {
-            return ConvolutionMethod::GEMM;
-        }
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        // This heuristics only applies to F16 data type on A55r1
-        if(NEScheduler::get().cpu_info().get_cpu_model() == CPUModel::A55r1 && enable_fast_math && input->data_type() == DataType::F16)
-        {
-            // Exclude known bad winograd configs (and defaults to GEMM)
-            const std::vector<ConvolutionConfiguration> known_bad_winograd_f16_with_fastmath_configs =
-            {
-                // Squeezenet_V1_1 fire2 and fire3
-                ConvolutionConfiguration(Size2D(56U, 56U), Size2D(3U, 3U), Size2D(16U, 64U), PadStrideInfo(1U, 1U, 1U, 1U)),
-                // Squeezenet_V1_1 fire6 and fire7
-                ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(48U, 192U), PadStrideInfo(1U, 1U, 1U, 1U)),
-                // Squeezenet_V1_1 fire8 and fire9
-                ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(64U, 256U), PadStrideInfo(1U, 1U, 1U, 1U)),
-            };
-            const auto find_conv_config = [&](ConvolutionConfiguration c)
-            {
-                const PadStrideInfo info = std::get<3>(c);
-
-                return std::get<0>(c) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) && std::get<1>(c) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h))
-                       && std::get<2>(c) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right()
-                       && info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && info.stride() == conv_info.stride();
-            };
-
-            bool found_bad = std::find_if(known_bad_winograd_f16_with_fastmath_configs.begin(), known_bad_winograd_f16_with_fastmath_configs.end(),
-                                          find_conv_config)
-                             != known_bad_winograd_f16_with_fastmath_configs.end();
-            if(found_bad)
-            {
-                return ConvolutionMethod::GEMM;
-            }
-        }
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        // For 1x1 convolutions run the default GEMM
-        if(weights->dimension(idx_w) == 1 && weights->dimension(idx_h) == 1)
-        {
-            return ConvolutionMethod::GEMM;
-        }
-
-        if(bool(CpuWinogradConv2d::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)))
-        {
-            return ConvolutionMethod::WINOGRAD;
-        }
-        if(bool(CpuGemmDirectConv2d::validate(input, weights, nullptr, output, info)))
-        {
-            return ConvolutionMethod::GEMM_CONV2D;
-        }
-        return ConvolutionMethod::GEMM;
-    }
-}
-
-void CpuConv2d::run(ITensorPack &tensors)
-{
-    prepare(tensors);
-    _function->run(tensors);
-}
-
-void CpuConv2d::prepare(ITensorPack &tensors)
-{
-    _function->prepare(tensors);
-}
-
-experimental::MemoryRequirements CpuConv2d::workspace() const
-{
-    return _aux_mem;
-}
-} // namespace cpu
-} // namespace arm_compute
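The heuristic above can also be queried in isolation, which is exactly how configure()/validate() dispatch. A sketch (not part of the patch; shapes illustrative, NCHW layout assumed as the TensorInfo default): a 1x1 kernel with 64 input channels falls through every special case and, per the "1x1 convolutions run the default GEMM" branch, should resolve to GEMM.

TensorInfo src(TensorShape(56U, 56U, 64U), 1, DataType::F32);     // W, H, C (NCHW)
TensorInfo weights(TensorShape(1U, 1U, 64U, 128U), 1, DataType::F32);
TensorInfo dst(TensorShape(56U, 56U, 128U), 1, DataType::F32);

const ConvolutionMethod method =
    cpu::CpuConv2d::get_convolution_method(&src, &weights, &dst, PadStrideInfo(1, 1, 0, 0));
// Expected here: ConvolutionMethod::GEMM for this 1x1 configuration.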
diff --git a/src/runtime/cpu/operators/CpuConv2d.h b/src/runtime/cpu/operators/CpuConv2d.h
deleted file mode 100644
index d7b42deea1..0000000000
--- a/src/runtime/cpu/operators/CpuConv2d.h
+++ /dev/null
@@ -1,146 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/common/Macros.h"
-#include "src/runtime/cpu/ICpuOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Basic function to simulate a convolution layer. This function calls one of the following functions:
- * -# @ref CpuGemm           (executed only in case GEMM is required for the operation)
- * -# @ref CpuWinogradConv2d (executed only in case Winograd is required for the operation)
- * -# @ref CpuDirectConv2d   (executed only in case Direct Convolution is required for the operation)
- *
- *
- * The function selects one of the algorithms mentioned above based on:
- *      - The size of the kernel
- *      - Number of input/output feature maps
- *      - Amount of memory needed
- *
- * Generally GEMM-based convolution is executed when neither Winograd nor FFT nor Direct convolution can be performed.
- *
- * FP32 Algorithm| Filter Size                                  | Input/Output feature maps                |
- * --------------|----------------------------------------------|------------------------------------------|
- * Winograd      | 3x3 1x3 3x1 5x1 1x5 5x5(fast maths) 7x1 1x7  | Input channels is greater than 3         |
- * FFT           | Squared kernels and greater than 9x9         | Input feature maps > Output feature maps |
- * DirectConv    | 9x9                                          |                                          |
- * GEMM          | Any size                                     |                                          |
- *
- * Winograd 5x5 requires fast maths enabled.
- *
- * FP16 Algorithm| Filter Size      |
- * --------------|------------------|
- * Winograd      | Not supported    |
- * FFT           | Not supported    |
- * DirectConv    | 9x9              |
- * GEMM          | Any size         |
- *
- *
- */
-class CpuConv2d : public ICpuOperator
-{
-public:
-    /** Constructor */
-    CpuConv2d();
-    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuConv2d);
-    /** Default destructor */
-    ~CpuConv2d();
-    /** Set the input and output tensors.
-     *
-     * Valid data layouts:
-     * - NHWC
-     * - NCHW
-     *
-     * Valid data type configurations:
-     * |src0           |src1               |src2   |dst            |
-     * |:--------------|:------------------|:------|:--------------|
-     * |F16            |F16                |F16    |F16            |
-     * |F32            |F32                |F32    |F32            |
-     * |QASYMM8        |QASYMM8            |S32    |QASYMM8        |
-     * |QASYMM8        |QSYMM8_PER_CHANNEL |S32    |QASYMM8        |
-     * |QASYMM8_SIGNED |QASYMM8_SIGNED     |S32    |QASYMM8_SIGNED |
-     * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32    |QASYMM8_SIGNED |
-     *
-     * @param[in]  src              Source tensor info. 3 lower dimensions represent a single input [width, height, IFM],
-     *                              while every optional dimension from 4 and above represent a batch of inputs.
-     *                              Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[in]  weights          Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
-     *                              Data type supported: Same as @p src, also could be QSYMM8_PER_CHANNEL if input is QASYMM8/QASYMM8_SIGNED.
-     * @param[in]  biases           Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
-     *                              Data type supported: Same as @p src, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
-     * @param[out] dst              Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
-     *                              Data types supported: Same as @p src.
-     * @param[in]  conv_info        Contains padding and stride information described in @ref PadStrideInfo.
-     * @param[in]  weights_info     Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights
-     *                              tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p input.
-     * @param[in]  dilation         (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
-     * @param[in]  act_info         (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
-     * @param[in]  enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
-     *                              available which may introduce a drop of accuracy as well. Default is false
-     * @param[in]  num_groups       (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
-     */
-    void configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(),
-                   const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, unsigned int num_groups = 1);
-    /** Static function to check if given info will lead to a valid configuration of @ref CpuConv2d
-     *
-     * Similar to CpuConv2d::configure()
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                           const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false,
-                           unsigned int num_groups = 1);
-    /** Static function to check if given info will return the convolution called by @ref CpuConv2d
-     *
-     * @param[in]  src              Source tensor info. 3 lower dimensions represent a single input [width, height, IFM],
-     *                              while every optional dimension from 4 and above represent a batch of inputs.
-     *                              Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[in]  weights          Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
-     *                              Data type supported: Same as @p src, also could be QSYMM8_PER_CHANNEL if input is QASYMM8/QASYMM8_SIGNED.
-     * @param[in]  dst              Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
-     *                              Data types supported: Same as @p src.
-     * @param[in]  conv_info        Contains padding and stride information described in @ref PadStrideInfo.
-     * @param[in]  weights_info     Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights
-     *                              tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p input.
-     * @param[in]  dilation         (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
-     * @param[in]  act_info         (Optional) Activation layer information in case of a fused activation.
-     * @param[in]  enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
-     *                              available which may introduce a drop of accuracy as well. Default is false
-     *
-     * @return the Convolution Method Hint
-     */
-    static ConvolutionMethod get_convolution_method(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info,
-                                                    const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
-    // Inherited methods overridden:
-    void run(ITensorPack &tensors) override;
-    void prepare(ITensorPack &constants) override;
-    experimental::MemoryRequirements workspace() const override;
-
-private:
-    std::unique_ptr<ICpuOperator>    _function;
-    experimental::MemoryRequirements _aux_mem{};
-};
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuConvertFullyConnectedWeights.cpp b/src/runtime/cpu/operators/CpuConvertFullyConnectedWeights.cpp
deleted file mode 100644
index 3f2f4e95cf..0000000000
--- a/src/runtime/cpu/operators/CpuConvertFullyConnectedWeights.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuConvertFullyConnectedWeights.h"
-
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void CpuConvertFullyConnectedWeights::configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout)
-{
-    auto k = std::make_unique<kernels::CpuConvertFullyConnectedWeightsKernel>();
-    k->configure(src, dst, original_src_shape, data_layout);
-    _kernel = std::move(k);
-}
-
-Status CpuConvertFullyConnectedWeights::validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout)
-{
-    return kernels::CpuConvertFullyConnectedWeightsKernel::validate(src, dst, original_src_shape, data_layout);
-}
-
-void CpuConvertFullyConnectedWeights::run(ITensorPack &tensors)
-{
-    NEScheduler::get().schedule_op(_kernel.get(), Window::DimZ, _kernel->window(), tensors);
-}
-} // namespace cpu
-} // namespace arm_compute
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_CONVERT_FULLYCONNECTED_WEIGHTS_H
-#define ARM_COMPUTE_CPU_CONVERT_FULLYCONNECTED_WEIGHTS_H
-
-#include "src/runtime/cpu/ICpuOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Basic function to run @ref kernels::CpuConvertFullyConnectedWeightsKernel */
-class CpuConvertFullyConnectedWeights : public ICpuOperator
-{
-public:
-    /** Configure operator for a given list of arguments
-     *
-     * @param[in]  src                Source tensor to permute. Data types supported: All
-     * @param[out] dst                Destination tensor. Data types supported: Same as @p src
-     * @param[in]  original_src_shape Shape of the original src tensor (the one entering the fully connected layer).
-     * @param[in]  data_layout        The data layout the weights have been trained in.
-     */
-    void configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout);
-    /** Static function to check if given info will lead to a valid configuration
-     *
-     * Similar to @ref CpuConvertFullyConnectedWeights::configure()
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout);
-    // Inherited methods overridden:
-    void run(ITensorPack &tensors) override;
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_CONVERT_FULLYCONNECTED_WEIGHTS_H */
diff --git a/src/runtime/cpu/operators/CpuCopy.cpp b/src/runtime/cpu/operators/CpuCopy.cpp
deleted file mode 100644
index 9fbe916163..0000000000
--- a/src/runtime/cpu/operators/CpuCopy.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuCopy.h"
-
-#include "src/core/cpu/kernels/CpuCopyKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void CpuCopy::configure(const ITensorInfo *src, ITensorInfo *dst)
-{
-    auto k = std::make_unique<kernels::CpuCopyKernel>();
-    k->configure(src, dst);
-    _kernel = std::move(k);
-}
-
-Status CpuCopy::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
-    return kernels::CpuCopyKernel::validate(src, dst);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuCopy.h b/src/runtime/cpu/operators/CpuCopy.h
deleted file mode 100644
index 861bbb7849..0000000000
--- a/src/runtime/cpu/operators/CpuCopy.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_COPY_H
-#define ARM_COMPUTE_CPU_COPY_H
-
-#include "src/runtime/cpu/ICpuOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Basic function to run @ref kernels::CpuCopyKernel */
-class CpuCopy : public ICpuOperator
-{
-public:
-    /** Configure operator for a given list of arguments
-     *
-     * @param[in]  src Source tensor info. Data type supported: All
-     * @param[out] dst Destination tensor info. Data type supported: Same as @p src
-     */
-    void configure(const ITensorInfo *src, ITensorInfo *dst);
-    /** Static function to check if given info will lead to a valid configuration
-     *
-     * Similar to @ref CpuCopy::configure()
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_COPY_H */
diff --git a/src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp b/src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp
deleted file mode 100644
index 8141487125..0000000000
--- a/src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp
+++ /dev/null
@@ -1,498 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/cpu/operators/CpuDepthwiseConv2d.h" - -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/InfoHelpers.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace -{ -Status validate_arguments_optimized(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - if(!is_data_type_quantized_per_channel(weights->data_type())) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); - } - ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON(info.dilation.x() < 1 || info.dilation.y() < 1); - const size_t idx_w = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH); - const size_t idx_h = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (info.dilation.x() - 1) > src->dimension(idx_w) + info.pad_stride_info.pad_left() + - info.pad_stride_info.pad_right()); - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (info.dilation.y() - 1) > src->dimension(idx_h) + info.pad_stride_info.pad_top() + - info.pad_stride_info.pad_bottom()); - - if(biases != nullptr) - { - const unsigned int channel_idx = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL); - ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(channel_idx)); - } - - ARM_COMPUTE_RETURN_ON_ERROR(CpuDepthwiseConv2dAssemblyDispatch::validate(src, weights, biases, dst, info)); - - // Validate Activation Layer - if(info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info)) - { - ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, info.act_info)); - } - return Status{}; -} -} // namespace - 
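The validation helper above and the dispatch code that follows use the library's stateless-operator pattern: configure() and validate() see only ITensorInfo descriptors, and the real buffers arrive at run time inside an ITensorPack. The following is a minimal usage sketch of that pattern, not code from this patch: the tensor shapes and the ConvolutionInfo aggregate layout are illustrative assumptions, and for brevity the auxiliary ACL_INT_* workspace tensors that the optimized path requests are not bound here (a sketch of that step follows the assembly dispatch implementation further below).

// Hypothetical caller of CpuDepthwiseConv2d (assumed shapes; NHWC, F32).
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/Tensor.h"
#include "src/runtime/cpu/operators/CpuDepthwiseConv2d.h"

using namespace arm_compute;

void depthwise_example()
{
    // 32x32 input with 16 channels, 3x3 depthwise kernel, NHWC layout
    TensorInfo src_info(TensorShape(16U, 32U, 32U), 1, DataType::F32);
    TensorInfo wei_info(TensorShape(16U, 3U, 3U), 1, DataType::F32);
    TensorInfo bia_info(TensorShape(16U), 1, DataType::F32);
    TensorInfo dst_info(TensorShape(16U, 30U, 30U), 1, DataType::F32);
    src_info.set_data_layout(DataLayout::NHWC);
    wei_info.set_data_layout(DataLayout::NHWC);
    dst_info.set_data_layout(DataLayout::NHWC);

    // Assumed field order: pad_stride_info, depth_multiplier, act_info, dilation
    const ConvolutionInfo info{ PadStrideInfo(1, 1, 0, 0), 1, ActivationLayerInfo(), Size2D(1U, 1U) };

    // validate() first: configure() asserts on unsupported configurations
    if(!bool(cpu::CpuDepthwiseConv2d::validate(&src_info, &wei_info, &bia_info, &dst_info, info)))
    {
        return;
    }

    cpu::CpuDepthwiseConv2d dwc;
    dwc.configure(&src_info, &wei_info, &bia_info, &dst_info, info);

    // Buffers are bound at run time through a tensor pack
    Tensor src, wei, bia, dst;
    src.allocator()->init(src_info);
    wei.allocator()->init(wei_info);
    bia.allocator()->init(bia_info);
    dst.allocator()->init(dst_info);
    src.allocator()->allocate();
    wei.allocator()->allocate();
    bia.allocator()->allocate();
    dst.allocator()->allocate();

    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC_0, &src);
    pack.add_const_tensor(TensorType::ACL_SRC_1, &wei);
    pack.add_const_tensor(TensorType::ACL_SRC_2, &bia);
    pack.add_tensor(TensorType::ACL_DST_0, &dst);
    // NOTE: a complete caller must also add the ACL_INT_* tensors reported
    // by dwc.workspace() to the pack; omitted here for brevity.

    dwc.prepare(pack); // one-off weight packing/permutation
    dwc.run(pack);
}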
-void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::configure(ITensorInfo           *src,
-                                                                        const ITensorInfo     *weights,
-                                                                        const ITensorInfo     *biases,
-                                                                        ITensorInfo           *dst,
-                                                                        const ConvolutionInfo &info)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
-    // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, (biases == nullptr) ? nullptr : biases,
-                                                                             dst, info));
-
-    _is_quantized = is_data_type_quantized_asymmetric(src->data_type());
-    _has_bias     = biases != nullptr;
-    _is_nchw      = src->data_layout() == DataLayout::NCHW;
-    _permute      = _is_nchw;
-    _is_prepared  = false;
-
-    // Configure pipeline
-    _is_activationlayer_enabled = info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info);
-
-    _dwc_optimized_func = std::make_unique<CpuDepthwiseConv2dAssemblyDispatch>();
-    if(_is_nchw)
-    {
-        _permute_input   = std::make_unique<CpuPermute>();
-        _permute_weights = std::make_unique<CpuPermute>();
-        _permute_output  = std::make_unique<CpuPermute>();
-
-        auto input_perm   = std::make_unique<TensorInfo>();
-        auto weights_perm = std::make_unique<TensorInfo>();
-        auto output_perm  = std::make_unique<TensorInfo>();
-
-        // Configure the function to transform the input tensor from NCHW -> NHWC
-        _permute_input->configure(src, input_perm.get(), PermutationVector(2U, 0U, 1U));
-        input_perm->set_data_layout(DataLayout::NHWC);
-
-        // Configure the function to transform the weights tensor from IHW -> HWI
-        _permute_weights->configure(weights, weights_perm.get(), PermutationVector(2U, 0U, 1U));
-        weights_perm->set_data_layout(DataLayout::NHWC);
-
-        output_perm->set_data_layout(DataLayout::NHWC);
-        output_perm->set_quantization_info(dst->quantization_info());
-
-        // Configure optimized depthwise
-        _dwc_optimized_func->configure(input_perm.get(), weights_perm.get(), biases, output_perm.get(), info);
-
-        // Configure the function to transform the convolved output to ACL's native ordering format NCHW
-        output_perm->set_data_layout(DataLayout::NHWC);
-        _permute_output->configure(output_perm.get(), dst, PermutationVector(1U, 2U, 0U));
-    }
-    else
-    {
-        _dwc_optimized_func->configure(src, weights, biases, dst, info);
-    }
-
-    // Configure activation
-    if(_is_activationlayer_enabled)
-    {
-        _activationlayer_function = std::make_unique<CpuActivation>();
-        _activationlayer_function->configure(dst, nullptr, info.act_info);
-    }
-}
-
-Status CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::validate(const ITensorInfo     *src,
-                                                                         const ITensorInfo     *weights,
-                                                                         const ITensorInfo     *biases,
-                                                                         const ITensorInfo     *dst,
-                                                                         const ConvolutionInfo &info)
-{
-    return validate_arguments_optimized(src, weights, biases, dst, info);
-}
-
-void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::run(ITensorPack &tensors)
-{
-    ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
-    prepare(tensors);
-
-    auto bias           = tensors.get_const_tensor(TensorType::ACL_SRC_2);
-    auto dst            = tensors.get_tensor(TensorType::ACL_DST_0);
-    auto workspace      = tensors.get_tensor(TensorType::ACL_INT_3);
-    auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_4);
-
-    // Permute input
-    if(_permute)
-    {
-        ITensorPack pack;
-        auto        src      = tensors.get_const_tensor(TensorType::ACL_SRC_0);
-        auto        src_perm = tensors.get_tensor(TensorType::ACL_INT_0);
-        pack.add_tensor(TensorType::ACL_SRC, src);
-        pack.add_tensor(TensorType::ACL_DST, src_perm);
-        _permute_input->run(pack);
-    }
-
-    // Run assembly function
-    if(_is_nchw)
-    {
-        auto src_perm     = tensors.get_tensor(TensorType::ACL_INT_0);
-        auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1);
-        auto dst_perm     =
tensors.get_tensor(TensorType::ACL_INT_2); - - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC_0, src_perm); - pack.add_tensor(TensorType::ACL_SRC_1, weights_perm); - pack.add_tensor(TensorType::ACL_SRC_2, bias); - pack.add_tensor(TensorType::ACL_INT_0, workspace); - pack.add_tensor(TensorType::ACL_INT_1, packed_weights); - pack.add_tensor(TensorType::ACL_DST, dst_perm); - _dwc_optimized_func->run(pack); - } - else - { - auto src = tensors.get_tensor(TensorType::ACL_SRC_0); - auto weights = tensors.get_tensor(TensorType::ACL_SRC_1); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC_0, src); - pack.add_tensor(TensorType::ACL_SRC_1, weights); - pack.add_tensor(TensorType::ACL_SRC_2, bias); - pack.add_tensor(TensorType::ACL_INT_0, workspace); - pack.add_tensor(TensorType::ACL_INT_1, packed_weights); - pack.add_tensor(TensorType::ACL_DST, dst); - _dwc_optimized_func->run(pack); - } - - // Permute output - if(_is_nchw) - { - ITensorPack pack; - auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2); - pack.add_tensor(TensorType::ACL_SRC, dst_perm); - pack.add_tensor(TensorType::ACL_DST, dst); - _permute_output->run(pack); - } - - // Run activation - if(_is_activationlayer_enabled) - { - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC, dst); - pack.add_tensor(TensorType::ACL_DST, dst); - _activationlayer_function->run(pack); - } -} - -void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::prepare(ITensorPack &tensors) -{ - if(!_is_prepared) - { - auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); - auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2); - auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_4); - - // Permute weights - if(_permute) - { - auto permuted_weights = tensors.get_tensor(TensorType::ACL_INT_1); - - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC, weights); - pack.add_tensor(TensorType::ACL_DST, permuted_weights); - _permute_weights->run(pack); - - weights->mark_as_unused(); - - ITensorPack pack_opt; - pack_opt.add_const_tensor(TensorType::ACL_SRC_1, permuted_weights); - pack_opt.add_tensor(TensorType::ACL_SRC_2, bias); - pack_opt.add_tensor(TensorType::ACL_INT_1, packed_weights); - - // Prepare optimized function - _dwc_optimized_func->prepare(pack_opt); - } - else - { - ITensorPack pack_opt; - pack_opt.add_tensor(TensorType::ACL_SRC_1, weights); - pack_opt.add_tensor(TensorType::ACL_SRC_2, bias); - pack_opt.add_tensor(TensorType::ACL_INT_1, packed_weights); - - // Prepare optimized function - _dwc_optimized_func->prepare(pack_opt); - } - - _is_prepared = true; - } -} - -void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConv2d::validate(src, weights, (biases == nullptr) ? 
nullptr : biases,
-                                                       dst, info));
-
-    _is_nchw     = src->data_layout() == DataLayout::NCHW;
-    _is_prepared = !_is_nchw;
-
-    ITensorInfo       *input_to_use   = src;
-    const ITensorInfo *weights_to_use = weights;
-    ITensorInfo       *output_to_use  = dst;
-
-    auto input_perm   = std::make_unique<TensorInfo>();
-    auto weights_perm = std::make_unique<TensorInfo>();
-    auto output_perm  = std::make_unique<TensorInfo>(dst->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape()));
-
-    if(_is_nchw)
-    {
-        _permute_input   = std::make_unique<CpuPermute>();
-        _permute_weights = std::make_unique<CpuPermute>();
-
-        _permute_input->configure(src, input_perm.get(), PermutationVector(2U, 0U, 1U));
-        input_perm->set_data_layout(DataLayout::NHWC);
-        input_to_use = input_perm.get();
-
-        _permute_weights->configure(weights, weights_perm.get(), PermutationVector(2U, 0U, 1U));
-        weights_perm->set_data_layout(DataLayout::NHWC);
-        weights_to_use = weights_perm.get();
-
-        output_to_use = output_perm.get();
-    }
-
-    _depthwise_conv_kernel = std::make_unique<cpu::kernels::CpuDepthwiseConv2dNativeKernel>();
-    _depthwise_conv_kernel->configure(input_to_use, weights_to_use, biases, output_to_use, info);
-
-    if(_is_nchw)
-    {
-        _permute_output = std::make_unique<CpuPermute>();
-        _permute_output->configure(output_perm.get(), dst, PermutationVector(1U, 2U, 0U));
-        output_perm->set_data_layout(DataLayout::NHWC);
-    }
-
-    // Configure Activation Layer
-    _is_activationlayer_enabled = info.act_info.enabled();
-    if(_is_activationlayer_enabled)
-    {
-        _activationlayer_function = std::make_unique<CpuActivation>();
-        _activationlayer_function->configure(dst, nullptr, info.act_info);
-    }
-}
-
-Status CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
-                                                               const ConvolutionInfo &info)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
-    if(src->data_layout() == DataLayout::NCHW)
-    {
-        TensorShape permuted_input_shape   = src->tensor_shape();
-        TensorShape permuted_weights_shape = weights->tensor_shape();
-        TensorShape permuted_output_shape  = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info);
-        permute(permuted_input_shape, PermutationVector(2U, 0U, 1U));
-        permute(permuted_weights_shape, PermutationVector(2U, 0U, 1U));
-        permute(permuted_output_shape, PermutationVector(2U, 0U, 1U));
-
-        const TensorInfo permuted_input   = TensorInfo(src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NHWC));
-        const TensorInfo permuted_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NHWC));
-        const TensorInfo permuted_output  = TensorInfo(dst->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NCHW));
-
-        ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(src, &permuted_input, PermutationVector(2U, 0U, 1U)));
-        ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(weights, &permuted_weights, PermutationVector(2U, 0U, 1U)));
-        ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(&permuted_output, dst, PermutationVector(1U, 2U, 0U)));
-
-        ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, info));
-    }
-    else
-    {
-        ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate(src, weights, biases, dst, info));
-    }
-
-    // Validate Activation Layer
-    if(info.act_info.enabled() &&
!CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info)) - { - ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, info.act_info)); - } - - return Status{}; -} - -void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::run(ITensorPack &tensors) -{ - auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); - auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); - auto biases = tensors.get_const_tensor(TensorType::ACL_SRC_2); - auto dst = tensors.get_tensor(TensorType::ACL_DST_0); - - if(_is_nchw) - { - prepare(tensors); - auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0); - auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1); - auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2); - - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC, src); - pack.add_tensor(TensorType::ACL_DST, src_perm); - _permute_input->run(pack); - - ITensorPack pack_depth; - pack_depth.add_const_tensor(TensorType::ACL_SRC_0, src_perm); - pack_depth.add_const_tensor(TensorType::ACL_SRC_1, weights_perm); - pack_depth.add_tensor(TensorType::ACL_SRC_2, biases); - pack_depth.add_tensor(TensorType::ACL_DST, dst_perm); - NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), pack_depth); - } - else - { - ITensorPack pack_depth; - pack_depth.add_tensor(TensorType::ACL_SRC_0, src); - pack_depth.add_tensor(TensorType::ACL_SRC_1, weights); - pack_depth.add_tensor(TensorType::ACL_SRC_2, biases); - pack_depth.add_tensor(TensorType::ACL_DST, dst); - NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), pack_depth); - } - - if(_is_nchw) - { - ITensorPack pack; - auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2); - pack.add_tensor(TensorType::ACL_SRC, dst_perm); - pack.add_tensor(TensorType::ACL_DST, dst); - _permute_output->run(pack); - } - - if(_is_activationlayer_enabled) - { - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC, dst); - pack.add_tensor(TensorType::ACL_DST, dst); - _activationlayer_function->run(pack); - } -} - -void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::prepare(ITensorPack &tensors) -{ - if(!_is_prepared) - { - auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); - auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1); - - ARM_COMPUTE_ERROR_ON(!weights->is_used()); - - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC, weights); - pack.add_tensor(TensorType::ACL_DST, weights_perm); - - _permute_weights->run(pack); - weights->mark_as_unused(); - _is_prepared = true; - } -} - -void CpuDepthwiseConv2d::configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info) -{ - _depth_conv_func = get_depthwiseconvolution_function(src, weights, (biases != nullptr) ? 
biases : nullptr, dst, info); - switch(_depth_conv_func) - { - case DepthwiseConvolutionFunction::OPTIMIZED: - _func_optimized.configure(src, weights, biases, dst, info); - break; - case DepthwiseConvolutionFunction::GENERIC: - _func_generic.configure(src, weights, biases, dst, info); - break; - default: - ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction"); - } -} - -Status CpuDepthwiseConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info) -{ - DepthwiseConvolutionFunction depth_conv_func = get_depthwiseconvolution_function(src, weights, biases, dst, info); - switch(depth_conv_func) - { - case DepthwiseConvolutionFunction::OPTIMIZED: - return CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, biases, dst, info); - break; - case DepthwiseConvolutionFunction::GENERIC: - return CpuDepthwiseConv2dGeneric::validate(src, weights, biases, dst, info); - break; - default: - ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction"); - } -} - -DepthwiseConvolutionFunction CpuDepthwiseConv2d::get_depthwiseconvolution_function(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const ConvolutionInfo &info) -{ - if(bool(CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, biases, dst, info))) - { - return DepthwiseConvolutionFunction::OPTIMIZED; - } - else - { - return DepthwiseConvolutionFunction::GENERIC; - } -} - -void CpuDepthwiseConv2d::run(ITensorPack &tensors) -{ - switch(_depth_conv_func) - { - case DepthwiseConvolutionFunction::OPTIMIZED: - _func_optimized.run(tensors); - break; - case DepthwiseConvolutionFunction::GENERIC: - _func_generic.run(tensors); - break; - default: - ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured"); - } -} - -void CpuDepthwiseConv2d::prepare(ITensorPack &tensors) -{ - switch(_depth_conv_func) - { - case DepthwiseConvolutionFunction::OPTIMIZED: - _func_optimized.prepare(tensors); - break; - case DepthwiseConvolutionFunction::GENERIC: - _func_generic.prepare(tensors); - break; - default: - ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured"); - } -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuDepthwiseConv2d.h b/src/runtime/cpu/operators/CpuDepthwiseConv2d.h deleted file mode 100644 index dd4839b28a..0000000000 --- a/src/runtime/cpu/operators/CpuDepthwiseConv2d.h +++ /dev/null @@ -1,209 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_H -#define ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_H - -#include "arm_compute/core/ITensorInfo.h" -#include "arm_compute/core/experimental/Types.h" -#include "src/core/cpu/ICpuKernel.h" -#include "src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h" -#include "src/runtime/cpu/ICpuOperator.h" -#include "src/runtime/cpu/operators/CpuActivation.h" -#include "src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h" -#include "src/runtime/cpu/operators/CpuPermute.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -/** Function to execute a depthwise convolution. - */ -class CpuDepthwiseConv2d : public ICpuOperator -{ -public: - /** Default constructor */ - CpuDepthwiseConv2d() = default; - /** Initialize the function's source, destination, weights and convolution information. - * - * @param[in, out] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32 - * @param[out] dst Destination tensor info. Data type supported: same as @p src. - * @param[in] weights Weights tensor info. These are 3D tensor infos with shape [kernel_x, kernel_y, IFM]. - * Data type supported: Same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED. - * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED. - * @param[in] info Depthwise convolution meta-data. - */ - void configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuDepthwiseConv2d::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info); - /** Static function to choose the best depthwise convolution function for @ref CpuDepthwiseConv2d - * - * @param[in] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32 - * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. - * Data type supported: Same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED. - * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED. - * @param[in] dst Destination tensor. Data type supported: same as @p src. - * @param[in] info Depthwise convolution meta-data. - * - * @return a Depthwise Convolution Function - */ - static DepthwiseConvolutionFunction get_depthwiseconvolution_function(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const ConvolutionInfo &info); - - // Inherited methods overriden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; - -private: - /** Basic function to execute optimized depthwise convolution routines. 
This function calls the following kernels: - * - * @note At the moment 3x3 and 5x5 convolution of stride 1, 2 are supported - * - * -# @ref NEFillBorderKernel (if pad_x or pad_y > 0) and no assembly kernel implementation is present - * -# @ref CpuDepthwiseConv2d3x3Kernel if 3x3 and no assembly kernel implementation is present - * -# @ref CpuDepthwiseConv2dAssemblyDispatch if assembly kernel implementation is present - * -# @ref CpuActivation if fused activation is required - * - */ - class CpuDepthwiseConv2dOptimizedInternal : public ICpuOperator - { - public: - /** Default constructor */ - CpuDepthwiseConv2dOptimizedInternal() = default; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CpuDepthwiseConv2dOptimizedInternal(const CpuDepthwiseConv2dOptimizedInternal &) = delete; - /** Default move constructor */ - CpuDepthwiseConv2dOptimizedInternal(CpuDepthwiseConv2dOptimizedInternal &&) = default; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CpuDepthwiseConv2dOptimizedInternal &operator=(const CpuDepthwiseConv2dOptimizedInternal &) = delete; - /** Default move assignment operator */ - CpuDepthwiseConv2dOptimizedInternal &operator=(CpuDepthwiseConv2dOptimizedInternal &&) = default; - /** Default destructor */ - ~CpuDepthwiseConv2dOptimizedInternal() = default; - /** Initialize the function's source, destination, kernels and border_size. - * - * @param[in, out] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling). - * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p src. - * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED. - * @param[out] dst Destination tensor info. Data type supported: same as @p src. - * @param[in] info Depthwise convolution meta-data. - */ - void configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuDepthwiseConv2dOptimizedInternal::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info); - - // Inherited methods overriden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; - - private: - std::unique_ptr _dwc_optimized_func{ nullptr }; - std::unique_ptr _permute_input{ nullptr }; - std::unique_ptr _permute_weights{ nullptr }; - std::unique_ptr _permute_output{ nullptr }; - std::unique_ptr _activationlayer_function{ nullptr }; - bool _has_bias{ false }; - bool _is_quantized{ false }; - bool _is_nchw{ true }; - bool _permute{ false }; - bool _is_activationlayer_enabled{ false }; - bool _is_prepared{ false }; - }; - - /** Basic function to execute a generic depthwise convolution. 
This function calls the following kernel: - * - * -# @ref CpuDepthwiseConv2dNativeKernel - * - */ - class CpuDepthwiseConv2dGeneric : public ICpuOperator - { - public: - /** Default constructor */ - CpuDepthwiseConv2dGeneric() = default; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CpuDepthwiseConv2dGeneric(const CpuDepthwiseConv2dGeneric &) = delete; - /** Default move constructor */ - CpuDepthwiseConv2dGeneric(CpuDepthwiseConv2dGeneric &&) = default; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CpuDepthwiseConv2dGeneric &operator=(const CpuDepthwiseConv2dGeneric &) = delete; - /** Default move assignment operator */ - CpuDepthwiseConv2dGeneric &operator=(CpuDepthwiseConv2dGeneric &&) = default; - /** Default destructor */ - ~CpuDepthwiseConv2dGeneric() = default; - /** Initialize the function's source, destination, weights and convolution information. - * - * @param[in, out] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling). - * @param[out] dst Destination tensor info. Data type supported: same as @p src. - * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. - * Data type supported: Same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED. - * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED. - * @param[in] info Depthwise convolution meta-data. - */ - void configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info); - - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuDepthwiseConv2dGeneric::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info); - - // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; - - private: - std::unique_ptr _depthwise_conv_kernel{ nullptr }; - std::unique_ptr _permute_input{ nullptr }; - std::unique_ptr _permute_weights{ nullptr }; - std::unique_ptr _permute_output{ nullptr }; - std::unique_ptr _activationlayer_function{ nullptr }; - bool _is_nchw{ true }; - bool _is_prepared{ false }; - bool _is_activationlayer_enabled{ false }; - }; - - DepthwiseConvolutionFunction _depth_conv_func{ DepthwiseConvolutionFunction::GENERIC }; - CpuDepthwiseConv2dOptimizedInternal _func_optimized{}; - CpuDepthwiseConv2dGeneric _func_generic{}; -}; -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_H */ diff --git a/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp b/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp deleted file mode 100644 index 660ac0163c..0000000000 --- a/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. 
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h"
-
-#include "arm_compute/core/ITensorInfo.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/utils/AssemblyUtils.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-struct CpuDepthwiseConv2dAssemblyDispatch::LocalImpl
-{
-    std::unique_ptr<kernels::CpuDepthwiseConv2dAssemblyWrapperKernel> asm_kernel{ nullptr };
-    bool                                                              is_prepared{ false };
-    experimental::MemoryRequirements                                  mem_req{};
-};
-
-#ifndef DOXYGEN_SKIP_THIS
-CpuDepthwiseConv2dAssemblyDispatch::CpuDepthwiseConv2dAssemblyDispatch()
-    : _pImpl(std::make_unique<LocalImpl>())
-{
-}
-#endif /* DOXYGEN_SKIP_THIS */
-
-CpuDepthwiseConv2dAssemblyDispatch::~CpuDepthwiseConv2dAssemblyDispatch() = default;
-
-void CpuDepthwiseConv2dAssemblyDispatch::configure(const ITensorInfo     *src,
-                                                   const ITensorInfo     *weights,
-                                                   const ITensorInfo     *bias,
-                                                   ITensorInfo           *dst,
-                                                   const ConvolutionInfo &info)
-{
-    const CPUInfo     &ci          = NEScheduler::get().cpu_info();
-    const unsigned int num_threads = NEScheduler::get().num_threads();
-    _pImpl->is_prepared            = false;
-
-    // If we don't support a combination of data types, silently return: it is the caller's responsibility to check if configure() was successful via is_configured()
-    if(!CpuDepthwiseConv2dAssemblyDispatch::validate(src, weights, bias, dst, info))
-    {
-        return;
-    }
-
-    auto dwc_wrapper = std::make_unique<kernels::CpuDepthwiseConv2dAssemblyWrapperKernel>();
-    ARM_COMPUTE_ERROR_ON(dwc_wrapper == nullptr);
-    dwc_wrapper->configure(src, weights, bias, dst, info, ci);
-
-    // Compute memory requirements for assembly kernels
-    constexpr size_t alignment = 4096;
-    _pImpl->mem_req.push_back({ TensorType::ACL_INT_0, dwc_wrapper->get_working_size(num_threads, src->dimension(0)), alignment });
-    _pImpl->mem_req.push_back({ TensorType::ACL_INT_1, dwc_wrapper->get_storage_size(), alignment });
-    _pImpl->asm_kernel = std::move(dwc_wrapper);
-}
-
-Status CpuDepthwiseConv2dAssemblyDispatch::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info)
-{
-    return kernels::CpuDepthwiseConv2dAssemblyWrapperKernel::validate(src, weights, bias, dst, info);
-}
-
-experimental::MemoryRequirements CpuDepthwiseConv2dAssemblyDispatch::workspace() const
-{
-    return _pImpl->mem_req;
-}
-
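The two mem_req entries pushed in configure() above are the operator's side of the auxiliary-memory contract: at run time the caller must place one tensor per requested slot into the pack (ACL_INT_0 for the per-thread working area, ACL_INT_1 for packed-parameter storage). Below is a minimal sketch of honouring that contract; it assumes the experimental::MemoryInfo entries expose slot, size and alignment members, which should be checked against this version of the API.

#include <memory>
#include <vector>
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/Tensor.h"

// Allocate one plain U8 buffer per workspace() entry and bind it to the slot
// the operator asked for. keep_alive owns the buffers while the pack is in use.
void bind_workspace(arm_compute::cpu::CpuDepthwiseConv2dAssemblyDispatch &op,
                    arm_compute::ITensorPack                             &pack,
                    std::vector<std::unique_ptr<arm_compute::Tensor>>    &keep_alive)
{
    using namespace arm_compute;
    for(const auto &req : op.workspace())
    {
        auto aux = std::make_unique<Tensor>();
        // init() takes an optional alignment; the requirements above ask for 4096
        aux->allocator()->init(TensorInfo(TensorShape(req.size), 1, DataType::U8), req.alignment);
        aux->allocator()->allocate();
        pack.add_tensor(req.slot, aux.get()); // field names assumed, see note above
        keep_alive.emplace_back(std::move(aux));
    }
}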
-bool CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(const ActivationLayerInfo &activation) -{ - arm_gemm::Activation act = assembly_utils::map_to_arm_gemm_activation(activation); - return act.type != arm_gemm::Activation::Type::None; -} - -void CpuDepthwiseConv2dAssemblyDispatch::run(ITensorPack &tensors) -{ - ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); - - prepare(tensors); - - NEScheduler::get().schedule_op(_pImpl->asm_kernel.get(), Window::DimY, _pImpl->asm_kernel->window(), tensors); -} - -void CpuDepthwiseConv2dAssemblyDispatch::prepare(ITensorPack &tensors) -{ - if(!_pImpl->is_prepared) - { - // Pack weights and bias - const ITensor *weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); - const ITensor *bias = tensors.get_const_tensor(TensorType::ACL_SRC_2); - ITensor *storage = tensors.get_tensor(TensorType::ACL_INT_1); - - const auto weights_ptr = weights->buffer() + weights->info()->offset_first_element_in_bytes(); - const auto bias_ptr = (bias) ? bias->buffer() + bias->info()->offset_first_element_in_bytes() : nullptr; - auto parameters_ptr = storage->buffer() + storage->info()->offset_first_element_in_bytes(); - - const auto weights_shape = weights->info()->tensor_shape(); - const auto weights_padding = weights->info()->padding(); - - const size_t ld_weights_col = weights_shape[0] + weights_padding.left + weights_padding.right; - const size_t ld_weights_row = ld_weights_col * (weights_shape[1] + weights_padding.top + weights_padding.bottom); - _pImpl->asm_kernel->pack_parameters(parameters_ptr, bias_ptr, weights_ptr, ld_weights_col, ld_weights_row); - - weights->mark_as_unused(); - if(bias != nullptr) - { - bias->mark_as_unused(); - } - _pImpl->is_prepared = true; - } -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h b/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h deleted file mode 100644 index f3d3b618c6..0000000000 --- a/src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_DISPATCH_H -#define ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_DISPATCH_H - -#include "src/core/common/Macros.h" -#include "src/runtime/cpu/ICpuOperator.h" - -namespace arm_compute -{ -namespace cpu -{ -/** Depthwise convolution assembly kernel glue */ -class CpuDepthwiseConv2dAssemblyDispatch : public ICpuOperator -{ -public: - CpuDepthwiseConv2dAssemblyDispatch(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDepthwiseConv2dAssemblyDispatch); - ~CpuDepthwiseConv2dAssemblyDispatch(); - /** Initialize the function's source, destination, kernels and border_size. - * - * @note Supports only NHWC format - * - * @param[in] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[in] weights Weights tensor info. These are 3D tensors with shape [W, H, IFM]. - * Data type supported: same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED. - * @param[in] bias (Optional) Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. - * Data type supported: same as @p src or S32 if @p src is quantized. - * @param[out] dst Destination tensor info. Data type supported: same as @p src. - * @param[in] info Depthwise convolution meta-data. - */ - void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const ConvolutionInfo &info); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuDepthwiseConv2dAssemblyDispatch::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info); - /** Checks if activation is supported by the assembly kernels - * - * @param[in] activation Activation to check - * - * @return True if activation is supported else false - */ - static bool is_activation_supported(const ActivationLayerInfo &activation); - - // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; - experimental::MemoryRequirements workspace() const override; - -private: - struct LocalImpl; - std::unique_ptr _pImpl; -}; -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_DISPATCH_H */ diff --git a/src/runtime/cpu/operators/CpuDequantize.cpp b/src/runtime/cpu/operators/CpuDequantize.cpp deleted file mode 100644 index 80a2e28aee..0000000000 --- a/src/runtime/cpu/operators/CpuDequantize.cpp +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/cpu/operators/CpuDequantize.h" - -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "src/core/cpu/kernels/CpuDequantizeKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -void CpuDequantize::configure(const ITensorInfo *src, ITensorInfo *dst) -{ - auto k = std::make_unique(); - k->configure(src, dst); - _kernel = std::move(k); -} - -Status CpuDequantize::validate(const ITensorInfo *src, const ITensorInfo *dst) -{ - return kernels::CpuDequantizeKernel::validate(src, dst); -} - -void CpuDequantize::run(ITensorPack &tensors) -{ - ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); - prepare(tensors); - NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors); -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuDequantize.h b/src/runtime/cpu/operators/CpuDequantize.h deleted file mode 100644 index fdbd6a57c2..0000000000 --- a/src/runtime/cpu/operators/CpuDequantize.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_DEQUANTIZE_H -#define ARM_COMPUTE_CPU_DEQUANTIZE_H - -#include "src/runtime/cpu/ICpuOperator.h" - -namespace arm_compute -{ -namespace cpu -{ -/** Basic function to run @ref kernels::CpuDequantizeKernel that dequantizes an input tensor */ -class CpuDequantize : public ICpuOperator -{ -public: - /** Configure the kernel. - * - * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16. - * @param[out] dst Destination tensor info with the same dimensions of input. Data type supported: F16/F32. 
- */ - void configure(const ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref CpuDequantize::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); - - // Inherited methods overridden: - void run(ITensorPack &tensors) override; -}; -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_DEQUANTIZE_H */ diff --git a/src/runtime/cpu/operators/CpuDirectConv2d.cpp b/src/runtime/cpu/operators/CpuDirectConv2d.cpp deleted file mode 100644 index 8812b777a3..0000000000 --- a/src/runtime/cpu/operators/CpuDirectConv2d.cpp +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/cpu/operators/CpuDirectConv2d.h" - -#include "arm_compute/core/PixelValue.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" - -namespace arm_compute -{ -namespace cpu -{ -CpuDirectConv2d::~CpuDirectConv2d() = default; - -CpuDirectConv2d::CpuDirectConv2d(std::shared_ptr memory_manager) - : _memory_group(std::move(memory_manager)), _output_stage_kernel(), _conv_kernel(), _input_border_handler(), _activationlayer_function(), _accumulator(), _has_bias(false), - _is_activationlayer_enabled(false), _dim_split(Window::DimZ), _is_padding_required() -{ -} - -void CpuDirectConv2d::configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); - _output_stage_kernel = std::make_unique(); - _conv_kernel = std::make_unique(); - _input_border_handler = std::make_unique(); - - // Free accumulator - if(_accumulator.buffer() != nullptr) - { - _accumulator.allocator()->free(); - } - - _dim_split = src->data_layout() == DataLayout::NCHW ? 
Window::DimZ : Window::DimY; - - // Check if bias should be added in the convolution result - _has_bias = (bias != nullptr); - - _conv_kernel->configure(src, weights, dst, conv_info); - if(_has_bias) - { - _output_stage_kernel->configure(dst, bias); - } - _is_padding_required = !_conv_kernel->border_size().empty(); - - if(_is_padding_required) - { - // Add zero padding XY - _input_border_handler->configure(src, _conv_kernel->border_size(), BorderMode::CONSTANT, PixelValue(static_cast(0.f))); - } - - //Configure Activation Layer - _is_activationlayer_enabled = act_info.enabled(); - if(_is_activationlayer_enabled) - { - _activationlayer_function = std::make_unique(); - _activationlayer_function->configure(dst, dst, act_info); - } -} - -Status CpuDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); - - // output might not be initialized since it can be an intermediate tensor of another layer - DataType data_type = src->data_type(); - TensorInfo accumulator(dst->clone()->set_is_resizable(true).reset_padding().set_data_type(data_type)); - - // Validate Convolution kernel - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConv2dKernel::validate(src, weights, &accumulator, conv_info)); - - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, bias); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != weights->dimension(3), - "Biases size and number of input feature maps should match"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->num_dimensions() > 1, "Biases should be one dimensional"); - } - - // Validate bias kernel - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConv2dOutputStageKernel::validate(&accumulator, bias, dst)); - - if(act_info.enabled()) - { - ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, act_info)); - } - - return Status{}; -} - -void CpuDirectConv2d::run(ITensorPack &tensors) -{ - MemoryGroupResourceScope scope_mg(_memory_group); - - auto src = tensors.get_tensor(TensorType::ACL_SRC_0); - auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - if(_is_padding_required) - { - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC_DST, src); - NEScheduler::get().schedule_op(_input_border_handler.get(), Window::DimZ, _input_border_handler->window(), pack); - } - NEScheduler::get().schedule_op(_conv_kernel.get(), _dim_split, _conv_kernel->window(), tensors); - if(_has_bias) - { - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC_0, dst); - pack.add_tensor(TensorType::ACL_SRC_1, bias); - pack.add_tensor(TensorType::ACL_DST, dst); - NEScheduler::get().schedule_op(_output_stage_kernel.get(), Window::DimY, _output_stage_kernel->window(), pack); - } - - if(_is_activationlayer_enabled) - { - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC, dst); - pack.add_tensor(TensorType::ACL_DST, dst); - _activationlayer_function->run(pack); - } -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuDirectConv2d.h b/src/runtime/cpu/operators/CpuDirectConv2d.h deleted file mode 100644 index c17b076f85..0000000000 --- a/src/runtime/cpu/operators/CpuDirectConv2d.h +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_DIRECTCONV2D_H -#define ARM_COMPUTE_CPU_DIRECTCONV2D_H - -#include "arm_compute/core/ITensorInfo.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/experimental/Types.h" -#include "arm_compute/runtime/IMemoryManager.h" -#include "arm_compute/runtime/MemoryGroup.h" -#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" -#include "arm_compute/runtime/Tensor.h" -#include "src/core/NEON/kernels/NEFillBorderKernel.h" -#include "src/core/cpu/ICpuKernel.h" -#include "src/core/cpu/kernels/CpuDirectConv2dKernel.h" -#include "src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.h" -#include "src/runtime/cpu/ICpuOperator.h" -#include "src/runtime/cpu/operators/CpuActivation.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -/** Function to run the direct convolution. - * - * This function calls the following kernels: - * - * -# @ref NEFillBorderKernel for the input - * -# @ref kernels::CpuDirectConv2dOutputStageKernel - * -# @ref kernels::CpuDirectConv2dKernel - */ -class CpuDirectConv2d : public ICpuOperator -{ -public: - CpuDirectConv2d(std::shared_ptr memory_manager = nullptr); - ~CpuDirectConv2d(); - /** Set the input, weights, biases and output tensors. - * - * @note: DirectConvolution only works in the following configurations: - * 1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F16/F32 - * 3x3 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F16/F32 - * 5x5 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F32 - * - * @param[in, out] src Input tensor info. Data types supported: F16/F32. - * @param[in] weights Set of kernels to convolve the input volume. - * Supported sizes: 1x1, 3x3 and 5x5. - * The 3rd dimension must be the same as the input's volume 3rd dimension. - * Data type supported: Same as @p src. - * @param[in] bias Set of biases. Can be nullptr. Data type supported: Same as @p src. - * @param[out] dst Output tensor info. - * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: Same as @p input. - * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
- */ - void configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuDirectConv2d::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info = ActivationLayerInfo()); - - // Inherited methods overridden: - void run(ITensorPack &tensors) override; - -private: - MemoryGroup _memory_group; - std::unique_ptr _output_stage_kernel; - std::unique_ptr _conv_kernel; - std::unique_ptr _input_border_handler; - std::unique_ptr _activationlayer_function; - Tensor _accumulator; - bool _has_bias{ false }; - bool _is_activationlayer_enabled{ false }; - unsigned int _dim_split{ 0 }; - bool _is_padding_required{ false }; -}; -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_DIRECTCONV2D_H */ diff --git a/src/runtime/cpu/operators/CpuElementwise.cpp b/src/runtime/cpu/operators/CpuElementwise.cpp deleted file mode 100644 index 8953d4769c..0000000000 --- a/src/runtime/cpu/operators/CpuElementwise.cpp +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/cpu/operators/CpuElementwise.h" -#include "src/core/cpu/kernels/CpuElementwiseKernel.h" -#include "src/core/helpers/WindowHelpers.h" - -namespace arm_compute -{ -namespace cpu -{ -void CpuElementwiseBase::run(ITensorPack &tensors) -{ - // If the kernel has been configured, use the window from the kernel. 
-    if(_kernel->is_window_configured())
-    {
-        ICpuOperator::run(tensors);
-        return;
-    }
-
-    auto src0_info        = tensors.get_const_tensor(TensorType::ACL_SRC_0)->info();
-    auto src1_info        = tensors.get_const_tensor(TensorType::ACL_SRC_1)->info();
-    auto shape_and_window = compute_output_shape_and_window(src0_info->tensor_shape(), src1_info->tensor_shape());
-    ICpuOperator::run(tensors, shape_and_window.second);
-}
-
-template <ArithmeticOperation op>
-void CpuElementwiseArithmetic<op>::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
-{
-    auto k = std::make_unique<kernels::CpuArithmeticKernel>();
-    k->configure(op, src0, src1, dst);
-    _kernel = std::move(k);
-}
-
-template <ArithmeticOperation op>
-Status CpuElementwiseArithmetic<op>::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
-{
-    return kernels::CpuArithmeticKernel::validate(op, src0, src1, dst);
-}
-
-template class CpuElementwiseArithmetic<ArithmeticOperation::MAX>;
-template class CpuElementwiseArithmetic<ArithmeticOperation::MIN>;
-template class CpuElementwiseArithmetic<ArithmeticOperation::SQUARED_DIFF>;
-template class CpuElementwiseArithmetic<ArithmeticOperation::PRELU>;
-
-void CpuElementwiseDivision::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
-{
-    auto k = std::make_unique<kernels::CpuDivisionKernel>();
-    k->configure(src0, src1, dst);
-    _kernel = std::move(k);
-}
-
-Status CpuElementwiseDivision::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
-{
-    return kernels::CpuDivisionKernel::validate(src0, src1, dst);
-}
-
-void CpuElementwisePower::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
-{
-    auto k = std::make_unique<kernels::CpuPowerKernel>();
-    k->configure(src0, src1, dst);
-    _kernel = std::move(k);
-}
-
-Status CpuElementwisePower::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
-{
-    return kernels::CpuPowerKernel::validate(src0, src1, dst);
-}
-
-template <ComparisonOperation COP>
-void CpuElementwiseComparisonStatic<COP>::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
-{
-    auto k = std::make_unique<kernels::CpuComparisonKernel>();
-    k->configure(COP, src0, src1, dst);
-    _kernel = std::move(k);
-}
-
-template <ComparisonOperation COP>
-Status CpuElementwiseComparisonStatic<COP>::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
-{
-    return kernels::CpuComparisonKernel::validate(COP, src0, src1, dst);
-}
-
-void CpuElementwiseComparison::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ComparisonOperation op)
-{
-    auto k = std::make_unique<kernels::CpuComparisonKernel>();
-    k->configure(op, src0, src1, dst);
-    _kernel = std::move(k);
-}
-
-Status CpuElementwiseComparison::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ComparisonOperation op)
-{
-    return kernels::CpuComparisonKernel::validate(op, src0, src1, dst);
-}
-
-// Supported Specializations
-template class CpuElementwiseComparisonStatic<ComparisonOperation::Equal>;
-template class CpuElementwiseComparisonStatic<ComparisonOperation::NotEqual>;
-template class CpuElementwiseComparisonStatic<ComparisonOperation::Greater>;
-template class CpuElementwiseComparisonStatic<ComparisonOperation::GreaterEqual>;
-template class CpuElementwiseComparisonStatic<ComparisonOperation::Less>;
-template class CpuElementwiseComparisonStatic<ComparisonOperation::LessEqual>;
-} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/cpu/operators/CpuElementwise.h b/src/runtime/cpu/operators/CpuElementwise.h
deleted file mode 100644
index ef5caf2825..0000000000
--- a/src/runtime/cpu/operators/CpuElementwise.h
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
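Note: the explicit instantiations above enumerate every arithmetic and comparison specialization the library builds; anything else fails at link time. A sketch of the two configuration styles, assuming initialized ITensorInfo objects a_info, b_info, out_info and cmp_info (illustrative names, not from the patch):

    // Operation fixed at compile time through the template parameter:
    cpu::CpuElementwiseMax max_op;
    max_op.configure(&a_info, &b_info, &out_info);

    // Operation selected at run time through an argument:
    cpu::CpuElementwiseComparison cmp_op;
    cmp_op.configure(&a_info, &b_info, &cmp_info, ComparisonOperation::Greater);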
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_ELEMENTWISE_H
-#define ARM_COMPUTE_CPU_ELEMENTWISE_H
-
-#include "src/runtime/cpu/ICpuOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-class CpuElementwiseBase : public ICpuOperator
-{
-public:
-    // Inherited methods overridden:
-    void run(ITensorPack &tensors) override;
-};
-/** Class to run @ref cpu::kernels::CpuArithmeticKernel except for division and power
- *
- * @note Max/Min/Squared difference supports input data type of QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32
- * @note PRelu supports input data type of QASYMM8/QASYMM8_SIGNED/F16/F32.
- */
-template <ArithmeticOperation op>
-class CpuElementwiseArithmetic : public CpuElementwiseBase
-{
-public:
-    /** Configure the operator
-     *
-     * @param[in]  src0 The first source tensor information.
-     * @param[in]  src1 The second source tensor information. With PRelu, this is used as alpha tensor.
-     * @param[out] dst  The output tensor information.
-     */
-    void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
-    /** Static function to check if given info will lead to a valid configuration
-     *
-     * Similar to @ref CpuElementwiseArithmetic::configure()
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
-};
-
-/** Class to run @ref cpu::kernels::CpuArithmeticKernel for the maximum operation */
-using CpuElementwiseMax = CpuElementwiseArithmetic<ArithmeticOperation::MAX>;
-/** Class to run @ref cpu::kernels::CpuArithmeticKernel for the minimum operation */
-using CpuElementwiseMin = CpuElementwiseArithmetic<ArithmeticOperation::MIN>;
-/** Class to run @ref cpu::kernels::CpuArithmeticKernel for the squared difference operation */
-using CpuElementwiseSquaredDiff = CpuElementwiseArithmetic<ArithmeticOperation::SQUARED_DIFF>;
-
-/** Basic function to run @ref cpu::kernels::CpuArithmeticKernel for division
- *
- * @note The tensor data type for the inputs must be S32/F16/F32.
- * @note The function performs a division operation between two tensors (i.e., out[i] = in1[i] / in2[i])
- */
-class CpuElementwiseDivision : public CpuElementwiseBase
-{
-public:
-    /** Initialise the kernel's inputs, dst and conversion policy.
-     *
-     * @param[in, out] src0 First tensor input info. Data types supported: S32/F16/F32.
-     * @param[in, out] src1 Second tensor input info. Data types supported: Same as @p src0.
-     * @param[out]     dst  Output tensor info. Data types supported: Same as @p src0.
-     */
-    void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
-    /** Static function to check if given info will lead to a valid configuration
-     *
-     * Similar to @ref CpuElementwiseDivision::configure()
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
-};
-
-/** Basic function to run @ref cpu::kernels::CpuArithmeticKernel for power
- *
- * @note The tensor data type for the inputs must be F16/F32.
- * @note The function performs an elementwise power of in1 to in2 (i.e., out[i] = in1[i] ^ in2[i])
- * @note For an exponent that is a float, this function will only work with a positive base.
- */
-class CpuElementwisePower : public CpuElementwiseBase
-{
-public:
-    /** Initialise the kernel's inputs, dst and conversion policy.
-     *
-     * @param[in, out] src0 First tensor input info. Data types supported: F16/F32.
-     * @param[in, out] src1 Second tensor input info. Data types supported: Same as @p src0.
-     * @param[out]     dst  Output tensor info. Data types supported: Same as @p src0.
-     */
-    void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
-    /** Static function to check if given info will lead to a valid configuration
-     *
-     * Similar to @ref CpuElementwisePower::configure()
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
-};
-
-/** Basic function to run @ref cpu::kernels::CpuComparisonKernel.
- *
- * @note The tensor data type for the inputs must be QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
- * @note The function performs a comparison operation between two tensors.
- */
-class CpuElementwiseComparison : public CpuElementwiseBase
-{
-public:
-    /** Initialise the kernel's inputs, dst and conversion policy.
-     *
-     * @param[in, out] src0 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
-     * @param[in, out] src1 Second tensor input info. Data types supported: Same as @p src0.
-     * @param[out]     dst  Output tensor info. Data types supported: U16/U32.
-     * @param[in]      op   Comparison Operation to be performed.
-     */
-    void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ComparisonOperation op);
-    /** Static function to check if given info will lead to a valid configuration
-     *
-     * Similar to @ref CpuElementwiseComparison::configure()
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ComparisonOperation op);
-};
-
-/** Basic function to run @ref cpu::kernels::CpuComparisonKernel
- *
- * @note The tensor data type for the inputs must be QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
- * @note The function performs a comparison operation between two tensors.
- */
-template <ComparisonOperation COP>
-class CpuElementwiseComparisonStatic : public CpuElementwiseBase
-{
-public:
-    /** Initialise the kernel's inputs, dst and conversion policy.
-     *
-     * @param[in, out] src0 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
-     * @param[in, out] src1 Second tensor input info. Data types supported: Same as @p src0.
-     * @param[out]     dst  Output tensor info. Data types supported: U16/U32.
-     */
-    void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst);
-    /** Static function to check if given info will lead to a valid configuration
-     *
-     * Similar to @ref CpuElementwiseComparisonStatic::configure()
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
-};
-
-/** Basic function to run equal comparison. */
-using NEEqual = CpuElementwiseComparisonStatic<ComparisonOperation::Equal>;
-/** Basic function to run not equal comparison. */
-using NENotEqual = CpuElementwiseComparisonStatic<ComparisonOperation::NotEqual>;
-/** Basic function to run greater comparison. */
-using NEGreater = CpuElementwiseComparisonStatic<ComparisonOperation::Greater>;
-/** Basic function to run greater-equal comparison. */
-using NEGreaterEqual = CpuElementwiseComparisonStatic<ComparisonOperation::GreaterEqual>;
-/** Basic function to run less comparison. */
-using NELess = CpuElementwiseComparisonStatic<ComparisonOperation::Less>;
-/** Basic function to run less-equal comparison. */
-using NELessEqual = CpuElementwiseComparisonStatic<ComparisonOperation::LessEqual>;
-} // namespace cpu
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_H */
\ No newline at end of file
diff --git a/src/runtime/cpu/operators/CpuElementwiseUnary.cpp b/src/runtime/cpu/operators/CpuElementwiseUnary.cpp
deleted file mode 100644
index c79e6e9acf..0000000000
--- a/src/runtime/cpu/operators/CpuElementwiseUnary.cpp
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
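Note: each operator pairs configure() with a static validate() that runs the same checks without constructing any state, so configurations can be rejected before any kernel is built. A fail-fast sketch against the CpuElementwiseUnary operator that follows (src_info and dst_info are assumed, fully initialized ITensorInfo objects; illustrative only):

    // Status converts to bool; false signals an unsupported configuration.
    const Status s = cpu::CpuElementwiseUnary::validate(ElementWiseUnary::NEG, src_info, dst_info);
    if(!bool(s))
    {
        printf("%s\n", s.error_description().c_str()); // report and bail out
    }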
- */
-#include "src/runtime/cpu/operators/CpuElementwiseUnary.h"
-#include "src/core/cpu/kernels/CpuElementwiseUnaryKernel.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-using KernelType = kernels::CpuElementwiseUnaryKernel;
-
-void CpuElementwiseUnary::configure(ElementWiseUnary op, const ITensorInfo &src, ITensorInfo &dst)
-{
-    auto k = std::make_unique<KernelType>();
-    k->configure(op, src, dst);
-    _kernel = std::move(k);
-}
-
-Status CpuElementwiseUnary::validate(ElementWiseUnary op, const ITensorInfo &src, const ITensorInfo &dst)
-{
-    return KernelType::validate(op, src, dst);
-}
-
-void CpuElementwiseUnary::run(ITensorPack &tensors)
-{
-    if(_kernel->is_window_configured())
-    {
-        ICpuOperator::run(tensors);
-        return;
-    }
-
-    auto src_info = tensors.get_const_tensor(TensorType::ACL_SRC)->info();
-    ICpuOperator::run(tensors, compute_output_shape_and_window(src_info->tensor_shape()).second);
-}
-} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/cpu/operators/CpuElementwiseUnary.h b/src/runtime/cpu/operators/CpuElementwiseUnary.h
deleted file mode 100644
index 5ea29e07e9..0000000000
--- a/src/runtime/cpu/operators/CpuElementwiseUnary.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_H
-#define ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_H
-
-#include "arm_compute/core/Types.h"
-#include "src/runtime/cpu/ICpuOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-class CpuElementwiseUnary : public ICpuOperator
-{
-public:
-    /** Initialize the function
-     *
-     * @param[in]  op  Unary operation to execute
-     * @param[in]  src Input tensor information. Data types supported: F16/F32, F16/F32/S32 for NEG/ABS operations.
-     * @param[out] dst Output tensor information. Data types supported: Same as @p src.
-     */
-    void configure(ElementWiseUnary op, const ITensorInfo &src, ITensorInfo &dst);
-    /** Static function to check if given info will lead to a valid configuration
-     *
-     * Similar to @ref CpuElementwiseUnary::configure()
-     *
-     * @return a status
-     */
-    static Status validate(ElementWiseUnary op, const ITensorInfo &src, const ITensorInfo &dst);
-
-    // Inherited methods overridden:
-    void run(ITensorPack &tensors) override;
-};
-
-} // namespace cpu
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_H */
\ No newline at end of file
diff --git a/src/runtime/cpu/operators/CpuFill.cpp b/src/runtime/cpu/operators/CpuFill.cpp
deleted file mode 100644
index 081e30ea17..0000000000
--- a/src/runtime/cpu/operators/CpuFill.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuFill.h"
-
-#include "src/core/cpu/kernels/CpuFillKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void CpuFill::configure(const ITensorInfo *tensor, PixelValue constant_value)
-{
-    auto k = std::make_unique<kernels::CpuFillKernel>();
-    k->configure(tensor, constant_value);
-    _kernel = std::move(k);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuFill.h b/src/runtime/cpu/operators/CpuFill.h
deleted file mode 100644
index b946467da6..0000000000
--- a/src/runtime/cpu/operators/CpuFill.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_FILL_H
-#define ARM_COMPUTE_CPU_FILL_H
-
-#include "src/runtime/cpu/ICpuOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Basic function to run @ref kernels::CpuFillKernel */
-class CpuFill : public ICpuOperator
-{
-public:
-    /** Configure operator for a given list of arguments
-     *
-     * @param[in,out] tensor         Tensor to fill. Supported data types: All
-     * @param[in]     constant_value The value used to fill the planes of the tensor
-     */
-    void configure(const ITensorInfo *tensor, PixelValue constant_value);
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_FILL_H */
diff --git a/src/runtime/cpu/operators/CpuFlatten.cpp b/src/runtime/cpu/operators/CpuFlatten.cpp
deleted file mode 100644
index 58e6e4b671..0000000000
--- a/src/runtime/cpu/operators/CpuFlatten.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuFlatten.h"
-
-#include "src/core/cpu/kernels/CpuReshapeKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void CpuFlatten::configure(const ITensorInfo *src, ITensorInfo *dst)
-{
-    auto k = std::make_unique<kernels::CpuReshapeKernel>();
-    k->configure(src, dst);
-    _kernel = std::move(k);
-}
-
-Status CpuFlatten::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
-    return kernels::CpuReshapeKernel::validate(src, dst);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuFlatten.h b/src/runtime/cpu/operators/CpuFlatten.h
deleted file mode 100644
index 3e24a93429..0000000000
--- a/src/runtime/cpu/operators/CpuFlatten.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
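Note: as the implementation above shows, flatten on CPU is purely a reshape; the kernel copies elements and only the destination TensorInfo encodes the flattening. An illustrative shape computation (the numbers are assumed, not from the patch):

    // A [W, H, D, N] source flattens to [W*H*D, N]; the element count is unchanged.
    TensorShape src_shape(4U, 3U, 2U, 8U);   // W=4, H=3, D=2, N=8
    TensorShape dst_shape(4U * 3U * 2U, 8U); // [24, 8]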
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_FLATTEN_H -#define ARM_COMPUTE_CPU_FLATTEN_H - -#include "src/runtime/cpu/ICpuOperator.h" - -namespace arm_compute -{ -namespace cpu -{ -/** Basic function to flatten a given input */ -class CpuFlatten : public ICpuOperator -{ -public: - /** Configure operator for a given list of arguments - * - * Valid data layouts: - * - All - * - * Valid data type configurations: - * |src |dst | - * |:--------------|:--------------| - * |All |All | - * - * @param[in] src Source tensor to flatten with at least 3 dimensions. - * The dimensions above the third will be interpreted as batches. Data types supported: All - * @param[in] dst Destination tensor with shape [w*h*d, input_batches] where: - * w = width input tensor, h = height input tensor and d = depth input tensor. - * Data type supported: same as @p src - */ - void configure(const ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref CpuFlatten::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); -}; -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_FLATTEN_H */ diff --git a/src/runtime/cpu/operators/CpuFloor.cpp b/src/runtime/cpu/operators/CpuFloor.cpp deleted file mode 100644 index 4e169a04be..0000000000 --- a/src/runtime/cpu/operators/CpuFloor.cpp +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuFloor.h"
-
-#include "src/core/cpu/kernels/CpuFloorKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void CpuFloor::configure(const ITensorInfo *src, ITensorInfo *dst)
-{
-    auto k = std::make_unique<kernels::CpuFloorKernel>();
-    k->configure(src, dst);
-    _kernel = std::move(k);
-}
-
-Status CpuFloor::validate(const ITensorInfo *src, const ITensorInfo *dst)
-{
-    return kernels::CpuFloorKernel::validate(src, dst);
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuFloor.h b/src/runtime/cpu/operators/CpuFloor.h
deleted file mode 100644
index 0cd0cc0b4e..0000000000
--- a/src/runtime/cpu/operators/CpuFloor.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_FLOOR_H
-#define ARM_COMPUTE_CPU_FLOOR_H
-
-#include "src/runtime/cpu/ICpuOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-/** Basic function to run @ref kernels::CpuFloorKernel */
-class CpuFloor : public ICpuOperator
-{
-public:
-    /** Configure operator for a given list of arguments
-     *
-     * @param[in]  src Source tensor info. Data types supported: F16/F32.
-     * @param[out] dst Destination tensor info. Data type supported: same as @p src
-     */
-    void configure(const ITensorInfo *src, ITensorInfo *dst);
-    /** Static function to check if given info will lead to a valid configuration
-     *
-     * Similar to @ref CpuFloor::configure()
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_FLOOR_H */
diff --git a/src/runtime/cpu/operators/CpuFullyConnected.cpp b/src/runtime/cpu/operators/CpuFullyConnected.cpp
deleted file mode 100644
index eeabce0753..0000000000
--- a/src/runtime/cpu/operators/CpuFullyConnected.cpp
+++ /dev/null
@@ -1,496 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
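Note: the file that follows derives the quantized GEMM output stage from two pieces: a requantization multiplier computed as (src_scale * weights_scale) / dst_scale, later decomposed into a fixed-point multiplier plus shift, and activation bounds mapped into the quantized domain. A worked example under assumed quantization parameters (none of these numbers come from the patch):

    // Assume QASYMM8 dst with scale 0.1 and offset 10; src scale 0.5, weights scale 0.25.
    const float multiplier = (0.5f * 0.25f) / 0.1f; // = 1.25, then split into mantissa and shift
    // quantize(x) = round(x / scale) + offset
    const int32_t q_min = 10; // fused RELU: low bound is the zero point, i.e. real 0.0
    const int32_t q_max = 70; // fused BOUNDED_RELU(a = 6.0): round(6.0 / 0.1) + 10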
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/cpu/operators/CpuFullyConnected.h" - -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensorPack.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "src/core/cpu/kernels/CpuTransposeKernel.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/MemoryHelpers.h" -#include "src/runtime/cpu/operators/CpuConvertFullyConnectedWeights.h" -#include "src/runtime/cpu/operators/CpuFlatten.h" -#include "src/runtime/cpu/operators/CpuGemm.h" -#include "src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h" -#include "src/runtime/cpu/utils/CpuAuxTensorHandler.h" - -namespace arm_compute -{ -namespace cpu -{ -using namespace arm_compute::experimental; -using namespace arm_compute::misc::shape_calculator; - -namespace -{ -// Get min, max bound of a quantized asymmetric dst tensor, with the effect of fused activation -std::pair get_quantized_asymmetric_output_min_max(const QuantizationInfo &q_info, const ActivationLayerInfo &act_info, DataType data_type) -{ - PixelValue type_min{}; - PixelValue type_max{}; - std::tie(type_min, type_max) = get_min_max(data_type); - const UniformQuantizationInfo q_unif = q_info.uniform(); - - if(act_info.enabled()) - { - switch(act_info.activation()) - { - case ActivationLayerInfo::ActivationFunction::RELU: - type_min = PixelValue(q_unif.offset); - break; - case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: - type_min = PixelValue(q_unif.offset); - type_max = PixelValue(act_info.a(), data_type, q_info); - break; - case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: - type_min = PixelValue(act_info.b(), data_type, q_info); - type_max = PixelValue(act_info.a(), data_type, q_info); - break; - default: - ARM_COMPUTE_ERROR("Activation function not supported."); - break; - } - } - - return std::make_pair(type_min, type_max); -} - -Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const ActivationLayerInfo &act, - GEMMLowpOutputStageInfo &gemmlowp_output_stage_info) -{ - const auto data_type = src->data_type(); - const QuantizationInfo oq_info = dst->quantization_info(); - const UniformQuantizationInfo iq_unif = src->quantization_info().uniform(); 
- const UniformQuantizationInfo wq_unif = weights->quantization_info().uniform(); - const UniformQuantizationInfo oq_unif = oq_info.uniform(); - - float multiplier = (iq_unif.scale * wq_unif.scale) / oq_unif.scale; - int32_t output_multiplier; - int32_t output_shift; - - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); - - PixelValue type_min{}; - PixelValue type_max{}; - std::tie(type_min, type_max) = get_quantized_asymmetric_output_min_max(oq_info, act, data_type); - - gemmlowp_output_stage_info.gemmlowp_multiplier = output_multiplier; - gemmlowp_output_stage_info.gemmlowp_shift = output_shift; - gemmlowp_output_stage_info.gemmlowp_offset = oq_unif.offset; - gemmlowp_output_stage_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; - gemmlowp_output_stage_info.gemmlowp_min_bound = type_min.get(); - gemmlowp_output_stage_info.gemmlowp_max_bound = type_max.get(); - - return Status{}; -} - -Status validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ActivationLayerInfo &act) -{ - if(is_data_type_quantized_asymmetric(src->data_type())) - { - // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() - // Extract and negate src and weights offset - const QuantizationInfo src_quantization_info(src->quantization_info().uniform().scale, -src->quantization_info().uniform().offset); - const QuantizationInfo weights_quantization_info(weights->quantization_info().uniform().scale, -weights->quantization_info().uniform().offset); - - GEMMLowpOutputStageInfo gemmlowp_output_stage_info; - ARM_COMPUTE_RETURN_ON_ERROR(get_gemmlowp_output_stage_info(src, weights, dst, act, gemmlowp_output_stage_info)); - - GEMMInfo gemm_info; - gemm_info.set_gemmlowp_output_stage(gemmlowp_output_stage_info); - - // Validate gemmlowp function - TensorInfo src_info = src->clone()->set_quantization_info(src_quantization_info); - TensorInfo weights_info = weights->clone()->set_quantization_info(weights_quantization_info); - ARM_COMPUTE_RETURN_ON_ERROR(CpuGemmLowpMatrixMultiplyCore::validate(&src_info, - &weights_info, - biases, - dst, - gemm_info)); - } - else - { - ARM_COMPUTE_RETURN_ON_ERROR(CpuGemm::validate(src, weights, biases, dst, 1.f, 1.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run */))); - } - - return Status{}; -} -} // namespace - -CpuFullyConnected::CpuFullyConnected() - : _flatten(nullptr), - _convert_weights(nullptr), - _transpose_weights(nullptr), - _mm_gemm(nullptr), - _mm_gemmlowp(nullptr), - _flattened_src(), - _converted_weights(), - _reshaped_weights(), - _trans_weights(), - _trans_weights_idx(AuxTensorIdx::Count), - _aux_mem(Count), - _needs_weights_conversion(false), - _needs_weights_reshape(false), - _is_fc_after_conv(false), - _is_quantized_asymmetric(false), - _is_prepared(false) - -{ -} - -CpuFullyConnected::~CpuFullyConnected() = default; - -void CpuFullyConnected::configure_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act) -{ - if(_is_quantized_asymmetric) - { - // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() - // Extract and negate src and weights offset - const QuantizationInfo src_quantization_info(src->quantization_info().uniform().scale, -src->quantization_info().uniform().offset); - const QuantizationInfo 
weights_quantization_info(weights->quantization_info().uniform().scale, -weights->quantization_info().uniform().offset); - - TensorInfo src_info = src->clone()->set_quantization_info(src_quantization_info); - TensorInfo weights_info = weights->clone()->set_quantization_info(weights_quantization_info); - - // Configure gemmlowp function and output stage for asymmetric quantized types - GEMMLowpOutputStageInfo gemmlowp_output_stage_info; - const Status status = get_gemmlowp_output_stage_info(&src_info, &weights_info, dst, act, gemmlowp_output_stage_info); - ARM_COMPUTE_ERROR_ON(status.error_code() != ErrorCode::OK); - - GEMMInfo gemm_info; - gemm_info.set_gemmlowp_output_stage(gemmlowp_output_stage_info); - gemm_info.set_activation_info(act); - _mm_gemmlowp = std::make_unique(); - _mm_gemmlowp->configure(&src_info, &weights_info, biases, dst, gemm_info); - } - else - { - // Configure matrix multiply kernel - GEMMInfo gemm_info(false, false, true /* Reshape weights only for the first run */); - gemm_info.set_activation_info(act); - _mm_gemm = std::make_unique(); - _mm_gemm->configure(src, weights, biases, dst, 1.f, 1.0f, gemm_info); - } -} - -void CpuFullyConnected::configure_conv_fc(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act) -{ - ARM_COMPUTE_ERROR_ON((weights->dimension(1) != (src->dimension(0) * src->dimension(1) * src->dimension(2)))); - - // If the fully connected layer is called after a convolution layer, the src tensor must be linearized - - // Initialize output tensor for flatten - auto_init_if_empty(_flattened_src, src->clone()->set_tensor_shape(compute_flatten_shape(src))); - - _flatten = std::make_unique(); - _flatten->configure(src, &_flattened_src); - - // Configure matrix multiply kernel - configure_mm(&_flattened_src, weights, biases, dst, act); -} - -void CpuFullyConnected::configure_fc_fc(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act) -{ - ARM_COMPUTE_ERROR_ON(src->dimension(0) != weights->dimension(1)); - - // Configure matrix multiply kernel - configure_mm(src, weights, biases, dst, act); -} - -void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, - FullyConnectedLayerInfo fc_info) -{ - // Perform validate step - ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_ERROR_THROW_ON(CpuFullyConnected::validate(src, - weights, - biases != nullptr ? biases : nullptr, - dst, - fc_info)); - - _needs_weights_conversion = false; - _needs_weights_reshape = fc_info.transpose_weights ? 
!fc_info.are_weights_reshaped : false; - _needs_weights_reshape = _needs_weights_reshape && !fc_info.retain_internal_weights; - _is_fc_after_conv = true; - _is_quantized_asymmetric = is_data_type_quantized_asymmetric(src->data_type()); - _is_prepared = false; - _trans_weights_idx = AuxTensorIdx::Count; - - // With the Fully Connected layer we can have 4 different cases: - // 1) Convolution layer -> Fully Connected layer without batches - // 2) Fully Connected layer -> Fully Connected layer without batches - // 3) Convolution layer -> Fully Connected layer with batches - // 4) Fully Connected layer -> Fully Connected layer with batches - - const ITensorInfo *weights_to_use = weights; - - // Check if we have a fully connected layer with batches - const bool is_batched_fc_layer = dst->dimension(1) > 1; - if(is_batched_fc_layer) - { - _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(src->tensor_shape().cbegin() + 3, - src->tensor_shape().cend(), - dst->tensor_shape().cbegin() + 1)); - } - else - { - _is_fc_after_conv = src->num_dimensions() > 1; - } - - // Reshape weights if needed - if(_needs_weights_reshape) - { - // Reshape the weights - _transpose_weights = std::make_unique(); - _transpose_weights->configure(weights, &_reshaped_weights); - weights_to_use = &_reshaped_weights; - _trans_weights_idx = AuxTensorIdx::TransposedWeights; - } - - // Convert weights if needed - if(_is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout)) - { - // Convert weights - _convert_weights = std::make_unique(); - _convert_weights->configure(weights_to_use, - &_converted_weights, - src->tensor_shape(), - fc_info.weights_trained_layout); - - weights_to_use = &_converted_weights; - _needs_weights_conversion = true; - _trans_weights_idx = AuxTensorIdx::ConvertedWeights; - } - - if(_is_fc_after_conv) - { - // Fully Connected layer after a Convolution Layer without batches - configure_conv_fc(src, weights_to_use, biases, dst, fc_info.activation_info); - } - else - { - // Fully Connected layer after a Fully Connected Layer without batches - configure_fc_fc(src, weights_to_use, biases, dst, fc_info.activation_info); - } - - // Retain the tensorinfo with the weights to use - if(_needs_weights_reshape || _needs_weights_conversion) - { - _trans_weights = *weights_to_use; - } - - // Set auxiliary memory requirements - auto gemm_mem_req = (_is_quantized_asymmetric) ? _mm_gemmlowp->workspace() : _mm_gemm->workspace(); - for(unsigned int i = 0; i < gemm_mem_req.size(); ++i) - { - _aux_mem[i] = gemm_mem_req[i]; - } - - if(_aux_mem[Pretranspose].size > 0) - { - // Release permuted weights at the of prepare as they are further transposed by the assembly dispatch - _aux_mem[TransposedWeights] = MemoryInfo(offset_int_vec(TransposedWeights), MemoryLifetime::Prepare, _reshaped_weights.total_size()); - _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights), MemoryLifetime::Prepare, _converted_weights.total_size()); - } - else - { - _aux_mem[TransposedWeights] = MemoryInfo(offset_int_vec(TransposedWeights), _needs_weights_conversion ? 
MemoryLifetime::Prepare : MemoryLifetime::Persistent, _reshaped_weights.total_size()); - _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights), MemoryLifetime::Persistent, _converted_weights.total_size()); - } - _aux_mem[FlattenedSrc] = MemoryInfo(offset_int_vec(FlattenedSrc), MemoryLifetime::Temporary, _flattened_src.total_size()); -} - -Status CpuFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - FullyConnectedLayerInfo fc_info) -{ - ARM_COMPUTE_UNUSED(fc_info.retain_internal_weights); - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights, dst); - ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2); - ARM_COMPUTE_RETURN_ERROR_ON(biases != nullptr && biases->num_dimensions() > 1); - ARM_COMPUTE_RETURN_ERROR_ON(fc_info.activation_info.enabled() && is_data_type_quantized(src->data_type()) && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU - && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!fc_info.constant_weights, "Non-constant weights are currently not supported"); - - bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true; - bool is_fc_after_conv = true; - - const ITensorInfo &flatten_src = TensorInfo(src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(src))); - const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights))); - const ITensorInfo &converted_weights = weights_reshaped ? 
TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) : TensorInfo(*reshaped_weights.clone()); - - // With the Fully Connected layer we can have 4 different cases: - // 1) Convolution layer -> Fully Connected layer without batches - // 2) Fully Connected layer -> Fully Connected layer without batches - // 3) Convolution layer -> Fully Connected layer with batches - // 4) Fully Connected layer -> Fully Connected layer with batches - - const ITensorInfo *src_to_use = src; - const ITensorInfo *weights_to_use = weights; - - // Check if we have a fully connected layer with batches - const bool is_batched_fc_layer = dst->dimension(1) > 1; - - if(is_batched_fc_layer) - { - is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(src->tensor_shape().cbegin() + 3, - src->tensor_shape().cend(), - dst->tensor_shape().cbegin() + 1)); - } - else - { - is_fc_after_conv = src->num_dimensions() > 1; - } - - if(!weights_reshaped) - { - // Validate reshape weights kernel - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuTransposeKernel::validate(weights, &reshaped_weights)); - weights_to_use = &reshaped_weights; - } - - if(is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout)) - { - // Validate convert weights kernel - ARM_COMPUTE_RETURN_ON_ERROR(CpuConvertFullyConnectedWeights::validate(weights_to_use, - &converted_weights, - src->tensor_shape(), - fc_info.weights_trained_layout)); - weights_to_use = &converted_weights; - } - - if(is_fc_after_conv) - { - // Fully Connected layer after a Convolution Layer without batches - ARM_COMPUTE_RETURN_ERROR_ON((weights_to_use->dimension(1) != (src->dimension(0) * src->dimension(1) * src->dimension(2)))); - - // Validate flatten kernel - ARM_COMPUTE_RETURN_ON_ERROR(CpuFlatten::validate(src, &flatten_src)); - src_to_use = &flatten_src; - } - else - { - // Fully Connected layer after a Fully Connected Layer without batches - ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != weights_to_use->dimension(1)); - } - // Validate matrix multiply kernel - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(src_to_use, weights_to_use, biases, dst, fc_info.activation_info)); - - return Status{}; -} - -void CpuFullyConnected::run(ITensorPack &tensors) -{ - prepare(tensors); - - auto src = tensors.get_const_tensor(ACL_SRC_0); - - CpuAuxTensorHandler flattened_src(offset_int_vec(FlattenedSrc), _flattened_src, tensors, false); - CpuAuxTensorHandler transformed_wei(offset_int_vec(_trans_weights_idx), _trans_weights, tensors, false); - - // Linearize src if it comes from a convolutional layer - if(_is_fc_after_conv) - { - ITensorPack flatten_pack{ { ACL_SRC, src }, { ACL_DST, flattened_src.get() } }; - _flatten->run(flatten_pack); - } - - ITensorPack gemm_pack = tensors; - gemm_pack.add_const_tensor(ACL_SRC_0, (_is_fc_after_conv) ? 
flattened_src.get() : src); - if(_needs_weights_reshape || _needs_weights_conversion) - { - gemm_pack.add_const_tensor(ACL_SRC_1, transformed_wei.get()); - } - - // Run matrix multiply - if(_is_quantized_asymmetric) - { - _mm_gemmlowp->run(gemm_pack); - } - else - { - _mm_gemm->run(gemm_pack); - } -} - -void CpuFullyConnected::prepare(ITensorPack &tensors) -{ - if(!_is_prepared) - { - auto weights = tensors.get_const_tensor(ACL_SRC_1); - - CpuAuxTensorHandler reshaped_weights(offset_int_vec(TransposedWeights), _reshaped_weights, tensors, false); - CpuAuxTensorHandler converted_weights(offset_int_vec(ConvertedWeights), _converted_weights, tensors, false); - - // Pointer to current weights - const ITensor *cur_weights = weights; - - // Reshape of the weights (happens only once) - if(_needs_weights_reshape) - { - // Run reshape weights kernel and mark weights as unused - ITensorPack transpose_pack{ { ACL_SRC, weights }, { ACL_DST, reshaped_weights.get() } }; - NEScheduler::get().schedule_op(_transpose_weights.get(), Window::DimY, _transpose_weights->window(), transpose_pack); - - cur_weights->mark_as_unused(); - cur_weights = reshaped_weights.get(); - } - - // Convert weights if needed (happens only once) - if(_needs_weights_conversion) - { - ITensorPack convert_pack{ { ACL_SRC, cur_weights }, { ACL_DST, converted_weights.get() } }; - _convert_weights->run(convert_pack); - - cur_weights->mark_as_unused(); - cur_weights = converted_weights.get(); - } - - ITensorPack gemm_pack = tensors; - gemm_pack.add_const_tensor(ACL_SRC_1, cur_weights); - - // Prepare GEMM prepare and release unused weights - if(!_is_quantized_asymmetric) - { - _mm_gemm->prepare(gemm_pack); - } - else - { - _mm_gemmlowp->prepare(gemm_pack); - } - - _is_prepared = true; - } -} - -experimental::MemoryRequirements CpuFullyConnected::workspace() const -{ - return _aux_mem; -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuFullyConnected.h b/src/runtime/cpu/operators/CpuFullyConnected.h deleted file mode 100644 index 498ceae68d..0000000000 --- a/src/runtime/cpu/operators/CpuFullyConnected.h +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
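Note: prepare() above runs the one-off weight transformations (transpose, then layout conversion), hands the transformed weights to the GEMM, and marks the originals unused so their backing memory can be reclaimed; run() invokes prepare() on first use, so a caller only ever needs run(). A sketch of the resulting call pattern (fc and pack are illustrative names, not from the patch):

    fc.run(pack); // first call: transform weights, prepare the GEMM, then multiply
    fc.run(pack); // later calls: _is_prepared is set, so only flatten (if any) + GEMM execute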
- */ -#ifndef ARM_COMPUTE_CPU_FULLY_CONNECTED_H -#define ARM_COMPUTE_CPU_FULLY_CONNECTED_H - -#include "src/runtime/cpu/ICpuOperator.h" - -#include "arm_compute/core/TensorInfo.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -// Forward declarations -class CpuConvertFullyConnectedWeights; -class CpuFlatten; -class CpuGemm; -class CpuGemmLowpMatrixMultiplyCore; -namespace kernels -{ -class CpuTransposeKernel; -} // namespace kernels -/** Basic function to compute a Fully Connected layer. This function calls the following kernels: - * -# @ref kernels::CpuIm2ColKernel (called when the input comes from a convolutional layer) - * -# @ref kernels::CpuTransposeKernel (if @p are_weights_reshaped is set to false and transpose_weights is set to true ) (called once) - * -# @ref CpuGemm or @ref CpuGemmLowpMatrixMultiplyCore (if quantized asymmetric) - * -# @ref kernels::CpuGemmMatrixAdditionKernel or @ref CpuGemmLowpOutputStage (if quantized asymmetric) (if @p biases is not equal to nullptr) - * - * @note The fully connected layer accepts "weights" tensors only with 2 dimensions. - */ -class CpuFullyConnected : public ICpuOperator -{ -public: - /** Constructor */ - CpuFullyConnected(); - /** Destructor */ - ~CpuFullyConnected(); - /** Set the input and output tensors. - * - * Valid data layouts: - * - NHWC - * - NCHW - * - * Valid data type configurations: - * |src0 |src1 |src2 |dst | - * |:--------------|:------------------|:------|:--------------| - * |F16 |F16 |F16 |F16 | - * |F32 |F32 |F32 |F32 | - * |QASYMM8 |QASYMM8 |S32 |QASYMM8 | - * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | - * - * @param[in] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[in] weights Weights tensor info. The weights must be 2 dimensional. - * If this function is called after a Convolution Layer, the (transposed) weights will have as many rows as the product of the first 3 input's dimensions. - * If it is called after another FullyConnected Layer, the (transposed) weights will have as many rows as the input's first dimension. - * Data type supported: Same as @p src. - * @param[in] biases Bias tensor info. Can be nullptr. Data type supported: Same as @p weights, S32 if @p weights is QASYMM8/QASYMM8_SIGNED. - * @param[out] dst Destination tensor info. Its shape should be equal to the output of a matrix multiplication between: - * - The output of im2col on the input and the (transposed) 2D weights, if the function is called after a Convolution Layer - * - The input tensor and the (transposed) 2D weights, if the function is called after another FullyConnected Layer. - * Data type supported: Same as @p src. 
- * @param[in] fc_info (Optional) Fully connected layer additional info - */ - void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, - FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); - /** Static function to check if given info will lead to a valid configuration of @ref CpuFullyConnected - * - * Similar to @ref CpuFullyConnected - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); - - //Inherited methods override - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; - experimental::MemoryRequirements workspace() const override; - -private: - void configure_fc_fc(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act); - void configure_conv_fc(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act); - void configure_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act); - - enum AuxTensorIdx - { - AsmGemmWorkspace = 0, - Pretranspose, - GemmTemp1, // Both CpuGemm and CpuGemmLowpMatrixMultiplyCore - GemmTemp2, // Both CpuGemm and CpuGemmLowpMatrixMultiplyCore - GemmTemp3, // Both CpuGemm and CpuGemmLowpMatrixMultiplyCore - GemmTemp4, // CpuGemmLowpMatrixMultiplyCore only - GemmTemp5, // CpuGemmLowpMatrixMultiplyCore only - GemmTemp6, // CpuGemmLowpMatrixMultiplyCore only - GemmTemp7, // CpuGemmLowpMatrixMultiplyCore only - TransposedWeights, - ConvertedWeights, - FlattenedSrc, - Count - }; - - std::unique_ptr _flatten; - std::unique_ptr _convert_weights; - std::unique_ptr _transpose_weights; - std::unique_ptr _mm_gemm; - std::unique_ptr _mm_gemmlowp; - - TensorInfo _flattened_src; - TensorInfo _converted_weights; - TensorInfo _reshaped_weights; - TensorInfo _trans_weights; - AuxTensorIdx _trans_weights_idx; - - experimental::MemoryRequirements _aux_mem; - - bool _needs_weights_conversion; - bool _needs_weights_reshape; - bool _is_fc_after_conv; - bool _is_quantized_asymmetric; - bool _is_prepared; -}; -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_FULLY_CONNECTED_H */ diff --git a/src/runtime/cpu/operators/CpuGemm.cpp b/src/runtime/cpu/operators/CpuGemm.cpp deleted file mode 100644 index bd3f231001..0000000000 --- a/src/runtime/cpu/operators/CpuGemm.cpp +++ /dev/null @@ -1,367 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuGemm.h"
-
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/MemoryHelpers.h"
-#include "src/runtime/cpu/utils/CpuAuxTensorHandler.h"
-
-using namespace arm_compute::experimental;
-using namespace arm_compute::misc::shape_calculator;
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace
-{
-cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info)
-{
-    cpu::AsmGemmInfo asm_info;
-    asm_info.method                  = cpu::AsmConvMethod::Im2Col;
-    asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d();
-    asm_info.depth_output_gemm3d     = info.depth_output_gemm3d();
-    asm_info.activation_info         = info.activation_info();
-    asm_info.fast_mode               = info.fast_math();
-
-    return asm_info;
-}
-} // namespace
-
-void CpuGemm::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, float alpha, float beta, const GEMMInfo &gemm_info)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d);
-    ARM_COMPUTE_ERROR_THROW_ON(CpuGemm::validate(a, b, c, d, alpha, beta, gemm_info));
-
-    const cpu::AsmGemmInfo asm_info      = init_assembly_metadata(gemm_info);
-    const bool             is_c_bias     = gemm_info.reshape_b_only_on_first_run();
-    bool                   run_optimised = bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, (is_c_bias) ? c : nullptr, d, asm_info));
-
-    // Check if we need to reshape the matrix B only on the first run
-    _is_prepared                      = false;
-    _reshape_b_only_on_first_run      = gemm_info.reshape_b_only_on_first_run();
-    _run_vector_matrix_multiplication = a->dimension(1) < 2;
-    _run_alpha_scale                  = alpha != 1.f;
-    _run_bias_addition                = c != nullptr && gemm_info.reshape_b_only_on_first_run();
-    _run_addition                     = beta != 0 && c != nullptr && !gemm_info.reshape_b_only_on_first_run();
-    _run_activation                   = gemm_info.activation_info().enabled() && (!run_optimised || (run_optimised
-                                                                                                     && !cpu::CpuGemmAssemblyDispatch::is_activation_supported(gemm_info.activation_info())));
-
-    if(run_optimised)
-    {
-        const ITensorInfo *c_to_use = is_c_bias ? c : nullptr;
-        _asm_glue                   = std::make_unique<cpu::CpuGemmAssemblyDispatch>();
-        _asm_glue->configure(a, b, c_to_use, d, asm_info);
-        ARM_COMPUTE_ERROR_ON(!_asm_glue->is_configured());
-
-        auto asm_mem_req           = _asm_glue->workspace();
-        _aux_mem[AsmGemmWorkspace] = asm_mem_req[AsmGemmWorkspace];
-        _aux_mem[Pretranspose]     = asm_mem_req[Pretranspose];
-
-        // Scale product by alpha
-        if(_run_alpha_scale)
-        {
-            _alpha_scale_func = std::make_unique<cpu::CpuActivation>();
-            _alpha_scale_func->configure(d, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, alpha, 0.f));
-        }
-    }
-    else
-    {
-        // Pick output tensor in case bias addition should be performed
-        ITensorInfo *gemm_output_to_use = (_run_bias_addition) ? &_tmp_d : d;
-
-        _mm_kernel = std::make_unique<cpu::kernels::CpuGemmMatrixMultiplyKernel>();
-
-        // Select between GEMV and GEMM
-        if(_run_vector_matrix_multiplication)
-        {
-            // Configure the matrix multiply kernel
-            _mm_kernel->configure(a, b, gemm_output_to_use, alpha, false);
-        }
-        else
-        {
-            const int m = a->dimension(1);
-            const int n = b->dimension(0);
-            const int k = a->dimension(0);
-
-            // Configure interleave kernel
-            _interleave_kernel = std::make_unique<cpu::kernels::CpuGemmInterleave4x4Kernel>();
-            _interleave_kernel->configure(a, &_tmp_a);
-            _aux_mem[InterleavedLHS] = MemoryInfo(offset_int_vec(InterleavedLHS), MemoryLifetime::Temporary, _tmp_a.total_size());
-
-            // Configure transpose kernel
-            _transpose_kernel = std::make_unique<cpu::kernels::CpuGemmTranspose1xWKernel>();
-            _transpose_kernel->configure(b, &_tmp_b);
-            _aux_mem[TransposedRHS] = MemoryInfo(offset_int_vec(TransposedRHS), MemoryLifetime::Persistent, _tmp_b.total_size());
-
-            // Configure matrix multiplication kernel
-            _mm_kernel->configure(&_tmp_a, &_tmp_b, gemm_output_to_use, alpha, true, GEMMReshapeInfo(m, n, k));
-        }
-
-        if(_run_bias_addition)
-        {
-            _add_bias = std::make_unique<cpu::CpuAdd>();
-            _add_bias->configure(gemm_output_to_use, c, d, ConvertPolicy::SATURATE);
-            _aux_mem[TempResult] = MemoryInfo(offset_int_vec(TempResult), MemoryLifetime::Temporary, _tmp_d.total_size());
-        }
-    }
-
-    // Configure matrix addition kernel
-    if(_run_addition)
-    {
-        _ma_kernel = std::make_unique<cpu::kernels::CpuGemmMatrixAdditionKernel>();
-        _ma_kernel->configure(c, d, beta);
-    }
-
-    // Configure activation
-    if(_run_activation)
-    {
-        _activation_func = std::make_unique<cpu::CpuActivation>();
-        _activation_func->configure(d, nullptr, gemm_info.activation_info());
-    }
-}
-
-Status CpuGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, float alpha, float beta, const GEMMInfo &gemm_info)
-{
-    ARM_COMPUTE_UNUSED(alpha);
-    const bool is_c_bias = gemm_info.reshape_b_only_on_first_run();
-
-    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(a);
-    ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(a);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::BFLOAT16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(0) != b->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
-    if(a->data_type() != DataType::BFLOAT16)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, d);
-    }
-
-    if(c != nullptr && !is_c_bias)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.depth_output_gemm3d() != 0);
-        ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.reinterpret_input_as_3d());
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(c, d);
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->dimension(1), "The C matrix must have the same number of rows as the matrix A");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != c->dimension(0), "The C matrix must have the same number of columns as the matrix B");
-    }
-
-    if(d->total_size() != 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != d->dimension(0));
-        if(gemm_info.depth_output_gemm3d() != 0)
-        {
-            if(gemm_info.reinterpret_input_as_3d())
-            {
-                ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != d->dimension(1));
-                ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != d->dimension(2));
-            }
-            else
-            {
-                ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != d->dimension(1) * d->dimension(2));
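-                // Example: if only the output is reinterpreted as 3D (depth_output_gemm3d != 0),
-                // the M = 24 rows of A may be written back as a [N x 6 x 4] output, so A's row
-                // count must equal d->dimension(1) * d->dimension(2).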
} - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != d->dimension(1)); - } - } - - // Check if we need to run the optimized assembly kernel - cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info); - const bool run_optimised = bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, is_c_bias ? c : nullptr, d, asm_info)); - - if(!run_optimised) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.reinterpret_input_as_3d(), "CpuGemm cannot reinterpret the input tensor as 3D"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 0, "CpuGemm cannot reinterpret the output tensor as 3D"); - - // Check if the first input tensor is a vector. - const bool run_vector_matrix_multiplication = a->dimension(1) < 2; - // Check if we need to reshape the matrix A and matrix B - const bool run_interleave_transpose = !run_vector_matrix_multiplication && !(gemm_info.reshape_b_only_on_first_run()); - - // Arguments used by GEMMReshapeInfo - // If we pass the matrix A and matrix B reshaped to CpuGemmMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to GEMMReshapeInfo - // in order to know how the matrices have been reshaped - const int m = a->dimension(1); - const int n = b->dimension(0); - const int k = a->dimension(0); - int mult_transpose1xW_width = 1; - int mult_interleave4x4_height = 1; - - const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, gemm_info.depth_output_gemm3d()); - - const ITensorInfo *matrix_a_info = a; - const ITensorInfo *matrix_b_info = b; - - TensorInfo tmp_a_info{}; - TensorInfo tmp_b_info{}; - TensorInfo tmp_output_info = *d->clone(); - - if(run_interleave_transpose) - { - matrix_a_info = &tmp_a_info; - matrix_b_info = &tmp_b_info; - - // Validate interleave kernel - auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_interleaved_shape(*a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d()))); - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmInterleave4x4Kernel::validate(a, &tmp_a_info)); - - // Validate transpose kernel - auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*b, mult_transpose1xW_width))); - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmTranspose1xWKernel::validate(b, &tmp_b_info)); - } - - // Validate matrix multiply - auto_init_if_empty(tmp_output_info, matrix_a_info->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, run_interleave_transpose, reshape_info))); - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &tmp_output_info, alpha, run_interleave_transpose, reshape_info)); - - if(c != nullptr && gemm_info.reshape_b_only_on_first_run()) - { - ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuAdd::validate(&tmp_output_info, c, d, ConvertPolicy::SATURATE)); - } - } - - // Validate matrix addition kernel - if(beta != 0 && c != nullptr && !is_c_bias) - { - ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmMatrixAdditionKernel::validate(c, d, beta)); - } - - // Validate activation - const ActivationLayerInfo &activation = gemm_info.activation_info(); - if(activation.enabled()) - { - ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuActivation::validate(d, nullptr, activation)); - } - - return Status{}; -} - -void CpuGemm::run(ITensorPack &tensors) -{ - prepare(tensors); - - auto a = tensors.get_const_tensor(ACL_SRC_0); - auto b = tensors.get_const_tensor(ACL_SRC_1); - auto c = 
tensors.get_const_tensor(ACL_SRC_2); - auto d = tensors.get_tensor(ACL_DST); - - if(_asm_glue->is_configured()) - { - // Pass c to asm dispatch only if it's the bias tensor - ITensorPack asm_pack = tensors; - asm_pack.add_const_tensor(ACL_SRC_2, (_reshape_b_only_on_first_run) ? c : nullptr); - _asm_glue->run(asm_pack); - if(_run_alpha_scale) - { - ITensorPack pack{ { ACL_SRC, d }, { ACL_DST, d } }; - _alpha_scale_func->run(pack); - } - } - else - { - CpuAuxTensorHandler interleaved_a(offset_int_vec(InterleavedLHS), _tmp_a, tensors, true); - CpuAuxTensorHandler transposed_b(offset_int_vec(TransposedRHS), _tmp_b, tensors, true); - CpuAuxTensorHandler temp_d(offset_int_vec(TempResult), _tmp_d, tensors, true); - - ITensorPack mm_pack{ { ACL_SRC_0, a }, { ACL_SRC_1, b }, { ACL_DST, (_run_bias_addition) ? temp_d.get() : d } }; - if(!_run_vector_matrix_multiplication) - { - // Run interleave kernel - ITensorPack interleave_pack{ { ACL_SRC, a }, { ACL_DST, interleaved_a.get() } }; - NEScheduler::get().schedule_op(_interleave_kernel.get(), Window::DimY, _interleave_kernel->window(), interleave_pack); - - if(!_reshape_b_only_on_first_run) - { - // Run transpose kernel - ITensorPack transpose_pack{ { ACL_SRC, b }, { ACL_DST, transposed_b.get() } }; - NEScheduler::get().schedule_op(_transpose_kernel.get(), Window::DimY, _transpose_kernel->window(), transpose_pack); - } - - // Use reshaped matrices - mm_pack.add_const_tensor(ACL_SRC_0, interleaved_a.get()); - mm_pack.add_const_tensor(ACL_SRC_1, transposed_b.get()); - } - - NEScheduler::get().schedule_op(_mm_kernel.get(), _run_vector_matrix_multiplication ? Window::DimX : Window::DimY, _mm_kernel->window(), mm_pack); - - // Run bias addition kernel - if(_run_bias_addition) - { - ITensorPack pack{ { ACL_SRC_0, temp_d.get() }, { ACL_SRC_1, c }, { ACL_DST, d } }; - _add_bias->run(pack); - } - } - - // Run matrix addition kernel - if(_run_addition) - { - ITensorPack c_add_pack{ { ACL_SRC, c }, { ACL_DST, d } }; - NEScheduler::get().schedule_op(_ma_kernel.get(), Window::DimY, _ma_kernel->window(), c_add_pack); - } - - // Run activation function - if(_run_activation) - { - ITensorPack pack{ { ACL_SRC, d }, { ACL_DST, d } }; - _activation_func->run(pack); - } -} - -void CpuGemm::prepare(ITensorPack &tensors) -{ - if(!_is_prepared) - { - if(_asm_glue->is_configured()) - { - _asm_glue->prepare(tensors); - } - else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication) - { - const ITensor *b = tensors.get_const_tensor(ACL_SRC_1); - ITensor *b_aux = utils::cast::polymorphic_cast(tensors.get_tensor(offset_int_vec(TransposedRHS))); - ARM_COMPUTE_ERROR_ON_NULLPTR(b, b_aux); - - CpuAuxTensorHandler transposed_b(_tmp_b, *b_aux); - ITensorPack transpose_pack{ { ACL_SRC, b }, { ACL_DST, transposed_b.get() } }; - NEScheduler::get().schedule_op(_transpose_kernel.get(), Window::DimY, _transpose_kernel->window(), transpose_pack); - } - _is_prepared = true; - } -} - -experimental::MemoryRequirements CpuGemm::workspace() const -{ - return _aux_mem; -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuGemm.h b/src/runtime/cpu/operators/CpuGemm.h deleted file mode 100644 index 8d859791f5..0000000000 --- a/src/runtime/cpu/operators/CpuGemm.h +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_GEMM_H -#define ARM_COMPUTE_CPU_GEMM_H - -#include "src/runtime/cpu/ICpuOperator.h" - -#include "arm_compute/core/ITensorPack.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "src/core/cpu/kernels/CpuGemmInterleave4x4Kernel.h" -#include "src/core/cpu/kernels/CpuGemmMatrixAdditionKernel.h" -#include "src/core/cpu/kernels/CpuGemmMatrixMultiplyKernel.h" -#include "src/core/cpu/kernels/CpuGemmTranspose1xWKernel.h" -#include "src/runtime/cpu/operators/CpuActivation.h" -#include "src/runtime/cpu/operators/CpuAdd.h" -#include "src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -/** Basic function to execute GEMM. This function calls the following kernels: - * - * If optimized assembly is available: - * -# @ref cpu::CpuGemmAssemblyDispatch - * -# @ref cpu::CpuActivation (if alpha != 1.0) - * Else: - * -# @ref cpu::kernels::CpuGemmInterleave4x4Kernel (if the output tensor is a matrix) - * -# @ref cpu::kernels::CpuGemmTranspose1xWKernel (if the output tensor is a matrix) - * -# @ref cpu::kernels::CpuGemmMatrixMultiplyKernel - * In both cases: - * -# @ref cpu::kernels::CpuGemmMatrixAdditionKernel (if c != nullptr and beta != 0.0 and is not reshaped once) - * Else: - * -# @ref cpu::CpuAdd (if c != nullptr and is reshaped once and not optimized assembly in place) - * - * -# @ref cpu::CpuActivation (if activation is specified in GEMMInfo) - */ -class CpuGemm : public ICpuOperator -{ -public: - /** Default constructor */ - CpuGemm() = default; - /** Default destructor */ - ~CpuGemm() = default; - /** Configure operator for a given list of arguments - * - * Valid data layouts: - * - All - * - * Valid data type configurations: - * |src0 |src1 |src2 |dst | - * |:------------|:-----------|:---------|:--------------| - * |F32 |F32 |F32 |F32 | - * |F16 |F16 |F16 |F16 | - * |BFLOAT16 |BFLOAT16 |BFLOAT16 |BFLOAT16 | - * - * @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C]. - * @note GEMM: The tensors a, b, c, d must have the same data type. You should not mix data types when calling this function. - * - * @param[in] a First input tensor info (Matrix A or Vector A). Data type supported: BFLOAT16/F16/F32 - * @param[in] b Second input tensor info (Matrix B). Data type supported: same as @p a - * @param[in] c Third input tensor info (Matrix C). 
It can be a nullptr if just the multiplication between @p a and @p b is needed. Data type supported: same as @p a
- * @param[out] d         Output tensor info. Data type supported: same as @p a
- * @param[in]  alpha     Weight of the matrix product
- * @param[in]  beta      Weight of matrix C
- * @param[in]  gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
- *                       if the reshape of matrix B should happen only for the first run
- */
-    void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d,
-                   float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo());
-    /** Static function to check if given info will lead to a valid configuration of @ref CpuGemm.
-     *
-     * Similar to @ref CpuGemm::configure()
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d,
-                           float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo());
-
-    // Inherited methods overridden:
-    void run(ITensorPack &tensors) override;
-    void prepare(ITensorPack &constants) override;
-    experimental::MemoryRequirements workspace() const override;
-
-private:
-    enum AuxTensorIdx
-    {
-        AsmGemmWorkspace = 0,
-        Pretranspose,
-        InterleavedLHS,
-        TransposedRHS,
-        TempResult,
-        Count
-    };
-
-    std::unique_ptr<kernels::CpuGemmInterleave4x4Kernel>  _interleave_kernel{ nullptr };
-    std::unique_ptr<kernels::CpuGemmTranspose1xWKernel>   _transpose_kernel{ nullptr };
-    std::unique_ptr<kernels::CpuGemmMatrixMultiplyKernel> _mm_kernel{ nullptr };
-    std::unique_ptr<CpuGemmAssemblyDispatch>              _asm_glue{ nullptr };
-    std::unique_ptr<kernels::CpuGemmMatrixAdditionKernel> _ma_kernel{ nullptr };
-    std::unique_ptr<CpuActivation>                        _alpha_scale_func{ nullptr };
-    std::unique_ptr<CpuAdd>                               _add_bias{ nullptr };
-    std::unique_ptr<CpuActivation>                        _activation_func{ nullptr };
-
-    TensorInfo _tmp_a{};
-    TensorInfo _tmp_b{};
-    TensorInfo _tmp_d{};
-
-    bool _run_vector_matrix_multiplication{ false };
-    bool _run_alpha_scale{ false };
-    bool _run_addition{ false };
-    bool _run_bias_addition{ false };
-    bool _run_activation{ false };
-    bool _reshape_b_only_on_first_run{ false };
-    bool _is_prepared{ false };
-
-    experimental::MemoryRequirements _aux_mem{ Count };
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_GEMM_H */
diff --git a/src/runtime/cpu/operators/CpuGemmConv2d.cpp b/src/runtime/cpu/operators/CpuGemmConv2d.cpp
deleted file mode 100644
index a81dd8a661..0000000000
--- a/src/runtime/cpu/operators/CpuGemmConv2d.cpp
+++ /dev/null
@@ -1,612 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */ -#include "src/runtime/cpu/operators/CpuGemmConv2d.h" - -#include "arm_compute/core/Size2D.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" - -#include "src/core/cpu/kernels/CpuCol2ImKernel.h" -#include "src/core/cpu/kernels/CpuIm2ColKernel.h" -#include "src/core/cpu/kernels/CpuReshapeKernel.h" -#include "src/core/cpu/kernels/CpuWeightsReshapeKernel.h" -#include "src/core/helpers/MemoryHelpers.h" -#include "src/runtime/cpu/operators/CpuGemm.h" -#include "src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h" -#include "src/runtime/cpu/operators/CpuGemmLowpOutputStage.h" -#include "src/runtime/cpu/utils/CpuAuxTensorHandler.h" - -#include -#include - -using namespace arm_compute::misc::shape_calculator; -using namespace arm_compute::experimental; - -namespace arm_compute -{ -namespace cpu -{ -CpuGemmConv2d::CpuGemmConv2d() - : _weights_reshape_kernel(nullptr), _im2col_kernel(), _mm_gemm(), _mm_gemmlowp(), _col2im_kernel(), _reshape_kernel(), _im2col_output(), _weights_reshaped(), _gemm_output(), _gemm_output_3d(), - _data_layout(DataLayout::NCHW), _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _is_prepared(false), _aux_mem(AuxTensorIdx::Count) -{ -} -CpuGemmConv2d::~CpuGemmConv2d() = default; - -void CpuGemmConv2d::configure_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act_info, - bool enable_fast_math, int gemm_3d_depth) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights); - ARM_COMPUTE_ERROR_THROW_ON(validate_mm(src, weights, biases, dst, act_info, enable_fast_math, gemm_3d_depth, _skip_im2col)); - - // Create GEMMInfo structure - const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */, - gemm_3d_depth, _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, - false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info); - - // Supported activations in GEMM - const std::set supported_acts = { ActivationLayerInfo::ActivationFunction::RELU, - ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU - }; - - if(_is_quantized) - { - TensorInfo tmp_src{ *src }; - TensorInfo tmp_weights{ *weights }; - // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() - // Extract and negate input and weights offset - const QuantizationInfo iqinfo = src->quantization_info(); - const QuantizationInfo wqinfo = weights->quantization_info(); - const QuantizationInfo oqinfo = (dst->total_size() == 0) ? 
iqinfo : dst->quantization_info(); - const UniformQuantizationInfo uiqinfo = iqinfo.uniform(); - const UniformQuantizationInfo uoqinfo = oqinfo.uniform(); - const DataType data_type = src->data_type(); - - tmp_src.set_quantization_info(QuantizationInfo(uiqinfo.scale, -uiqinfo.offset)); - if(!is_data_type_quantized_per_channel(tmp_weights.data_type())) - { - const UniformQuantizationInfo uwqinfo = wqinfo.uniform(); - tmp_weights.set_quantization_info(QuantizationInfo(uwqinfo.scale, -uwqinfo.offset)); - } - - // Merge activation with output stage - PixelValue type_min{}; - PixelValue type_max{}; - std::tie(type_min, type_max) = get_min_max(data_type); - int32_t min_activation = type_min.get(); - int32_t max_activation = type_max.get(); - - if(supported_acts.count(act_info.activation()) != 0) - { - std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, uoqinfo); - } - - GEMMLowpOutputStageInfo output_info; - output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; - output_info.gemmlowp_offset = uoqinfo.offset; - output_info.gemmlowp_min_bound = min_activation; - output_info.gemmlowp_max_bound = max_activation; - output_info.is_quantized_per_channel = (tmp_weights.data_type() == DataType::QSYMM8_PER_CHANNEL); - quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, output_info); - - _mm_gemmlowp = std::make_unique(); - _mm_gemmlowp->configure(&tmp_src, &tmp_weights, biases, dst, GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info, false, enable_fast_math, false, act_info)); - - auto mm_mem_req = _mm_gemmlowp->workspace(); - for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont) - { - _aux_mem[cont] = mm_mem_req[cont]; - } - } - else - { - // Configure matrix multiply function - _mm_gemm = std::make_unique(); - _mm_gemm->configure(src, weights, biases, dst, 1.0f, 0.0f, gemm_info); - auto mm_mem_req = _mm_gemm->workspace(); - for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont) - { - _aux_mem[cont] = mm_mem_req[cont]; - } - } -} - -Status CpuGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const ActivationLayerInfo &act_info, bool enable_fast_math, int gemm_3d_depth, bool skip_im2col) -{ - const DataType data_type = src->data_type(); - const bool is_quantized = is_data_type_quantized_asymmetric(data_type); - const bool is_activation_enabled = act_info.enabled(); - - // Create GEMMInfo structure - const GEMMInfo gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */, - gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, - false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info); - - if(is_quantized) - { - // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() - // Extract and negate input and weights offset - const QuantizationInfo &iqinfo = src->quantization_info(); - const QuantizationInfo &wqinfo = weights->quantization_info(); - const QuantizationInfo &oqinfo = (dst->total_size() == 0) ? 
iqinfo : dst->quantization_info(); - const UniformQuantizationInfo uoqinfo = oqinfo.uniform(); - - // Merge activation with output stage - PixelValue type_min{}; - PixelValue type_max{}; - std::tie(type_min, type_max) = get_min_max(data_type); - int32_t min_activation = type_min.get(); - int32_t max_activation = type_max.get(); - - const std::set supported_acts = { ActivationLayerInfo::ActivationFunction::RELU, - ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU - }; - if(is_activation_enabled && supported_acts.count(act_info.activation()) != 0) - { - std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, uoqinfo); - } - - GEMMLowpOutputStageInfo output_info; - output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; - output_info.gemmlowp_offset = uoqinfo.offset; - output_info.gemmlowp_min_bound = min_activation; - output_info.gemmlowp_max_bound = max_activation; - output_info.is_quantized_per_channel = (weights->data_type() == DataType::QSYMM8_PER_CHANNEL); - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, output_info)); - - // Perform validation step on GEMMLowp - std::unique_ptr input_qa = src->clone(); - std::unique_ptr weights_qa = weights->clone(); - input_qa->set_quantization_info(QuantizationInfo(iqinfo.uniform().scale, -iqinfo.uniform().offset)); - weights_qa->set_quantization_info(QuantizationInfo(wqinfo.uniform().scale, -wqinfo.uniform().offset)); - return CpuGemmLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, dst, GEMMInfo(false, false, true, gemm_3d_depth, skip_im2col, false, output_info, - false, enable_fast_math, false, act_info)); - } - else - { - // Perform validation step on Matrix multiply function - return CpuGemm::validate(src, weights, nullptr, dst, 1.0f, 0.0f, gemm_info); - } -} - -Status CpuGemmConv2d::validate_gemm3d(const ITensorInfo *input_info, const ITensorInfo *weights_info, const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col) -{ - const DataType data_type = input_info->data_type(); - const unsigned int mult_y = skip_im2col ? 1U : gemm_3d_depth; - const unsigned int mult_z = skip_im2col ? 
gemm_3d_depth : 1U; - - // Set dummy tensor shapes for the validation - const TensorInfo dummy_input_info(TensorShape(4U, 4U * mult_y, 1U * mult_z), 1, data_type, input_info->quantization_info()); - const TensorInfo dummy_weights_info(TensorShape(4U, 4U), 1, data_type, weights_info->quantization_info()); - const TensorInfo dummy_output_info(TensorShape(4U, 4U, gemm_3d_depth), 1, data_type, input_info->quantization_info()); - - return validate_mm(&dummy_input_info, &dummy_weights_info, nullptr, &dummy_output_info, act_info, false, gemm_3d_depth, skip_im2col); -} - -void CpuGemmConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, - const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_UNUSED(num_groups, weights_info); - ARM_COMPUTE_ERROR_THROW_ON(CpuGemmConv2d::validate(src, - weights, - biases, - dst, - conv_info, - weights_info, - dilation, - act_info, - enable_fast_math, - num_groups)); - - const DataType data_type = src->data_type(); - const DataLayout data_layout = src->data_layout(); - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const int idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES); - - const unsigned int kernel_width = weights->dimension(idx_width); - const unsigned int kernel_height = weights->dimension(idx_height); - - _is_prepared = weights_info.retain_internal_weights(); - _is_quantized = is_data_type_quantized_asymmetric(src->data_type()); - _data_layout = data_layout; - _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1); - - const ITensorInfo *gemm_input_to_use = src; - ITensorInfo *gemm_output_to_use = dst; - - // Get convolved dimensions - unsigned int conv_w = 0; - unsigned int conv_h = 0; - std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), - src->dimension(idx_height), - kernel_width, - kernel_height, - conv_info, - dilation); - ARM_COMPUTE_ERROR_ON_MSG((dst->dimension(idx_width) != conv_w) || (dst->dimension(idx_height) != conv_h), - "Output shape does not match the expected one"); - - // Check if GEMM3D is supported - if(data_layout == DataLayout::NHWC) - { - _skip_col2im = bool(validate_gemm3d(src, weights, act_info, conv_h, true)); - // If not supported, we need to perform im2col and col2im (or reshape layer) - if(!_skip_col2im) - { - _skip_im2col = false; - } - } - else - { - _skip_col2im = false; - } - - // Get parameters from conv_info - unsigned int stride_x = 0; - unsigned int stride_y = 0; - std::tie(stride_x, stride_y) = conv_info.stride(); - - unsigned int mat_weights_cols = weights->dimension(idx_kernels); - - // _weights_reshaped will be auto configured in the kernel. 
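-    // The 4D weights [kernel_x, kernel_y, IFM, OFM] are flattened into a 2D matrix of
-    // OFM columns by (kernel_x * kernel_y * IFM) rows, matching the im2col'd input rows
-    // so that the whole convolution reduces to a single GEMM.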
-    // Just append biases and do not transpose 1xW as it will be reshaped in CpuGemm
-    _weights_reshape_kernel = std::make_unique<kernels::CpuWeightsReshapeKernel>();
-    _weights_reshape_kernel->configure(weights, nullptr, &_weights_reshaped);
-    _weights_reshaped.set_quantization_info(weights->quantization_info());
-
-    // Create tensor to store im2col reshaped inputs
-    if(!_skip_im2col)
-    {
-        // Configure
-        _im2col_kernel = std::make_unique<kernels::CpuIm2ColKernel>();
-        _im2col_kernel->configure(src, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, false, dilation);
-
-        // Update GEMM input
-        gemm_input_to_use = &_im2col_output;
-    }
-
-    // Create temporary GEMM output tensor in case we cannot skip col2im
-    const DataType output_data_type = data_type == DataType::BFLOAT16 ? DataType::F32 : data_type;
-    if(!_skip_col2im)
-    {
-        TensorShape shape_gemm;
-
-        // Calculate GEMM output shape
-        shape_gemm = _im2col_output.tensor_shape();
-        shape_gemm.set(0, mat_weights_cols);
-        shape_gemm.set(1, conv_w * conv_h);
-
-        _gemm_output = TensorInfo(shape_gemm, 1, output_data_type);
-        _gemm_output.set_quantization_info(dst->quantization_info()).set_data_layout(src->data_layout());
-        _gemm_output_3d = TensorInfo(_gemm_output);
-
-        // Update GEMM output
-        gemm_output_to_use = &_gemm_output;
-    }
-    else
-    {
-        _gemm_output_3d = TensorInfo(*dst);
-        _gemm_output_3d.set_data_type(output_data_type).set_data_layout(src->data_layout()).set_is_resizable(true);
-        _gemm_output = TensorInfo(_gemm_output_3d);
-
-        // Update GEMM output
-        gemm_output_to_use = &_gemm_output_3d;
-    }
-
-    // Configure GEMM
-    // In case we need to skip col2im, GEMM3D (gemm_3d_depth != 0) must be called in order to avoid reshaping the output matrix
-    const unsigned int gemm_3d_depth = _skip_col2im ? conv_h : 0;
-    configure_mm(gemm_input_to_use, &_weights_reshaped, biases, gemm_output_to_use, act_info, enable_fast_math, gemm_3d_depth);
-
-    if(!_skip_col2im && _data_layout == DataLayout::NCHW)
-    {
-        // Configure col2im
-        _col2im_kernel = std::make_unique<kernels::CpuCol2ImKernel>();
-        _col2im_kernel->configure(gemm_output_to_use, dst, Size2D(conv_w, conv_h));
-    }
-    else
-    {
-        // Configure reshape layer
-        _reshape_kernel = std::make_unique<kernels::CpuReshapeKernel>();
-        _reshape_kernel->configure(gemm_output_to_use, dst);
-    }
-
-    // Check if GEMM transforms weights
-    // Modernise through COMPMID-4535
-    bool gemm_trans_wei = _aux_mem[1].size > 0;                                            // Asm Pretranspose
-    gemm_trans_wei      = _mm_gemm != nullptr ? _aux_mem[3].size > 0 : gemm_trans_wei;     // Transpose RHS
-    gemm_trans_wei      = _mm_gemmlowp != nullptr ? _aux_mem[5].size > 0 : gemm_trans_wei; // Transpose RHS
-
-    // Check lifetime
-    _aux_mem[Im2ColOutput]    = MemoryInfo(offset_int_vec(Im2ColOutput), MemoryLifetime::Temporary, _im2col_output.total_size());
-    _aux_mem[WeightsReshaped] = MemoryInfo(offset_int_vec(WeightsReshaped), gemm_trans_wei ?
MemoryLifetime::Prepare : MemoryLifetime::Persistent, _weights_reshaped.total_size()); - _aux_mem[GemmOutput] = MemoryInfo(offset_int_vec(GemmOutput), MemoryLifetime::Temporary, _gemm_output.total_size()); -} - -Status CpuGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!"); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups > 1, "Grouping (num_groups != 1) is not supported"); - - const DataLayout data_layout = src->data_layout(); - const DataType data_type = src->data_type(); - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - const int idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES); - - const unsigned int kernel_width = weights->dimension(idx_width); - const unsigned int kernel_height = weights->dimension(idx_height); - - TensorInfo im2col_reshaped_info{}; - TensorInfo info_gemm{}; - TensorInfo tmp_info{}; - TensorInfo weights_reshaped_info{}; - const ITensorInfo *gemm_input_to_use = src; - const ITensorInfo *gemm_output_to_use = dst; - const ITensorInfo *weights_to_use = weights; - - const bool append_bias = false; - const bool is_quantized = is_data_type_quantized_asymmetric(data_type); - const bool is_bf16 = data_type == DataType::BFLOAT16; - bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1); - - // Get convolved dimensions - unsigned int conv_w = 0; - unsigned int conv_h = 0; - - std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), - src->dimension(idx_height), - kernel_width, - kernel_height, - conv_info, - dilation); - - // Check if GEMM3D is supported - bool skip_col2im = false; - if(data_layout == DataLayout::NHWC) - { - skip_col2im = bool(validate_gemm3d(src, weights, act_info, conv_h, true)); - // If not supported, we need to perform im2col and col2im (or reshape layer) - if(!skip_col2im) - { - skip_im2col = false; - } - } - - if(skip_col2im) - { - // If not supported, we need to perform im2col and col2im (or reshape layer) - if(!bool(validate_gemm3d(src, weights, act_info, conv_h, skip_im2col))) - { - skip_im2col = false; - skip_col2im = false; - } - } - - ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_channel) != src->dimension(idx_channel)); - ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); - - // Validate biases - if(biases != nullptr) - { - if(is_quantized) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 
1, DataType::S32); - } - else if(is_bf16) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases); - } - ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(idx_kernels)); - ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); - } - - unsigned int mat_weights_cols = weights->dimension(idx_kernels); - unsigned int mat_weights_rows = weights->dimension(idx_width) * weights->dimension(idx_height) * weights->dimension(idx_channel); - - weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, append_bias), 1, data_type); - weights_reshaped_info.set_quantization_info(weights->quantization_info()); - weights_to_use = &weights_reshaped_info; - - if(!skip_im2col) - { - // Create tensor info for im2col reshaped inputs - // For CPU, the batch size is on the fourth dimension - TensorShape shape_im2col = src->tensor_shape(); - shape_im2col.set(0, mat_weights_rows); - shape_im2col.set(1, conv_w * conv_h); - shape_im2col.set(2, 1); - - im2col_reshaped_info = TensorInfo(shape_im2col, 1, data_type); - im2col_reshaped_info.set_quantization_info(src->quantization_info()); - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuIm2ColKernel::validate(src, &im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation)); - gemm_input_to_use = &im2col_reshaped_info; - } - - // Create temporary GEMM output tensor in case we cannot skip col2im - const DataType output_data_type = data_type == DataType::BFLOAT16 ? DataType::F32 : data_type; - if(!skip_col2im) - { - TensorShape shape_gemm = gemm_input_to_use->tensor_shape(); - shape_gemm.set(0, mat_weights_cols); - shape_gemm.set(1, conv_w * conv_h); - info_gemm = TensorInfo(shape_gemm, 1, output_data_type); - } - else - { - info_gemm = TensorInfo(dst->tensor_shape(), 1, output_data_type); - } - info_gemm.set_quantization_info(dst->quantization_info()).set_data_layout(src->data_layout()); - gemm_output_to_use = &info_gemm; - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, act_info, enable_fast_math, skip_col2im ? 
conv_h : 0, skip_im2col)); - - // Validate Col2Im/ReshapeLayer - if(!skip_col2im && (data_layout == DataLayout::NCHW)) - { - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuCol2ImKernel::validate(gemm_output_to_use, dst, Size2D(conv_w, conv_h))); - } - - return Status{}; -} - -void CpuGemmConv2d::run(ITensorPack &tensors) -{ - prepare(tensors); - - auto src = tensors.get_const_tensor(ACL_SRC_0); - auto dst = tensors.get_tensor(ACL_DST); - auto gemm_input_to_use = src; - - CpuAuxTensorHandler im2col_output(offset_int_vec(Im2ColOutput), _im2col_output, tensors, false); - CpuAuxTensorHandler gemm_output(offset_int_vec(GemmOutput), _gemm_output, tensors, false); - CpuAuxTensorHandler reshaped_wei(offset_int_vec(WeightsReshaped), _weights_reshaped, tensors, false); - - bool out_has_padding = _skip_col2im && (dst->info()->padding().bottom != 0 || dst->info()->padding().top != 0); - if(!_skip_im2col) - { - // Run input reshaping - unsigned int y_dim = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - ITensorPack pack = - { - { TensorType::ACL_SRC, src }, - { TensorType::ACL_DST, im2col_output.get() } - }; - NEScheduler::get().schedule_op(_im2col_kernel.get(), y_dim, _im2col_kernel->window(), pack); - gemm_input_to_use = im2col_output.get(); - } - - // Handle the case where output has top/bottom padding - const ITensor *out_to_use = out_has_padding ? gemm_output.get() : dst; - Tensor gemm3d; - _gemm_output_3d.extend_padding(out_to_use->info()->padding()); - gemm3d.allocator()->soft_init(_gemm_output_3d); - gemm3d.allocator()->import_memory(out_to_use->buffer()); - auto gemm_output_to_use = gemm_output.get(); - - if(_skip_im2col) - { - gemm_output_to_use = &gemm3d; - } - if(_skip_col2im && !out_has_padding) - { - gemm_output_to_use = dst; - } - - // Runs CpuGemm or CpuGemmLowpMatrixMultiplyCore functions - ITensorPack pack_mm = tensors; - pack_mm.add_const_tensor(TensorType::ACL_SRC_0, gemm_input_to_use); - pack_mm.add_const_tensor(TensorType::ACL_SRC_1, reshaped_wei.get()); - pack_mm.add_tensor(TensorType::ACL_DST, gemm_output_to_use); - if(_is_quantized) - { - // Run gemmlowp - _mm_gemmlowp->run(pack_mm); - } - else - { - // Run gemm - _mm_gemm->run(pack_mm); - } - - // Reshape output matrix - if(!_skip_col2im) - { - if(_data_layout == DataLayout::NCHW) - { - ITensorPack pack = - { - { TensorType::ACL_SRC, gemm_output.get() }, - { TensorType::ACL_DST, dst } - }; - NEScheduler::get().schedule_op(_col2im_kernel.get(), Window::DimY, _col2im_kernel->window(), pack); - } - else - { - ITensorPack pack = - { - { TensorType::ACL_SRC, gemm_output_to_use }, - { TensorType::ACL_DST, dst } - }; - NEScheduler::get().schedule_op(_reshape_kernel.get(), Window::DimY, _reshape_kernel->window(), pack); - } - } - else if(out_has_padding) - { - ITensorPack pack = - { - { TensorType::ACL_SRC, gemm_output_to_use }, - { TensorType::ACL_DST, dst } - }; - NEScheduler::get().schedule_op(_reshape_kernel.get(), Window::DimY, _reshape_kernel->window(), pack); - } -} - -void CpuGemmConv2d::prepare(ITensorPack &tensors) -{ - if(!_is_prepared) - { - // Run weights reshaping and mark original weights tensor as unused - CpuAuxTensorHandler weights_reshaped(offset_int_vec(WeightsReshaped), _weights_reshaped, tensors); - auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); - ITensorPack pack = - { - { TensorType::ACL_SRC, weights }, - { TensorType::ACL_DST, weights_reshaped.get() } - }; - NEScheduler::get().schedule_op(_weights_reshape_kernel.get(), 3, _weights_reshape_kernel->window(), pack); - 
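-        // The source weights are only needed for this one-off reshape; marking them as
-        // unused afterwards lets the memory manager release their backing memory.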
weights->mark_as_unused(); - - // Prepare GEMM - ITensorPack gemm_pack = tensors; - gemm_pack.add_const_tensor(TensorType::ACL_SRC_1, weights_reshaped.get()); - _is_quantized ? _mm_gemmlowp->prepare(gemm_pack) : _mm_gemm->prepare(gemm_pack); - - _is_prepared = true; - } -} -experimental::MemoryRequirements CpuGemmConv2d::workspace() const -{ - return _aux_mem; -} -} // namespace cpu -} // namespace arm_compute \ No newline at end of file diff --git a/src/runtime/cpu/operators/CpuGemmConv2d.h b/src/runtime/cpu/operators/CpuGemmConv2d.h deleted file mode 100644 index 529256594f..0000000000 --- a/src/runtime/cpu/operators/CpuGemmConv2d.h +++ /dev/null @@ -1,203 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_GEMM_CONV2D_H -#define ARM_COMPUTE_CPU_GEMM_CONV2D_H - -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" -#include "src/runtime/cpu/ICpuOperator.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -class CpuGemm; -class CpuGemmLowpMatrixMultiplyCore; -class CpuGemmLowpOutputStage; -namespace kernels -{ -class CpuWeightsReshapeKernel; -class CpuIm2ColKernel; -class CpuCol2ImKernel; -class CpuReshapeKernel; -} // namespace kernels - -/** Basic function to compute the convolution layer. 
This function calls the following kernels/functions: - * - * -# @ref cpu::kernels::CpuIm2ColKernel - * -# @ref CpuGemm (if the data type is BFLOAT16/FP16/FP32) - * -# @ref CpuGemmLowpMatrixMultiplyCore (if the data type is QASYMM8/QASYMM8_SIGNED) - * -# @ref CpuGemmLowpOutputStage (if the data type is QASYMM8/QASYMM8_SIGNED) - * -# @ref cpu::kernels::CpuCol2ImKernel (if NCHW data layout) - * -# @ref kernels::CpuWeightsReshapeKernel - * - */ -class CpuGemmConv2d : public ICpuOperator -{ -public: - /** Constructor */ - CpuGemmConv2d(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CpuGemmConv2d(const CpuGemmConv2d &) = delete; - /** Prevent instances of this class from being moved (As this class contains non movable objects) */ - CpuGemmConv2d(CpuGemmConv2d &&) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - CpuGemmConv2d &operator=(const CpuGemmConv2d &) = delete; - /** Prevent instances of this class from being moved (As this class contains non movable objects) */ - CpuGemmConv2d &operator=(CpuGemmConv2d &&) = delete; - /** Destructor */ - ~CpuGemmConv2d(); - /** Set the input and output tensors. - * - * Valid data layouts: - * - NHWC - * - NCHW - * - * Valid data type configurations: - * |src0 |src1 |src2 |dst | - * |:--------------|:------------------|:--------|:--------------| - * |F16 |F16 |F16 |F16 | - * |F32 |F32 |F32 |F32 | - * |BFLOAT16 |BFLOAT16 |BFLOAT16 |BFLOAT16 | - * |QASYMM8 |QASYMM8 |S32 |QASYMM8 | - * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 | - * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | - * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED | - * - * @param[in] src Source tensor info. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. - * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32. - * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. - * Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32. - * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. - * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type. - * @param[out] dst Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. - * Data types supported: Same as @p input. - * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. - * @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights - * tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p input. - * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. - * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation - * available which may introduce a drop of accuracy as well. 
Default is false - * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported - */ - void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(), - const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, unsigned int num_groups = 1); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuGemmConvolution::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, - const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), - bool enable_fast_math = false, unsigned int num_groups = 1); - - // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; - experimental::MemoryRequirements workspace() const override; - -private: - /** Configures the appropriate matrix multiply routine - * - * @param[in] src Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32. - * @param[in] weights Weights tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32. - * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. - * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type. - * @param[out] dst Output tensor info. Data types supported: Same as @p input, - * except for input of QASYMM8/QASYMM8_SIGNED type where output should be of S32 type. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. - * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation - * available which may introduce a drop of accuracy as well. Default is false - * @param[in] gemm_3d_depth (Optional) Depth of GEMM 3D (Defaults to 1) - */ - void configure_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo(), - bool enable_fast_math = false, int gemm_3d_depth = 1); - /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConvolutionLayer matrix multiply routines - * - * @param[in] src Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32. - * @param[in] weights Weights tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32. - * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. - * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type. - * @param[in] dst Output tensor info. Data types supported: Same as @p input, - * except for input of QASYMM8/QASYMM8_SIGNED type where output should be of S32 type. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
-     * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
-     *                             available which may introduce a drop of accuracy as well. Default is false
-     * @param[in] gemm_3d_depth    (Optional) Depth of GEMM 3D (Defaults to 1)
-     * @param[in] skip_im2col      (Optional) Flag which specifies if im2col has to be skipped. i.e. 1x1 convolution with NHWC data layout. (Default to false)
-     *
-     * @return a status
-     */
-    static Status validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo(),
-                              bool enable_fast_math = false, int gemm_3d_depth = 1, bool skip_im2col = false);
-    /** Static function to check if GEMM3D is supported in @ref NEGEMM or in @ref CpuGemmLowpMatrixMultiplyCore
-     *
-     * @param[in] src           Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
-     * @param[in] weights       Weights tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
-     * @param[in] act_info      Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
-     * @param[in] gemm_3d_depth Depth of GEMM 3D
-     * @param[in] skip_im2col   Flag which specifies if im2col has to be skipped. i.e. 1x1 convolution with NHWC data layout
-     *
-     * @return a status
-     */
-    static Status validate_gemm3d(const ITensorInfo *src, const ITensorInfo *weights, const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col);
-
-    enum AuxTensorIdx
-    {
-        // CpuGemmLowpMatrixMultiplyCore has up to 8 internal tensors
-        Im2ColOutput = 9,
-        WeightsReshaped,
-        GemmOutput,
-        Count
-    };
-
-    std::unique_ptr<kernels::CpuWeightsReshapeKernel> _weights_reshape_kernel;
-    std::unique_ptr<kernels::CpuIm2ColKernel>         _im2col_kernel;
-    std::unique_ptr<CpuGemm>                          _mm_gemm;
-    std::unique_ptr<CpuGemmLowpMatrixMultiplyCore>    _mm_gemmlowp;
-    std::unique_ptr<kernels::CpuCol2ImKernel>         _col2im_kernel;
-    std::unique_ptr<kernels::CpuReshapeKernel>        _reshape_kernel;
-
-    TensorInfo _im2col_output;
-    TensorInfo _weights_reshaped;
-    TensorInfo _gemm_output;
-    TensorInfo _gemm_output_3d;
-
-    DataLayout _data_layout;
-
-    bool _skip_im2col;
-    bool _skip_col2im;
-    bool _is_quantized;
-    bool _is_prepared;
-
-    experimental::MemoryRequirements _aux_mem{ Count };
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_GEMM_CONV2D_H */
diff --git a/src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp b/src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp
deleted file mode 100644
index 10eece99eb..0000000000
--- a/src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp
+++ /dev/null
@@ -1,222 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuGemmDirectConv2d.h"
-
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "arm_compute/runtime/FunctionDescriptors.h"
-#include "src/core/helpers/MemoryHelpers.h"
-#include "src/runtime/cpu/utils/CpuAuxTensorHandler.h"
-
-#include "support/Cast.h"
-
-#include <set>
-
-namespace arm_compute
-{
-namespace cpu
-{
-using namespace arm_compute::experimental;
-using namespace arm_compute::utils::cast;
-
-namespace
-{
-GEMMLowpOutputStageInfo calculate_output_stage_metadata(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const ActivationLayerInfo &act)
-{
-    // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
-    // Extract and negate input and weights offset
-    const QuantizationInfo        iqinfo    = src->quantization_info();
-    const QuantizationInfo        wqinfo    = weights->quantization_info();
-    const QuantizationInfo        oqinfo    = (dst->total_size() == 0) ? iqinfo : dst->quantization_info();
-    const UniformQuantizationInfo uoqinfo   = oqinfo.uniform();
-    const DataType                data_type = src->data_type();
-    // Merge activation with output stage
-    const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
-                                                                               ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
-                                                                               ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
-                                                                             };
-    PixelValue type_min{};
-    PixelValue type_max{};
-    std::tie(type_min, type_max) = get_min_max(data_type);
-    int32_t min_activation = type_min.get<int32_t>();
-    int32_t max_activation = type_max.get<int32_t>();
-    if(supported_acts.count(act.activation()) != 0)
-    {
-        std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act, data_type, uoqinfo);
-    }
-    GEMMLowpOutputStageInfo os_info;
-    os_info.type                     = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
-    os_info.gemmlowp_offset          = uoqinfo.offset;
-    os_info.gemmlowp_min_bound       = min_activation;
-    os_info.gemmlowp_max_bound       = max_activation;
-    os_info.is_quantized_per_channel = (weights->data_type() == DataType::QSYMM8_PER_CHANNEL);
-    quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, os_info);
-    return os_info;
-}
-cpu::AsmGemmInfo init_assembly_metadata(const Conv2dInfo &info, bool is_indirect)
-{
-    cpu::AsmGemmInfo asm_info;
-    asm_info.method                  = is_indirect ? cpu::AsmConvMethod::Indirect : cpu::AsmConvMethod::Conv;
-    asm_info.ps_info                 = info.conv_info;
-    asm_info.activation_info         = info.act_info;
-    asm_info.depth_output_gemm3d     = true;
-    asm_info.reinterpret_input_as_3d = true;
-    asm_info.padding_top             = info.conv_info.pad_top();
-    asm_info.padding_left            = info.conv_info.pad_left();
-    asm_info.padding_value           = 0.f;
-    asm_info.negated_offsets         = false;
-    asm_info.fast_mode               = info.enable_fast_math;
-    return asm_info;
-}
-} // namespace
-
-CpuGemmDirectConv2d::CpuGemmDirectConv2d()
-    : _gemm_asm_func(std::make_unique<CpuGemmAssemblyDispatch>()),
-      _activation_func(std::make_unique<CpuActivation>()),
-      _weights_permute_func(std::make_unique<CpuPermute>()),
-      _aux_mem(AuxTensorIdx::Count),
-      _perm_weights(),
-      _run_activation(false),
-      _is_prepared(false)
-{
-}
-
-CpuGemmDirectConv2d::~CpuGemmDirectConv2d() = default;
-
-void CpuGemmDirectConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &info)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
-    ARM_COMPUTE_ERROR_THROW_ON(CpuGemmDirectConv2d::validate(src, weights, biases, dst, info));
-    _run_activation = info.act_info.enabled() && !_gemm_asm_func->is_activation_supported(info.act_info);
-    _is_prepared    = false;
-
-    _weights_permute_func->configure(weights, &_perm_weights, PermutationVector{ 3, 0, 1, 2 });
-
-    // Configure assembly dispatch
-    cpu::AsmGemmInfo asm_info = init_assembly_metadata(info, false);
-    if(is_data_type_quantized(src->data_type()))
-    {
-        asm_info.output_stage = calculate_output_stage_metadata(src, weights, dst, info.act_info);
-    }
-    _gemm_asm_func->configure(src, &_perm_weights, biases, dst, asm_info);
-
-    // Configure activation
-    if(_run_activation)
-    {
-        _activation_func->configure(dst, nullptr, info.act_info);
-    }
-
-    // Add auxiliary memory requirements of the assembly dispatch
-    auto asm_mem_req           = _gemm_asm_func->workspace();
-    _aux_mem[AsmGemmWorkspace] = asm_mem_req[AsmGemmWorkspace];
-    _aux_mem[Pretranspose]     = asm_mem_req[Pretranspose];
-
-    if(_aux_mem[Pretranspose].size > 0)
-    {
-        // Release permuted weights at the end of prepare as they are further transposed by the assembly dispatch
-        _aux_mem[PermutedWeights] = MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, weights->total_size());
-    }
-    else
-    {
-        _aux_mem[PermutedWeights] = MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Persistent, weights->total_size());
-    }
-}
-Status CpuGemmDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &info)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.num_groups > 1, "Grouping (num_groups != 1) is not supported on Neon");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NHWC, "Data layout supported is NHWC");
-    const DataType    data_type = src->data_type();
-    const TensorShape i_shape   = src->tensor_shape();
-    const TensorShape w_shape   = weights->tensor_shape();
-    ARM_COMPUTE_RETURN_ERROR_ON(w_shape[0] != i_shape[0]);
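-    // Note: with NHWC (the only layout supported here), dimension 0 is the channel
-    // axis, so this check enforces that the weights' input channels (IFM) match the
-    // channel count of the source tensor.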
ARM_COMPUTE_RETURN_ERROR_ON(info.dilation != Size2D(1U, 1U)); - ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); - // Validate biases - if(biases != nullptr) - { - if(is_data_type_quantized_asymmetric(data_type)) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); - } - else if(data_type == DataType::BFLOAT16) - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases); - } - ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(3)); - ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); - } - - cpu::AsmGemmInfo asm_info = init_assembly_metadata(info, false); - ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuGemmAssemblyDispatch::validate(src, weights, biases, dst, asm_info)); - return Status{}; -} -void CpuGemmDirectConv2d::run(ITensorPack &tensors) -{ - prepare(tensors); - - _gemm_asm_func->run(tensors); - if(_run_activation) - { - _activation_func->run(tensors); - } -} - -void CpuGemmDirectConv2d::prepare(ITensorPack &tensors) -{ - if(!_is_prepared) - { - const ITensor *weights = tensors.get_const_tensor(ACL_SRC_1); - ITensor *weights_aux = utils::cast::polymorphic_cast(tensors.get_tensor(offset_int_vec(PermutedWeights))); - ARM_COMPUTE_ERROR_ON_NULLPTR(weights, weights_aux); - - CpuAuxTensorHandler permuted_weights(_perm_weights, *weights_aux); - ITensorPack permute_tensors{ { ACL_SRC, weights }, { ACL_DST, permuted_weights.get() } }; - _weights_permute_func->run(permute_tensors); - - tensors.add_const_tensor(ACL_SRC_1, permuted_weights.get()); - // Call prepare of assembly dispatch - _gemm_asm_func->prepare(tensors); - - _is_prepared = true; - } -} - -experimental::MemoryRequirements CpuGemmDirectConv2d::workspace() const -{ - return _aux_mem; -} -} // namespace cpu -} // namespace arm_compute \ No newline at end of file diff --git a/src/runtime/cpu/operators/CpuGemmDirectConv2d.h b/src/runtime/cpu/operators/CpuGemmDirectConv2d.h deleted file mode 100644 index 7fb20b3037..0000000000 --- a/src/runtime/cpu/operators/CpuGemmDirectConv2d.h +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */
-#ifndef ARM_COMPUTE_CPU_GEMM_DIRECT_CONV_2D_H
-#define ARM_COMPUTE_CPU_GEMM_DIRECT_CONV_2D_H
-
-#include "arm_compute/core/TensorInfo.h"
-#include "src/core/common/Macros.h"
-#include "src/runtime/cpu/ICpuOperator.h"
-#include "src/runtime/cpu/operators/CpuActivation.h"
-#include "src/runtime/cpu/operators/CpuPermute.h"
-#include "src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-struct Conv2dInfo;
-namespace cpu
-{
-class CpuGemmDirectConv2d : public ICpuOperator
-{
-public:
- CpuGemmDirectConv2d();
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmDirectConv2d);
- ~CpuGemmDirectConv2d();
- /** Set the input and output tensors.
-  *
-  * Valid data layouts:
-  * - All
-  *
-  * Valid data type configurations:
-  * |src0           |src1           |src2           |dst            |
-  * |:--------------|:--------------|:--------------|:--------------|
-  * |QASYMM8        |QASYMM8        |S32            |QASYMM8        |
-  * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32            |QASYMM8_SIGNED |
-  * |F16            |F16            |F16            |F16            |
-  * |F32            |F32            |F32            |F32            |
-  * |BFLOAT16       |BFLOAT16       |BFLOAT16       |BFLOAT16       |
-  *
-  * @param[in]  src     Source tensor info. 3 lower dimensions represent a single input [width, height, IFM],
-  *                     while every optional dimension from 4 and above represent a batch of inputs.
-  *                     Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
-  * @param[in]  weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
-  *                     Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
-  * @param[in]  biases  Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
-  *                     Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
-  * @param[out] dst     Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
-  *                     Data types supported: Same as @p input.
-  * @param[in]  info    Contains padding and stride information described in @ref PadStrideInfo.
-  */
- void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &info);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuGemmDirectConv2d
-  *
-  * Similar to CpuGemmDirectConv2d::configure()
-  *
-  * @return a status
-  */
- static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &info);
-
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &constants) override;
- experimental::MemoryRequirements workspace() const override;
-
-private:
- enum AuxTensorIdx
- {
-     AsmGemmWorkspace = 0,
-     Pretranspose,
-     PermutedWeights,
-     Count
- };
-
- std::unique_ptr<CpuGemmAssemblyDispatch> _gemm_asm_func;
- std::unique_ptr<CpuActivation>           _activation_func;
- std::unique_ptr<CpuPermute>              _weights_permute_func;
- experimental::MemoryRequirements         _aux_mem;
- TensorInfo                               _perm_weights;
- bool                                     _run_activation;
- bool                                     _is_prepared;
-};
-} // namespace cpu
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_CPU_GEMM_DIRECT_CONV_2D_H */
diff --git a/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp b/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp
deleted file mode 100644
index 7affc3f506..0000000000
--- a/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp
+++ /dev/null
@@ -1,711 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/KernelDescriptors.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "arm_compute/runtime/TensorAllocator.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/MemoryHelpers.h" - -#include "src/core/cpu/kernels/CpuConvertQuantizedSignednessKernel.h" -#include "src/core/cpu/kernels/CpuGemmInterleave4x4Kernel.h" -#include "src/core/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h" -#include "src/core/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h" -#include "src/core/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h" -#include "src/core/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h" -#include "src/core/cpu/kernels/CpuGemmTranspose1xWKernel.h" -#include "src/runtime/cpu/operators/CpuActivation.h" -#include "src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h" -#include "src/runtime/cpu/utils/CpuAuxTensorHandler.h" - -using namespace arm_compute::misc::shape_calculator; -using namespace arm_compute::experimental; - -namespace arm_compute -{ -namespace cpu -{ -namespace -{ -cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info) -{ - cpu::AsmGemmInfo asm_info; - asm_info.method = cpu::AsmConvMethod::Im2Col; - asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d(); - asm_info.depth_output_gemm3d = info.depth_output_gemm3d(); - asm_info.activation_info = info.activation_info(); - asm_info.output_stage = info.gemmlowp_output_stage(); - asm_info.fast_mode = info.fast_math(); - - return asm_info; -} -} // namespace - -CpuGemmLowpMatrixMultiplyCore::CpuGemmLowpMatrixMultiplyCore() - : _asm_glue(std::make_unique()), - _mm_kernel(), - _mtx_a_reshape_kernel(), - _mtx_b_reshape_kernel(), - _mtx_a_reduction_kernel(), - _mtx_b_reduction_kernel(), - _offset_contribution_kernel(), - _offset_contribution_output_stage_kernel(), - _activation_func(), - _convert_to_signed_asymm(), - _convert_from_signed_asymm(), - _vector_sum_col(), - _vector_sum_row(), - _tmp_a(), - _tmp_b(), - _mm_result_s32(), - _signed_a(), - _signed_output(), - _a_offset(0), - _b_offset(0), - 
_run_vector_matrix_multiplication(false),
-      _assembly_path(false),
-      _fused_assembly_path(false),
-      _reshape_b_only_on_first_run(false),
-      _is_prepared(false),
-      _fuse_output_stage(false),
-      _run_activation(false),
-      _flip_signedness(false),
-      _gemm_info(),
-      _aux_mem(Count)
-{
-}
-CpuGemmLowpMatrixMultiplyCore::~CpuGemmLowpMatrixMultiplyCore() = default;
-
-void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *dst, const GEMMInfo &gemm_info)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, dst);
-    ARM_COMPUTE_ERROR_THROW_ON(CpuGemmLowpMatrixMultiplyCore::validate(a, b, c, dst, gemm_info));
-
-    const ITensorInfo *matrix_a = a;
-    const ITensorInfo *matrix_b = b;
-    GEMMInfo           info     = gemm_info;
-
-    // Set internal variables
-    _a_offset                         = a->quantization_info().uniform().offset;
-    _b_offset                         = b->quantization_info().uniform().offset;
-    _run_vector_matrix_multiplication = a->dimension(1) < 2;
-    _reshape_b_only_on_first_run      = info.reshape_b_only_on_first_run();
-    _is_prepared                      = false;
-    _fused_assembly_path              = false;
-    _flip_signedness                  = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && _reshape_b_only_on_first_run;
-    _gemm_info                        = gemm_info;
-
-    _asm_glue = std::make_unique<cpu::CpuGemmAssemblyDispatch>();
-
-    const ITensorInfo *a_to_use = a;
-
-    // Convert to QASYMM8 -> QASYMM8_SIGNED and back
-    if(_flip_signedness)
-    {
-        const int32_t                 offset_correction = 128;
-        const DataType                dt                = DataType::QASYMM8_SIGNED;
-        const UniformQuantizationInfo iqinfo            = a_to_use->quantization_info().uniform();
-
-        _signed_a                = a_to_use->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction));
-        _convert_to_signed_asymm = std::make_unique<kernels::CpuConvertQuantizedSignednessKernel>();
-        _convert_to_signed_asymm->configure(a_to_use, &_signed_a);
-        a_to_use  = &_signed_a;
-        _a_offset = _signed_a.quantization_info().uniform().offset;
-
-        const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform();
-        _signed_output                       = dst->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction));
-
-        // Output stage correction
-        GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();
-        output_stage_corr.gemmlowp_offset         = _signed_output.quantization_info().uniform().offset;
-        output_stage_corr.gemmlowp_min_bound -= offset_correction;
-        output_stage_corr.gemmlowp_max_bound -= offset_correction;
-        info.set_gemmlowp_output_stage(output_stage_corr);
-
-        // Update matrix a
-        matrix_a = &_signed_a;
-    }
-
-    // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
-    if(info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
-    {
-        _fuse_output_stage = true;
-        _mm_result_s32     = TensorInfo(dst->tensor_shape(), 1, DataType::S32);
-    }
-
-    // Initialize assembly kernel meta-data
-    const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
-#ifdef __aarch64__
-    switch(a->data_type())
-    {
-        case DataType::QASYMM8:
-        case DataType::QASYMM8_SIGNED:
-        case DataType::U8:
-        case DataType::S8:
-        {
-            if(is_data_type_quantized_asymmetric(a_to_use->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
-            {
-                _asm_glue->configure(a_to_use, b, c, dst, asm_info);
-                _fused_assembly_path = _asm_glue->is_configured();
-            }
-            else
-            {
-                auto output_to_use = (_fuse_output_stage ? &_mm_result_s32 : dst);
-                _asm_glue->configure(a_to_use, b, nullptr, output_to_use, asm_info);
-            }
-            _assembly_path = _asm_glue->is_configured();
-            break;
-        }
-        default:
-        {
-            ARM_COMPUTE_ERROR("Datatype not supported");
-            break;
-        }
-    }
-#endif /* __aarch64__ */
-    if(!(_assembly_path || _run_vector_matrix_multiplication))
-    {
-        matrix_a = &_tmp_a;
-        matrix_b = &_tmp_b;
-
-        // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
-        _tmp_a = TensorInfo(compute_interleaved_shape(*a_to_use), 1, a_to_use->data_type(), a_to_use->quantization_info());
-        // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
-        _tmp_b = TensorInfo(compute_transpose1xW_shape(*b), 1, b->data_type(), b->quantization_info());
-
-        // Configure interleave kernel
-        _mtx_a_reshape_kernel = std::make_unique<kernels::CpuGemmInterleave4x4Kernel>();
-        _mtx_a_reshape_kernel->configure(a_to_use, &_tmp_a);
-
-        // Configure transpose kernel
-        _mtx_b_reshape_kernel = std::make_unique<kernels::CpuGemmTranspose1xWKernel>();
-        _mtx_b_reshape_kernel->configure(b, &_tmp_b);
-    }
-
-    if(!_fused_assembly_path)
-    {
-        // Build reduction info
-        const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false);
-
-        // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
-        if(_a_offset != 0)
-        {
-            _vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);
-
-            // Configure Matrix B reduction kernel
-            _mtx_b_reduction_kernel = std::make_unique<kernels::CpuGemmLowpMatrixBReductionKernel>();
-            _mtx_b_reduction_kernel->configure(b, &_vector_sum_col, reduction_info);
-        }
-
-        // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
-        if(_b_offset != 0)
-        {
-            _vector_sum_row = TensorInfo(compute_reductionB_shape(*a_to_use), 1, DataType::S32);
-
-            // Configure matrix A reduction kernel
-            _mtx_a_reduction_kernel = std::make_unique<kernels::CpuGemmLowpMatrixAReductionKernel>();
-            _mtx_a_reduction_kernel->configure(a_to_use, &_vector_sum_row, reduction_info);
-        }
-
-        if(_fuse_output_stage)
-        {
-            // Configure matrix multiply kernel
-            if(!_assembly_path)
-            {
-                _mm_kernel = std::make_unique<kernels::CpuGemmLowpMatrixMultiplyKernel>();
-                _mm_kernel->configure(matrix_a, matrix_b, &_mm_result_s32);
-            }
-
-            _offset_contribution_output_stage_kernel = std::make_unique<kernels::CpuGemmLowpOffsetContributionOutputStageKernel>();
-            _offset_contribution_output_stage_kernel->configure(&_mm_result_s32,
-                                                                _a_offset == 0 ? nullptr : &_vector_sum_col,
-                                                                _b_offset == 0 ? nullptr : &_vector_sum_row, c,
-                                                                _flip_signedness ? &_signed_output : dst,
-                                                                a->dimension(0),
-                                                                _a_offset, _b_offset, info.gemmlowp_output_stage());
-
-            if(_flip_signedness)
-            {
-                _convert_from_signed_asymm = std::make_unique<kernels::CpuConvertQuantizedSignednessKernel>();
-                _convert_from_signed_asymm->configure(&_signed_output, dst);
-            }
-        }
-        else
-        {
-            // Configure matrix multiply kernel
-            if(!_assembly_path)
-            {
-                _mm_kernel = std::make_unique<kernels::CpuGemmLowpMatrixMultiplyKernel>();
-                _mm_kernel->configure(matrix_a, matrix_b, dst);
-            }
-            // Configure offset contribution kernel
-            _offset_contribution_kernel = std::make_unique<kernels::CpuGemmLowpOffsetContributionKernel>();
-            _offset_contribution_kernel->configure(dst, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ?
nullptr : &_vector_sum_row, a_to_use->dimension(0), - _a_offset, _b_offset); - } - } - // Configure activation - const ActivationLayerInfo &activation = gemm_info.activation_info(); - _run_activation = activation.enabled() && (!_assembly_path || !cpu::CpuGemmAssemblyDispatch::is_activation_supported(activation)); - if(_run_activation) - { - _activation_func = std::make_unique(); - _activation_func->configure(dst, nullptr, activation); - } - - if(_assembly_path) - { - auto asm_mem_req = _asm_glue->workspace(); - _aux_mem[AsmGemmWorkspace] = asm_mem_req[AsmGemmWorkspace]; - _aux_mem[Pretranspose] = asm_mem_req[Pretranspose]; - } - - // Request memory for LHS and RHS reshape matrix - _aux_mem[VectorSumCol] = MemoryInfo(offset_int_vec(VectorSumCol), !_fused_assembly_path && _a_offset != 0 - && _reshape_b_only_on_first_run ? - MemoryLifetime::Persistent : - MemoryLifetime::Temporary, - _vector_sum_col.total_size()); - _aux_mem[VectorSumRow] = MemoryInfo(offset_int_vec(VectorSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size()); - _aux_mem[TmpA] = MemoryInfo(offset_int_vec(TmpA), MemoryLifetime::Temporary, _tmp_a.total_size()); - _aux_mem[TmpB] = MemoryInfo(offset_int_vec(TmpB), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); - _aux_mem[MMResultS32] = MemoryInfo(offset_int_vec(MMResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size()); - _aux_mem[SignedA] = MemoryInfo(offset_int_vec(SignedA), MemoryLifetime::Temporary, _signed_a.total_size()); - _aux_mem[SignedOutput] = MemoryInfo(offset_int_vec(SignedOutput), MemoryLifetime::Temporary, _signed_output.total_size()); -} - -Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE, "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1), - "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported"); - - GEMMInfo info = gemm_info; - const ITensorInfo *matrix_a_info = a; - const ITensorInfo *matrix_b_info = b; - - const ITensorInfo *a_to_use = a; - - TensorInfo tmp_a_info{}; - TensorInfo tmp_b_info{}; - TensorInfo mm_result_s32_info{}; - - int32_t a_offset = a->quantization_info().uniform().offset; - int32_t b_offset = b->quantization_info().uniform().offset; - - bool fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE; - if(fuse_output_stage) - { - auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32)); - } - - // Convert QASYMM8->QASYMM8_SIGNED - TensorInfo signed_a{}; - TensorInfo signed_output{}; - bool 
flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && info.reshape_b_only_on_first_run(); - if(flip_signedness) - { - const int32_t offset_correction = 128; - const DataType dt = DataType::QASYMM8_SIGNED; - const UniformQuantizationInfo iqinfo = a_to_use->quantization_info().uniform(); - - signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction)); - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConvertQuantizedSignednessKernel::validate(a_to_use, &signed_a)); - a_to_use = &signed_a; - a_offset = signed_a.quantization_info().uniform().offset; - - const UniformQuantizationInfo oqinfo = output->quantization_info().uniform(); - signed_output = output->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction)); - - // Output stage correction - GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage(); - output_stage_corr.gemmlowp_offset = signed_output.quantization_info().uniform().offset; - output_stage_corr.gemmlowp_min_bound -= offset_correction; - output_stage_corr.gemmlowp_max_bound -= offset_correction; - info.set_gemmlowp_output_stage(output_stage_corr); - - // Update matrix a - matrix_a_info = &signed_a; - } - - // Initialize assembly kernel meta-data - const AsmGemmInfo asm_info = init_assembly_metadata(info); - - // Check if we need to run the optimized assembly kernel - bool run_optimised = false; - bool run_optimised_requantized = false; - if(is_data_type_quantized_asymmetric(a_to_use->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) - { - run_optimised = bool(CpuGemmAssemblyDispatch::validate(a_to_use, b, c, output, asm_info)); - run_optimised_requantized = run_optimised; - } - else - { - run_optimised = bool(CpuGemmAssemblyDispatch::validate(a_to_use, b, nullptr, fuse_output_stage ? 
&mm_result_s32_info : output, asm_info)); - } - - if(run_optimised) - { - ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0)); - if(info.depth_output_gemm3d() != 0) - { - if(info.reinterpret_input_as_3d()) - { - ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1)); - ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2)); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2)); - } - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1)); - } - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "NEGEMM cannot reinterpret the input tensor as 3D"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "NEGEMM cannot reinterpret the output tensor as 3D"); - - const bool run_vector_matrix_multiplication = a->dimension(1) < 2; - if(!run_vector_matrix_multiplication) - { - matrix_a_info = &tmp_a_info; - matrix_b_info = &tmp_b_info; - - // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ] - TensorShape shape_tmp_a = a->tensor_shape(); - shape_tmp_a.set(0, a->dimension(0) * 4); - shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f)); - - // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ] - TensorShape shape_tmp_b = b->tensor_shape(); - shape_tmp_b.set(0, b->dimension(1) * 16); - shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f)); - - // Validate interleave kernel - auto_init_if_empty(tmp_a_info, a_to_use->clone()->set_tensor_shape(shape_tmp_a)); - auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(shape_tmp_b)); - - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmInterleave4x4Kernel::validate(a_to_use, &tmp_a_info)); - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmTranspose1xWKernel::validate(b, &tmp_b_info)); - } - } - - if(!run_optimised_requantized) - { - TensorInfo info_vector_sum_col{}; - TensorInfo info_vector_sum_row{}; - - const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false); - - // Validate matrix B reduction kernel only if _a_offset is not equal to 0 - if(a_offset != 0) - { - info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32); - - // Configure Matrix B reduction kernel - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, reduction_info)); - } - - // Validate Matrix A reduction kernel only if _b_offset is not equal to 0 - if(b_offset != 0) - { - info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32); - - // Configure matrix A reduction kernel - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixAReductionKernel::validate(a_to_use, &info_vector_sum_row, reduction_info)); - } - - if(fuse_output_stage) - { - if(!run_optimised) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D"); - - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info)); - } - - // Validate offset contribution kernel - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info, - a_offset == 0 ? 
nullptr : &info_vector_sum_col, - b_offset == 0 ? nullptr : &info_vector_sum_row, - c, - flip_signedness ? &signed_output : output, - a_offset, b_offset, - info.gemmlowp_output_stage())); - } - else - { - if(!run_optimised) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D"); - - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output)); - } - // Validate offset contribution kernel - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionKernel::validate(output, - a_offset == 0 ? nullptr : &info_vector_sum_col, - b_offset == 0 ? nullptr : &info_vector_sum_row, - a_offset, b_offset)); - } - } - - // Validate activation - const ActivationLayerInfo &activation = gemm_info.activation_info(); - if(activation.enabled()) - { - ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(output, nullptr, activation)); - } - - return Status{}; -} - -void CpuGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors) -{ - prepare(tensors); - - auto a = tensors.get_const_tensor(TensorType::ACL_SRC_0); - auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1); - auto c = tensors.get_const_tensor(TensorType::ACL_SRC_2); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - auto a_to_use = a; - auto matrix_a = a; - auto matrix_b = b; - - CpuAuxTensorHandler vector_sum_col(offset_int_vec(VectorSumCol), _vector_sum_col, tensors, false); - CpuAuxTensorHandler vector_sum_row(offset_int_vec(VectorSumRow), _vector_sum_row, tensors, false); - CpuAuxTensorHandler tmp_a(offset_int_vec(TmpA), _tmp_a, tensors, false); - CpuAuxTensorHandler tmp_b(offset_int_vec(TmpB), _tmp_b, tensors, true); - CpuAuxTensorHandler mm_result_s32(offset_int_vec(MMResultS32), _mm_result_s32, tensors, false); - CpuAuxTensorHandler signed_a(offset_int_vec(SignedA), _signed_a, tensors, false); - CpuAuxTensorHandler signed_output(offset_int_vec(SignedOutput), _signed_output, tensors, false); - - // Convert QASYMM8->QASYMM8_SIGNED - if(_flip_signedness) - { - ITensorPack pack = - { - { TensorType::ACL_SRC, a }, - { TensorType::ACL_DST, signed_a.get() } - }; - NEScheduler::get().schedule_op(_convert_to_signed_asymm.get(), Window::DimY, _convert_to_signed_asymm->window(), pack); - a_to_use = signed_a.get(); - matrix_a = signed_a.get(); - } - - // Run GEMM - if(_asm_glue->is_configured()) - { - ITensorPack asm_glue_tensors = tensors; - auto output_to_use = (_fuse_output_stage ? 
mm_result_s32.get() : dst); - if(is_data_type_quantized_asymmetric(a_to_use->info()->data_type()) && _gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) - { - asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_0, a_to_use); - asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_1, b); - asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_2, c); - asm_glue_tensors.add_tensor(TensorType::ACL_DST, dst); - } - else - { - asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_0, a_to_use); - asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_1, b); - asm_glue_tensors.add_tensor(TensorType::ACL_DST, output_to_use); - } - _asm_glue->run(asm_glue_tensors); - } - else - { - if(!_run_vector_matrix_multiplication) - { - matrix_a = tmp_a.get(); - matrix_b = tmp_b.get(); - // Run interleave kernel - ITensorPack pack_a = - { - { TensorType::ACL_SRC, a_to_use }, - { TensorType::ACL_DST, tmp_a.get() } - }; - NEScheduler::get().schedule_op(_mtx_a_reshape_kernel.get(), Window::DimY, _mtx_a_reshape_kernel->window(), pack_a); - - if(!_reshape_b_only_on_first_run) - { - ITensorPack pack_b = - { - { TensorType::ACL_SRC, b }, - { TensorType::ACL_DST, tmp_b.get() } - }; - // Run transpose kernel - NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY, _mtx_b_reshape_kernel->window(), pack_b); - } - } - ITensorPack pack_mm = - { - { TensorType::ACL_SRC_0, matrix_a }, - { TensorType::ACL_SRC_1, matrix_b } - }; - if(_fuse_output_stage) - { - pack_mm.add_tensor(TensorType::ACL_DST, mm_result_s32.get()); - } - else - { - pack_mm.add_tensor(TensorType::ACL_DST, dst); - } - NEScheduler::get().schedule_op(_mm_kernel.get(), Window::DimY, _mm_kernel->window(), pack_mm); - } - - if(!_fused_assembly_path) - { - // Run matrix A reduction kernel only if _b_offset is not equal to 0 - if(_b_offset != 0) - { - ITensorPack pack = - { - { TensorType::ACL_SRC, a_to_use }, - { TensorType::ACL_DST, vector_sum_row.get() } - }; - NEScheduler::get().schedule_op(_mtx_a_reduction_kernel.get(), Window::DimX, _mtx_a_reduction_kernel->window(), pack); - } - - // Run matrix B reduction kernel only if _a_offset is not equal to 0 - if(_a_offset != 0 && !_reshape_b_only_on_first_run) - { - ITensorPack pack = - { - { TensorType::ACL_SRC, b }, - { TensorType::ACL_DST, vector_sum_col.get() } - }; - NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX, _mtx_b_reduction_kernel->window(), pack); - } - - if(_fuse_output_stage) - { - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC_0, mm_result_s32.get()); - pack.add_tensor(TensorType::ACL_SRC_1, _a_offset == 0 ? nullptr : vector_sum_col.get()); - pack.add_tensor(TensorType::ACL_SRC_2, _b_offset == 0 ? nullptr : vector_sum_row.get()); - pack.add_tensor(TensorType::ACL_SRC_3, c); - pack.add_tensor(TensorType::ACL_DST, _flip_signedness ? signed_output.get() : dst); - - // Run offset contribution kernel - NEScheduler::get().schedule_op(_offset_contribution_output_stage_kernel.get(), Window::DimY, _offset_contribution_output_stage_kernel->window(), pack); - } - else - { - ITensorPack pack; - pack.add_tensor(TensorType::ACL_SRC_0, _a_offset == 0 ? nullptr : vector_sum_col.get()); - pack.add_tensor(TensorType::ACL_SRC_1, _b_offset == 0 ? 
nullptr : vector_sum_row.get()); - pack.add_tensor(TensorType::ACL_DST, dst); - - // Run offset contribution kernel - NEScheduler::get().schedule_op(_offset_contribution_kernel.get(), Window::DimY, _offset_contribution_kernel->window(), pack); - } - } - - // Convert QASYMM8_SIGNED->QASYMM8 - if(!_fused_assembly_path && _fuse_output_stage && _flip_signedness) - { - ITensorPack pack = - { - { TensorType::ACL_SRC, signed_output.get() }, - { TensorType::ACL_DST, dst } - }; - NEScheduler::get().schedule_op(_convert_from_signed_asymm.get(), Window::DimY, _convert_from_signed_asymm->window(), pack); - } - - // Run fused activation unless already run in the fused assembly - if(_run_activation) - { - ITensorPack pack = - { - { TensorType::ACL_SRC, dst }, - { TensorType::ACL_DST, dst } - }; - _activation_func->run(pack); - } -} - -void CpuGemmLowpMatrixMultiplyCore::prepare(ITensorPack &tensors) -{ - if(!_is_prepared) - { - auto original_b = tensors.get_const_tensor(TensorType::ACL_SRC_1); - // Run assembly reshape - if(_asm_glue->is_configured()) - { - _asm_glue->prepare(tensors); - } - // Run non-assembly reshape - else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue->is_configured()) - { - // Run reshape kernel and mark original weights tensor as unused - ITensor *tmp_b_p = utils::cast::polymorphic_downcast(tensors.get_tensor(offset_int_vec(TmpB))); - CpuAuxTensorHandler tmp_b(_tmp_b, *tmp_b_p); - ITensorPack pack = - { - { TensorType::ACL_SRC, original_b }, - { TensorType::ACL_DST, tmp_b.get() } - }; - NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY, _mtx_b_reshape_kernel->window(), pack); - } - - // Run matrix B reduction kernel only if _a_offset is not equal to 0 - if(!_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run) - { - ITensor *vector_sum_col_p = utils::cast::polymorphic_downcast(tensors.get_tensor(offset_int_vec(VectorSumCol))); - CpuAuxTensorHandler vector_sum_col(_vector_sum_col, *vector_sum_col_p); - ITensorPack pack = - { - { TensorType::ACL_SRC, original_b }, - { TensorType::ACL_DST, vector_sum_col.get() } - }; - NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX, _mtx_b_reduction_kernel->window(), pack); - } - _is_prepared = true; - } -} -experimental::MemoryRequirements CpuGemmLowpMatrixMultiplyCore::workspace() const -{ - return _aux_mem; -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h b/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h deleted file mode 100644 index 1d0e470559..0000000000 --- a/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h +++ /dev/null @@ -1,174 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_GEMMLOWP_MATRIXMULTIPLY_CORE_H
-#define ARM_COMPUTE_CPU_GEMMLOWP_MATRIXMULTIPLY_CORE_H
-
-#include "arm_compute/core/TensorInfo.h"
-#include "src/core/common/Macros.h"
-#include "src/runtime/cpu/ICpuOperator.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace kernels
-{
-class CpuGemmInterleave4x4Kernel;
-class CpuGemmLowpMatrixMultiplyKernel;
-class CpuGemmLowpOffsetContributionKernel;
-class CpuGemmLowpOffsetContributionOutputStageKernel;
-class CpuGemmLowpMatrixAReductionKernel;
-class CpuGemmLowpMatrixBReductionKernel;
-class CpuGemmTranspose1xWKernel;
-class CpuConvertQuantizedSignednessKernel;
-} // namespace kernels
-class CpuGemmAssemblyDispatch;
-class CpuActivation;
-
-/** Basic function to execute GEMMLowpMatrixMultiplyCore. This function calls the following kernels if the DOT product instruction is not available:
- *
- * -# @ref kernels::CpuGemmInterleave4x4Kernel
- * -# @ref kernels::CpuGemmTranspose1xWKernel
- * -# @ref kernels::CpuGemmLowpMatrixMultiplyKernel
- * -# @ref kernels::CpuGemmLowpOffsetContributionKernel
- * -# @ref CpuActivation
- *
- * otherwise if the DOT product instruction is available:
- *
- * -# @ref kernels::CpuGemmLowpOffsetContributionKernel
- *
-*/
-class CpuGemmLowpMatrixMultiplyCore : public ICpuOperator
-{
-public:
- /** Constructor */
- CpuGemmLowpMatrixMultiplyCore();
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpMatrixMultiplyCore);
- /** Destructor */
- ~CpuGemmLowpMatrixMultiplyCore();
- /** Initialise the kernel's inputs, output
-  *
-  * Valid data layouts:
-  * - NHWC
-  * - NCHW
-  *
-  * Valid data type configurations:
-  * |src0           |src1               |src2     |dst            |
-  * |:--------------|:------------------|:--------|:--------------|
-  * |QASYMM8        |QASYMM8            |S32      |QASYMM8        |
-  * |QASYMM8        |QSYMM8_PER_CHANNEL |S32      |QASYMM8        |
-  * |QASYMM8        |QSYMM8             |S32      |QASYMM8        |
-  * |QASYMM8        |QASYMM8            |S32      |S32            |
-  * |QASYMM8        |QSYMM8_PER_CHANNEL |S32      |S32            |
-  * |QASYMM8        |QSYMM8             |S32      |S32            |
-  * |QASYMM8_SIGNED |QASYMM8_SIGNED     |S32      |QASYMM8_SIGNED |
-  * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32      |QASYMM8_SIGNED |
-  * |QASYMM8_SIGNED |QSYMM8             |S32      |QASYMM8_SIGNED |
-  * |QASYMM8_SIGNED |QASYMM8_SIGNED     |S32      |S32            |
-  * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32      |S32            |
-  * |QASYMM8_SIGNED |QSYMM8             |S32      |S32            |
-  *
-  * @note GEMM_LOWP: low precision GEMM kernel
-  * This function performs the following computations:
-  *
-  * -# Convert a values from QASYMM8 to int32 and add a_offset to each of them.
-  * -# Convert b values from QASYMM8 to int32 and add b_offset to each of them.
-  * -# Compute the matrix product of the resulting a * b in int32.
-  *
-  * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is QASYMM8/QASYMM8_SIGNED otherwise
-  *
-  * @param[in]  a         First input tensor info (Matrix A). Data type supported: QASYMM8/QASYMM8_SIGNED.
-  * @param[in]  b         Second input tensor info (Matrix B). Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL.
-  * @param[in]  c         Third input tensor info (Matrix C). It can be a nullptr.
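-  *                       When provided, it is added to the int32 accumulators before the output stage runs (a fused output stage is required).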
-  *                       Data type supported: S32
-  * @param[out] dst       Output tensor info. Data type supported: S32/QASYMM8/QASYMM8_SIGNED
-  * @param[in]  gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
-  *                       if the reshape of matrix B should be executed only for the first run
-  */
- void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *dst, const GEMMInfo &gemm_info = GEMMInfo());
- /** Static function to check if given info will lead to a valid configuration
-  *
-  * Similar to CpuGemmLowpMatrixMultiplyCore::configure()
-  *
-  * @return a status
-  */
- static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *dst, const GEMMInfo &gemm_info = GEMMInfo());
-
- // Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &tensors) override;
- experimental::MemoryRequirements workspace() const override;
-
-private:
- enum AuxTensorIdx
- {
-     AsmGemmWorkspace = 0,
-     Pretranspose,
-     VectorSumCol,
-     VectorSumRow,
-     TmpA,
-     TmpB,
-     MMResultS32,
-     SignedA,
-     SignedOutput,
-     Count
- };
-
- std::unique_ptr<CpuGemmAssemblyDispatch>                                 _asm_glue;
- std::unique_ptr<kernels::CpuGemmLowpMatrixMultiplyKernel>                _mm_kernel;
- std::unique_ptr<kernels::CpuGemmInterleave4x4Kernel>                     _mtx_a_reshape_kernel;
- std::unique_ptr<kernels::CpuGemmTranspose1xWKernel>                      _mtx_b_reshape_kernel;
- std::unique_ptr<kernels::CpuGemmLowpMatrixAReductionKernel>              _mtx_a_reduction_kernel;
- std::unique_ptr<kernels::CpuGemmLowpMatrixBReductionKernel>              _mtx_b_reduction_kernel;
- std::unique_ptr<kernels::CpuGemmLowpOffsetContributionKernel>            _offset_contribution_kernel;
- std::unique_ptr<kernels::CpuGemmLowpOffsetContributionOutputStageKernel> _offset_contribution_output_stage_kernel;
- std::unique_ptr<CpuActivation>                                           _activation_func;
- std::unique_ptr<kernels::CpuConvertQuantizedSignednessKernel>            _convert_to_signed_asymm;
- std::unique_ptr<kernels::CpuConvertQuantizedSignednessKernel>            _convert_from_signed_asymm;
-
- TensorInfo _vector_sum_col;
- TensorInfo _vector_sum_row;
- TensorInfo _tmp_a;
- TensorInfo _tmp_b;
- TensorInfo _mm_result_s32;
- TensorInfo _signed_a;
- TensorInfo _signed_output;
- int32_t    _a_offset;
- int32_t    _b_offset;
-
- bool     _run_vector_matrix_multiplication;
- bool     _assembly_path;
- bool     _fused_assembly_path;
- bool     _reshape_b_only_on_first_run;
- bool     _is_prepared;
- bool     _fuse_output_stage;
- bool     _run_activation;
- bool     _flip_signedness;
- GEMMInfo _gemm_info;
- experimental::MemoryRequirements _aux_mem{};
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_GEMMLOWP_MATRIXMULTIPLY_CORE_H */
diff --git a/src/runtime/cpu/operators/CpuGemmLowpOutputStage.cpp b/src/runtime/cpu/operators/CpuGemmLowpOutputStage.cpp
deleted file mode 100644
index e17f854a21..0000000000
--- a/src/runtime/cpu/operators/CpuGemmLowpOutputStage.cpp
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/cpu/operators/CpuGemmLowpOutputStage.h"
-
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h"
-#include "src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h"
-#include "src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h"
-#include "src/core/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void CpuGemmLowpOutputStage::configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo &info)
-{
-    // Perform validate step
-    ARM_COMPUTE_ERROR_THROW_ON(CpuGemmLowpOutputStage::validate(src, bias, dst, info));
-
-    switch(info.type)
-    {
-        case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT:
-        {
-            switch(info.output_data_type)
-            {
-                case DataType::QASYMM8:
-                {
-                    auto k = std::make_unique<kernels::CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel>();
-                    k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
-                    _kernel = std::move(k);
-                    break;
-                }
-                case DataType::QASYMM8_SIGNED:
-                {
-                    auto k = std::make_unique<kernels::CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel>();
-                    k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
-                    _kernel = std::move(k);
-                    break;
-                }
-                case DataType::QSYMM16:
-                {
-                    auto k = std::make_unique<kernels::CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel>();
-                    k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
-                    _kernel = std::move(k);
-                    break;
-                }
-                default:
-                {
-                    ARM_COMPUTE_ERROR("Unsupported output data type.");
-                    break;
-                }
-            }
-            break;
-        }
-        case GEMMLowpOutputStageType::QUANTIZE_DOWN:
-        {
-            switch(info.output_data_type)
-            {
-                case DataType::QASYMM8:
-                case DataType::QASYMM8_SIGNED:
-                {
-                    auto k = std::make_unique<kernels::CpuGemmLowpQuantizeDownInt32ScaleKernel>();
-                    k->configure(src, bias, dst, &info);
-                    _kernel = std::move(k);
-                    break;
-                }
-                default:
-                {
-                    ARM_COMPUTE_ERROR("Unsupported output data type.");
-                    break;
-                }
-            }
-            break;
-        }
-        default:
-            ARM_COMPUTE_ERROR("Unsupported GEMMLowpOutputStage type.");
-    }
-}
-
-Status CpuGemmLowpOutputStage::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &info)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::UNKNOWN, "CpuGemmLowpOutputStage cannot be used with UNKNOWN output data type.");
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16);
-    ARM_COMPUTE_RETURN_ERROR_ON((info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN) && (info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT));
-
-    switch(info.type)
-    {
-        case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT:
-        {
-            switch(dst->data_type())
-            {
-                case DataType::QASYMM8:
-                    return kernels::CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
-                case DataType::QASYMM8_SIGNED:
-                    return
kernels::CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound); - case DataType::QSYMM16: - return kernels::CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound); - default: - return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported output data type."); - } - } - case GEMMLowpOutputStageType::QUANTIZE_DOWN: - { - switch(dst->data_type()) - { - case DataType::QASYMM8: - case DataType::QASYMM8_SIGNED: - return kernels::CpuGemmLowpQuantizeDownInt32ScaleKernel::validate(src, bias, dst, &info); - default: - return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported output data type."); - } - } - default: - return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported GEMMLowpOutputStage type."); - } -} - -void CpuGemmLowpOutputStage::run(ITensorPack &tensors) -{ - NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors); -} -} // namespace cpu -} // namespace arm_compute \ No newline at end of file diff --git a/src/runtime/cpu/operators/CpuGemmLowpOutputStage.h b/src/runtime/cpu/operators/CpuGemmLowpOutputStage.h deleted file mode 100644 index bed88a60d5..0000000000 --- a/src/runtime/cpu/operators/CpuGemmLowpOutputStage.h +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_GEMMLOWP_OUTPUT_STAGE_H -#define ARM_COMPUTE_CPU_GEMMLOWP_OUTPUT_STAGE_H - -#include "arm_compute/core/Types.h" -#include "src/runtime/cpu/ICpuOperator.h" - -/** This file contains all available output stages for GEMMLowp. - * - * In gemmlowp, the "output stage" is the process that takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyCore), - * and processes it to obtain the final ASYMM8 value. - * - * More information about the GEMMLowp output stage can be found at https://github.com/google/gemmlowp/blob/master/doc/output.md - */ - -namespace arm_compute -{ -namespace cpu -{ -/** Basic function to execute GEMMLowpQuantizeDown kernels. 
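- *
- * In essence each kernel rescales a final int32 accumulator into the narrow quantized output type, broadly:
- *
- *     dst = clamp((FixedPointMul(src + bias, result_multiplier) >> result_shift) + result_offset, min_bound, max_bound)
- *
- * (a sketch of the QUANTIZE_DOWN_FIXEDPOINT path; the QUANTIZE_DOWN path uses a plain integer multiplier and shift instead)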
- * - * This function calls the following kernels: - * - * -# @ref kernels::CpuGemmLowpQuantizeDownInt32ScaleKernel - * -# @ref kernels::CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel - * -# @ref kernels::CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel - * -# @ref kernels::CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel -*/ -class CpuGemmLowpOutputStage : public ICpuOperator -{ -public: - /** Initialise the kernel's inputs, output - * - * Valid data layouts: - * - All - * - * Valid data type configurations: - * |src0 |src1 |dst | - * |:--------------|:-------------|:-------------| - * |S32 |S32 |QASYMM8 | - * |S32 |S32 |QASYMM8_SIGNED| - * |S32 |S32 |QSYMM16 | - * - * @param[in] src Input tensor info. Data type supported: S32 - * @param[in] bias Biases tensor info. Only shared biases supported and it can be a nullptr if the biases addition is not required. - * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. - * @param[out] dst Output tensor info. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM16 - * @param[in] info GEMMLowp output stage metadata. - */ - void configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo &info); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuGemmLowpOutputStage::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &info); - - // Inherited methods overridden: - void run(ITensorPack &tensors) override; -}; -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_GEMMLOWP_OUTPUT_STAGE_H */ diff --git a/src/runtime/cpu/operators/CpuMul.cpp b/src/runtime/cpu/operators/CpuMul.cpp deleted file mode 100644 index 2f3d442a70..0000000000 --- a/src/runtime/cpu/operators/CpuMul.cpp +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "src/runtime/cpu/operators/CpuMul.h" - -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "src/core/cpu/kernels/CpuMulKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -Status CpuMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, - const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); - return kernels::CpuMulKernel::validate(src1, src2, dst, scale, overflow_policy, rounding_policy); -} - -void CpuMul::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, - const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_UNUSED(act_info); - auto k = std::make_unique(); - k->configure(src1, src2, dst, scale, overflow_policy, rounding_policy); - _kernel = std::move(k); -} - -void CpuMul::run(ITensorPack &tensors) -{ - ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); - NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors); -} - -Status CpuComplexMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); - return kernels::CpuComplexMulKernel::validate(src1, src2, dst); -} - -void CpuComplexMul::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_UNUSED(act_info); - auto k = std::make_unique(); - k->configure(src1, src2, dst); - _kernel = std::move(k); -} - -void CpuComplexMul::run(ITensorPack &tensors) -{ - ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); - NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors); -} -} // namespace cpu -} // namespace arm_compute \ No newline at end of file diff --git a/src/runtime/cpu/operators/CpuMul.h b/src/runtime/cpu/operators/CpuMul.h deleted file mode 100644 index da518c4461..0000000000 --- a/src/runtime/cpu/operators/CpuMul.h +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_CPU_MUL_H -#define ARM_COMPUTE_CPU_MUL_H - -#include "arm_compute/core/ITensorInfo.h" -#include "src/runtime/cpu/ICpuOperator.h" - -namespace arm_compute -{ -namespace cpu -{ -/** Basic function to run @ref kernels::CpuMulKernel */ -class CpuMul : public ICpuOperator -{ -public: - /** Initialise the kernel's inputs, dst and conversion policy. - * - * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported. - * For all other scale values only round to zero (implemented as round towards minus infinity) is supported. - * - * @param[in, out] src1 First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32 - * This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. - * @param[in, out] src2 Second input tensor info. Data types supported: U8, QASYMM8 (only if @p src1 is QASYMM8), QASYMM8_SIGNED (only if @p src1 is QASYMM8_SIGNED), S16, S32, QSYMM16 (only if @p src1 is QSYMM16), F16 (only if @p src1 is F16), F32 (only if @p src1 is F32). - * This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. - * @param[out] dst Destination tensor info. Data types supported: - * - U8, only if both inputs are U8. - * - QASYMM8, only if both inputs are QASYMM8. - * - QASYMM8_SIGNED, only if @p src1 is QASYMM8_SIGNED. - * - S16. - * - QSYMM16, only if both inputs are QSYMM16. - * - S32, only if both inputs are S32 or both are QSYMM16. - * - F16, only if @p src1 is F16. - * - F32, only if both inputs are F32. - * @param[in] scale Scale to apply after multiplication. - * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. - * If both @p src1, @p src2 and @p dst are of datatype S32, scale cannot be 1/255 - * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if any of the inputs is of quantized datatype - * @param[in] rounding_policy Rounding policy. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. - */ - void configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, - const ActivationLayerInfo &act_info = ActivationLayerInfo()); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref CpuMul::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, - const ActivationLayerInfo &act_info = ActivationLayerInfo()); - - // Inherited methods overridden: - void run(ITensorPack &tensors) override; -}; - -/** Basic function to run @ref kernels::CpuComplexMulKernel */ -class CpuComplexMul : public ICpuOperator -{ -public: - /** Initialise the kernel's inputs, dst. - * - * @param[in, out] src1 First input tensor. Data types supported: F32. Number of channels supported: 2 (complex tensor). - * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. - * @param[in, out] src2 Second input tensor. Data types supported: same as @p src1. Number of channels supported: same as @p src1.
- * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. - * @param[out] dst The dst tensor. Data types supported: same as @p src1. Number of channels: same as @p src1. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. - */ - void configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref CpuComplexMul::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); - - // Inherited methods overridden: - void run(ITensorPack &tensors) override; -}; -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_MUL_H */ \ No newline at end of file diff --git a/src/runtime/cpu/operators/CpuPRelu.h b/src/runtime/cpu/operators/CpuPRelu.h deleted file mode 100644 index a6859f95d9..0000000000 --- a/src/runtime/cpu/operators/CpuPRelu.h +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_PRELU_H -#define ARM_COMPUTE_CPU_PRELU_H - -#include "src/runtime/cpu/operators/CpuElementwise.h" - -namespace arm_compute -{ -namespace cpu -{ -/** Class to run @ref cpu::kernels::CpuArithmeticKernel for the PRelu operation */ -using CpuPRelu = CpuElementwiseArithmetic<ArithmeticOperation::PRELU>; -} // namespace cpu -} // namespace arm_compute - -#endif /* ARM_COMPUTE_CPU_PRELU_H */ \ No newline at end of file diff --git a/src/runtime/cpu/operators/CpuPermute.cpp b/src/runtime/cpu/operators/CpuPermute.cpp deleted file mode 100644 index 7fde1e3767..0000000000 --- a/src/runtime/cpu/operators/CpuPermute.cpp +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited.
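A hedged end-to-end sketch of the operator-level API these files implement: configuration happens on ITensorInfo objects, execution on an ITensorPack of backing tensors (the pack idiom mirrors the one used inside the deleted sources; shapes are arbitrary):

    #include "arm_compute/runtime/Tensor.h"
    #include "src/runtime/cpu/operators/CpuMul.h"

    using namespace arm_compute;

    TensorInfo a_info(TensorShape(16U, 16U), 1, DataType::F32);
    TensorInfo b_info(TensorShape(16U, 16U), 1, DataType::F32);
    TensorInfo o_info(TensorShape(16U, 16U), 1, DataType::F32);

    cpu::CpuMul mul;
    mul.configure(&a_info, &b_info, &o_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);

    Tensor a, b, o; // backing memory is bound at run time, not at configure time
    a.allocator()->init(a_info);
    b.allocator()->init(b_info);
    o.allocator()->init(o_info);
    a.allocator()->allocate();
    b.allocator()->allocate();
    o.allocator()->allocate();

    ITensorPack pack = { { TensorType::ACL_SRC_0, &a }, { TensorType::ACL_SRC_1, &b }, { TensorType::ACL_DST, &o } };
    mul.run(pack); // schedules kernels::CpuMulKernel through NEScheduler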
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/cpu/operators/CpuPermute.h" - -#include "src/core/cpu/kernels/CpuPermuteKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -void CpuPermute::configure(const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm) -{ - auto k = std::make_unique<kernels::CpuPermuteKernel>(); - k->configure(src, dst, perm); - _kernel = std::move(k); -} - -Status CpuPermute::validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm) -{ - return kernels::CpuPermuteKernel::validate(src, dst, perm); -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuPermute.h b/src/runtime/cpu/operators/CpuPermute.h deleted file mode 100644 index 2500017c0e..0000000000 --- a/src/runtime/cpu/operators/CpuPermute.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE.
- */ -#ifndef ARM_COMPUTE_CPU_PERMUTE_H -#define ARM_COMPUTE_CPU_PERMUTE_H - -#include "src/runtime/cpu/ICpuOperator.h" - -namespace arm_compute -{ -namespace cpu -{ -/** Basic function to run @ref kernels::CpuPermuteKernel */ -class CpuPermute : public ICpuOperator -{ -public: - /** Configure operator for a given list of arguments - * - * @note Arbitrary permutation vectors are supported with rank not greater than 4 - * - * @param[in] src Source tensor to permute. Data types supported: All - * @param[out] dst Destination tensor. Data types supported: Same as @p src - * @param[in] perm Permutation vector - */ - void configure(const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref CpuPermute::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm); -}; -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_PERMUTE_H */ diff --git a/src/runtime/cpu/operators/CpuPool2d.cpp b/src/runtime/cpu/operators/CpuPool2d.cpp deleted file mode 100644 index e746c8fb3b..0000000000 --- a/src/runtime/cpu/operators/CpuPool2d.cpp +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/cpu/operators/CpuPool2d.h" - -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "src/core/NEON/kernels/NEFillBorderKernel.h" -#include "src/core/cpu/kernels/CpuPool2dKernel.h" -#include "src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h" - -using namespace arm_compute::experimental; - -namespace arm_compute -{ -namespace cpu -{ -CpuPool2d::CpuPool2d() - : _pooling_layer_kernel(), - _border_handler(), - _asm_glue(), - _is_global_pooling_layer(false), - _data_layout(DataLayout::NCHW), - _aux_mem(1) -{ -} - -CpuPool2d::~CpuPool2d() = default; - -void CpuPool2d::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices) -{ - // Check if we can run assembly kernels.
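To make the permutation-vector convention concrete, a short sketch (an assumption-light example following the convention used elsewhere in the library, where NCHW to NHWC is expressed as PermutationVector(2U, 0U, 1U); shapes are in the library's W, H, C storage order):

    #include "arm_compute/core/TensorInfo.h"
    #include "src/runtime/cpu/operators/CpuPermute.h"

    using namespace arm_compute;

    TensorInfo src(TensorShape(224U, 224U, 3U), 1, DataType::F32); // W, H, C (NCHW storage)
    TensorInfo dst(TensorShape(3U, 224U, 224U), 1, DataType::F32); // C, W, H (NHWC storage)

    cpu::CpuPermute permute;
    ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuPermute::validate(&src, &dst, PermutationVector(2U, 0U, 1U)));
    permute.configure(&src, &dst, PermutationVector(2U, 0U, 1U));
    // run() is inherited from ICpuOperator: bind ACL_SRC/ACL_DST in an ITensorPack and call permute.run(pack).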
Currently, indices are not supported by those kernels - const bool run_optimised = bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr); - - // Get data layout - _data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; - - // Check if we have Global Pooling Layer - const unsigned int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); - const unsigned int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - _is_global_pooling_layer = (src->dimension(idx_width) == pool_info.pool_size.width) && (src->dimension(idx_height) == pool_info.pool_size.height); - - if(run_optimised) - { - const CPUInfo &ci = NEScheduler::get().cpu_info(); - const unsigned int num_threads = NEScheduler::get().num_threads(); - - auto pooling_wrapper = std::make_unique<kernels::CpuPool2dAssemblyWrapperKernel>(); - ARM_COMPUTE_ERROR_ON(pooling_wrapper == nullptr); - pooling_wrapper->configure(src, dst, pool_info, ci); - - // Get kernel's memory requirements - constexpr size_t alignment = 4096; - const size_t workspace_size = pooling_wrapper->get_working_size(num_threads); - _aux_mem[0] = MemoryInfo(TensorType::ACL_INT_0, MemoryLifetime::Temporary, workspace_size, alignment); - - _asm_glue = std::move(pooling_wrapper); - } - else - { - // Configure pooling kernel - auto k = std::make_unique<kernels::CpuPool2dKernel>(); - k->configure(src, dst, pool_info, indices); - _pooling_layer_kernel = std::move(k); - - switch(_data_layout) - { - case DataLayout::NCHW: - { - // Configure border depending on operation required (quantize border in case of asymmetric data_type) - BorderMode border_mode = (!indices && pool_info.pool_type == PoolingType::MAX) ? BorderMode::REPLICATE : BorderMode::CONSTANT; - PixelValue zero_value((indices) ? std::numeric_limits<int>::min() : 0.f); - if(is_data_type_quantized_asymmetric(src->data_type()) && !pool_info.exclude_padding) - { - zero_value = PixelValue(0, src->data_type(), src->quantization_info()); - } - auto b = std::make_unique<NEFillBorderKernel>(); - b->configure(src, _pooling_layer_kernel->border_size(), border_mode, zero_value); - _border_handler = std::move(b); - break; - } - case DataLayout::NHWC: - break; - default: - ARM_COMPUTE_ERROR("Data layout not supported"); - } - } -} - -Status CpuPool2d::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) -{ - const bool run_optimised = bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr); - - if(run_optimised) - { - return Status{}; - } - - return kernels::CpuPool2dKernel::validate(src, dst, pool_info, indices); -} - -void CpuPool2d::run(ITensorPack &tensors) -{ - ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No tensors provided"); - - if(_asm_glue) - { - const auto hints = (_is_global_pooling_layer) ? Window::DimX : Window::DimY; - NEScheduler::get().schedule_op(_asm_glue.get(), hints, _asm_glue->window(), tensors); - } - else - { - switch(_data_layout) - { - case DataLayout::NCHW: - // Fill border - NEScheduler::get().schedule_op(_border_handler.get(), Window::DimY, _border_handler->window(), tensors); - - // Run pooling layer - NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), _is_global_pooling_layer ?
Window::DimZ : Window::DimY, _pooling_layer_kernel->window(), tensors); - break; - case DataLayout::NHWC: - // Run pooling layer - NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), Window::DimX, _pooling_layer_kernel->window(), tensors); - break; - default: - ARM_COMPUTE_ERROR("Data layout not supported"); - } - } -} - -experimental::MemoryRequirements CpuPool2d::workspace() const -{ - return _aux_mem; -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuPool2d.h b/src/runtime/cpu/operators/CpuPool2d.h deleted file mode 100644 index 7feff91612..0000000000 --- a/src/runtime/cpu/operators/CpuPool2d.h +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_POOL2D_H -#define ARM_COMPUTE_CPU_POOL2D_H - -#include "arm_compute/core/experimental/Types.h" -#include "src/core/common/Macros.h" -#include "src/runtime/cpu/ICpuOperator.h" - -#include <memory> - -namespace arm_compute -{ -// Forward Declarations -struct PoolingLayerInfo; - -namespace cpu -{ -/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following kernels: - * - * -# @ref NEFillBorderKernel (executed if padding size is different from zero) - * -# @ref kernels::CpuPool2dKernel - * -# @ref kernels::CpuPool2dAssemblyWrapperKernel - */ -class CpuPool2d : public ICpuOperator -{ -public: - CpuPool2d(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPool2d); - ~CpuPool2d(); - /** Set the src and dst tensors. - * - * @note F16 is supported for pool sizes 2 and 3 only - * - * @param[in, out] src Source tensor info. (Written to only when padding != 0) Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[out] dst Destination tensor info. Data types supported: same as @p src. - * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. - * @param[out] indices (optional) The indices of the maximal values. Data type supported: U32.
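The MemoryInfo entry recorded during configuration is what callers consume through workspace(). A hedged sketch of that contract (the slot and size member names are taken from the experimental MemoryRequirements API; alignment handling is omitted for brevity):

    #include "arm_compute/runtime/Tensor.h"
    #include "src/runtime/cpu/operators/CpuPool2d.h"
    #include <memory>
    #include <vector>

    using namespace arm_compute;

    cpu::CpuPool2d pool;
    // ... pool.configure(&src_info, &dst_info, pool_info) as documented below ...

    Tensor src, dst; // initialised/allocated from src_info/dst_info, as in the CpuMul sketch earlier
    std::vector<std::unique_ptr<Tensor>> aux_tensors;
    ITensorPack pack = { { TensorType::ACL_SRC, &src }, { TensorType::ACL_DST, &dst } };
    for(const auto &req : pool.workspace())
    {
        auto t = std::make_unique<Tensor>();
        t->allocator()->init(TensorInfo(TensorShape(req.size), 1, DataType::U8)); // raw byte buffer
        t->allocator()->allocate();
        pack.add_tensor(req.slot, t.get()); // e.g. ACL_INT_0 for the assembly kernel's scratch space
        aux_tensors.emplace_back(std::move(t));
    }
    pool.run(pack);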
- */ - void configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to CpuPool2d::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr); - - // Inherited methods overridden: - void run(ITensorPack &tensors) override; - experimental::MemoryRequirements workspace() const override; - -private: - std::unique_ptr<INEKernel> _pooling_layer_kernel; - std::unique_ptr<INEKernel> _border_handler; - std::unique_ptr<INEKernel> _asm_glue; - - bool _is_global_pooling_layer; - DataLayout _data_layout; - experimental::MemoryRequirements _aux_mem{}; -}; -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_POOL2D_H */ diff --git a/src/runtime/cpu/operators/CpuQuantize.cpp b/src/runtime/cpu/operators/CpuQuantize.cpp deleted file mode 100644 index 5af7f6343b..0000000000 --- a/src/runtime/cpu/operators/CpuQuantize.cpp +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "src/runtime/cpu/operators/CpuQuantize.h" - -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "src/core/cpu/kernels/CpuQuantizeKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -Status CpuQuantize::validate(const ITensorInfo *src, const ITensorInfo *dst) -{ - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuQuantizeKernel::validate(src, dst)); - return Status{}; -} - -void CpuQuantize::configure(const ITensorInfo *src, ITensorInfo *dst) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - - // Configure quantize kernel - auto k = std::make_unique<kernels::CpuQuantizeKernel>(); - k->configure(src, dst); - _kernel = std::move(k); -} - -void CpuQuantize::run(ITensorPack &tensors) -{ - ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); - NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors); -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuQuantize.h b/src/runtime/cpu/operators/CpuQuantize.h deleted file mode 100644 index 9a34a36bcc..0000000000 --- a/src/runtime/cpu/operators/CpuQuantize.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited.
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_QUANTIZE_H -#define ARM_COMPUTE_CPU_QUANTIZE_H - -#include "src/runtime/cpu/ICpuOperator.h" - -namespace arm_compute -{ -namespace cpu -{ -/** Basic function to run @ref kernels::CpuQuantizeKernel that quantizes an input tensor */ -class CpuQuantize : public ICpuOperator -{ -public: - /** Set the input and output tensors. - * - * @param[in] src Source tensor info. The dimensions over the third will be interpreted as batches. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16. - * @param[out] dst Destination tensor info with the same dimensions of input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16 - */ - void configure(const ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref CpuQuantize::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); - - // Inherited methods overridden: - void run(ITensorPack &tensors) override; -}; -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_QUANTIZE_H */ diff --git a/src/runtime/cpu/operators/CpuReshape.cpp b/src/runtime/cpu/operators/CpuReshape.cpp deleted file mode 100644 index 33c9cb87b6..0000000000 --- a/src/runtime/cpu/operators/CpuReshape.cpp +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
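A minimal sketch of quantizing F32 data to QASYMM8 with this operator (the scale and offset values are illustrative, not taken from the patch):

    #include "arm_compute/core/QuantizationInfo.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "src/runtime/cpu/operators/CpuQuantize.h"

    using namespace arm_compute;

    TensorInfo src(TensorShape(256U), 1, DataType::F32);
    TensorInfo dst(TensorShape(256U), 1, DataType::QASYMM8, QuantizationInfo(0.02f, 128)); // q = x/scale + offset, saturated

    cpu::CpuQuantize quantize;
    ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuQuantize::validate(&src, &dst));
    quantize.configure(&src, &dst);
    // Execution follows the usual pattern: ITensorPack{ {ACL_SRC, &s}, {ACL_DST, &d} }, then quantize.run(pack).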
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/cpu/operators/CpuReshape.h" - -#include "src/core/cpu/kernels/CpuReshapeKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -void CpuReshape::configure(const ITensorInfo *src, ITensorInfo *dst) -{ - auto k = std::make_unique<kernels::CpuReshapeKernel>(); - k->configure(src, dst); - _kernel = std::move(k); -} - -Status CpuReshape::validate(const ITensorInfo *src, const ITensorInfo *dst) -{ - return kernels::CpuReshapeKernel::validate(src, dst); -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuReshape.h b/src/runtime/cpu/operators/CpuReshape.h deleted file mode 100644 index 581b55e0ef..0000000000 --- a/src/runtime/cpu/operators/CpuReshape.h +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_RESHAPE_H -#define ARM_COMPUTE_CPU_RESHAPE_H - -#include "src/runtime/cpu/ICpuOperator.h" - -namespace arm_compute -{ -namespace cpu -{ -/** Basic function to run @ref kernels::CpuReshapeKernel */ -class CpuReshape : public ICpuOperator -{ -public: - /** Configure operator for a given list of arguments - * - * @param[in] src Source tensor info. Data type supported: All - * @param[out] dst Destination tensor info. Data type supported: Same as @p src - */ - void configure(const ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref CpuReshape::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); -}; -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_RESHAPE_H */ diff --git a/src/runtime/cpu/operators/CpuScale.cpp b/src/runtime/cpu/operators/CpuScale.cpp deleted file mode 100644 index 475cb2d4e8..0000000000 --- a/src/runtime/cpu/operators/CpuScale.cpp +++ /dev/null @@ -1,250 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited.
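The validate-before-configure idiom the static functions above support, in a minimal sketch (element counts must match for a reshape; the bool(Status) conversion mirrors the usage inside CpuPool2d.cpp earlier in this patch):

    #include "arm_compute/core/TensorInfo.h"
    #include "src/runtime/cpu/operators/CpuReshape.h"

    using namespace arm_compute;

    TensorInfo src(TensorShape(8U, 8U), 1, DataType::F32);
    TensorInfo dst(TensorShape(64U), 1, DataType::F32); // same 64 elements, flattened

    cpu::CpuReshape reshape;
    if(bool(cpu::CpuReshape::validate(&src, &dst)))
    {
        reshape.configure(&src, &dst); // then run() with an ACL_SRC/ACL_DST pack as usual
    }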
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/cpu/operators/CpuScale.h" - -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "src/core/cpu/kernels/CpuScaleKernel.h" -#include "src/core/utils/ScaleUtils.h" -#include "support/Rounding.h" - -namespace arm_compute -{ -namespace cpu -{ -namespace -{ -void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float wr, float hr, SamplingPolicy sampling_policy, bool align_corners) -{ - ARM_COMPUTE_ERROR_ON(offsets == nullptr); - float sampling_offset = 0.0f; - if(sampling_policy == SamplingPolicy::CENTER) - { - sampling_offset = 0.5f; - } - - Window win; - win.set(Window::DimX, Window::Dimension(0, offsets->info()->dimension(0), 1)); - win.set(Window::DimY, Window::Dimension(0, offsets->info()->dimension(1), 1)); - - if(dx != nullptr && dy != nullptr) - { - // Pre-compute the offset and pixel's distance for BILINEAR interpolation - Iterator offsets_it(offsets, win); - Iterator dx_it(dx, win); - Iterator dy_it(dy, win); - - execute_window_loop(win, [&](const Coordinates & id) - { - const float in_x = (id.x() + sampling_offset) * wr - sampling_offset; - const float in_y = (id.y() + sampling_offset) * hr - sampling_offset; - const int in_xi = std::floor(in_x); - const int in_yi = std::floor(in_y); - - *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi; - *reinterpret_cast<float *>(dx_it.ptr()) = in_x - in_xi; - *reinterpret_cast<float *>(dy_it.ptr()) = in_y - in_yi; - }, - offsets_it, dx_it, dy_it); - } - else - { - // Pre-compute the offset for NEAREST interpolation - Iterator offsets_it(offsets, win); - - execute_window_loop(win, [&](const Coordinates & id) - { - const float float_in_xi = (id.x() + sampling_offset) * wr; - const auto in_xi = static_cast<int32_t>(align_corners ? arm_compute::utils::rounding::round_half_away_from_zero(float_in_xi) : std::floor(float_in_xi)); - *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi; - }, - offsets_it); - } -} -} // namespace - -void CpuScale::configure(ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(CpuScale::validate(src, dst, info)); - - _scale_info = info; - _is_prepared = false; - - // Get data layout and width/height indices - _data_layout = _scale_info.data_layout == DataLayout::UNKNOWN ?
src->data_layout() : _scale_info.data_layout; - const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - - // Compute the ratio between source width/height and destination width/height - const bool is_align_corners_used = _scale_info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(_scale_info.sampling_policy); - const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_width), dst->dimension(idx_width), is_align_corners_used); - const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_height), dst->dimension(idx_height), is_align_corners_used); - - // Area interpolation behaves as Nearest Neighbour in case of up-sampling - InterpolationPolicy policy_to_use = (_scale_info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f - && hr <= 1.f) ? - InterpolationPolicy::NEAREST_NEIGHBOR : - _scale_info.interpolation_policy; - - // Get the tensor shape - TensorShape shape(dst->dimension(idx_width)); - shape.set(1, dst->dimension(idx_height), false); - - TensorInfo tensor_info_offsets(shape, Format::S32); - TensorInfo tensor_info_dxdy(shape, Format::F32); - - auto dx = std::make_unique<TensorInfo>(tensor_info_dxdy); - auto dy = std::make_unique<TensorInfo>(tensor_info_dxdy); - auto offsets = std::make_unique<TensorInfo>(tensor_info_offsets); - auto scale_kernel = std::make_unique<kernels::CpuScaleKernel>(); - switch(policy_to_use) - { - case InterpolationPolicy::NEAREST_NEIGHBOR: - { - scale_kernel->configure(src, nullptr, nullptr, offsets.get(), dst, info); - break; - } - case InterpolationPolicy::BILINEAR: - { - scale_kernel->configure(src, dx.get(), dy.get(), offsets.get(), dst, info); - break; - } - case InterpolationPolicy::AREA: - { - scale_kernel->configure(src, nullptr, nullptr, nullptr, dst, info); - break; - } - default: - ARM_COMPUTE_ERROR("Unsupported interpolation mode"); - } - _kernel = std::move(scale_kernel); -} - -Status CpuScale::validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON(info.sampling_policy != SamplingPolicy::CENTER && info.sampling_policy != SamplingPolicy::TOP_LEFT); - - ITensorInfo *offsets = nullptr; - ITensorInfo *dx = nullptr; - ITensorInfo *dy = nullptr; - - // Get data layout and width/height indices - const DataLayout data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout; - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - - // Compute the ratio between source width/height and destination width/height - const bool is_align_corners_used = info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy); - const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_width), dst->dimension(idx_width), is_align_corners_used); - const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_height), dst->dimension(idx_height), is_align_corners_used); - - // Area interpolation behaves as Nearest Neighbour in case of up-sampling - InterpolationPolicy policy_to_use = (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ?
InterpolationPolicy::NEAREST_NEIGHBOR : info.interpolation_policy; - - // Get the tensor shape of auxiliary buffers - const TensorShape shape(dst->dimension(idx_width), dst->dimension(idx_height)); - TensorInfo tensor_info_offsets(shape, Format::S32); - TensorInfo tensor_info_dx(shape, Format::F32); - TensorInfo tensor_info_dy(shape, Format::F32); - switch(policy_to_use) - { - case InterpolationPolicy::NEAREST_NEIGHBOR: - offsets = &tensor_info_offsets; - break; - case InterpolationPolicy::BILINEAR: - offsets = &tensor_info_offsets; - dx = &tensor_info_dx; - dy = &tensor_info_dy; - break; - default: - break; - } - - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuScaleKernel::validate(src->clone().get(), dx, dy, offsets, dst->clone().get(), info)); - return Status{}; -} - -void CpuScale::prepare(ITensorPack &tensors) -{ - if(!_is_prepared) - { - _is_prepared = true; - const auto src = tensors.get_const_tensor(TensorType::ACL_SRC); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - auto dx = tensors.get_tensor(TensorType::ACL_INT_0); - auto dy = tensors.get_tensor(TensorType::ACL_INT_1); - auto offsets = tensors.get_tensor(TensorType::ACL_INT_2); - - // Get data layout and width/height indices - const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - - // Compute the ratio between source width/height and destination width/height - const bool is_align_corners_used = _scale_info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(_scale_info.sampling_policy); - const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->info()->dimension(idx_width), dst->info()->dimension(idx_width), is_align_corners_used); - const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->info()->dimension(idx_height), dst->info()->dimension(idx_height), is_align_corners_used); - - // Area interpolation behaves as Nearest Neighbour in case of up-sampling - InterpolationPolicy policy_to_use = (_scale_info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f - && hr <= 1.f) ? - InterpolationPolicy::NEAREST_NEIGHBOR : - _scale_info.interpolation_policy; - const SamplingPolicy sampling_policy = _scale_info.sampling_policy; - - switch(policy_to_use) - { - case InterpolationPolicy::NEAREST_NEIGHBOR: - { - // Pre-compute offsets for nearest interpolation - precompute_dx_dy_offsets(nullptr, nullptr, offsets, wr, hr, sampling_policy, is_align_corners_used); - break; - } - case InterpolationPolicy::BILINEAR: - { - // Pre-compute dx, dy and offsets for bilinear interpolation - precompute_dx_dy_offsets(dx, dy, offsets, wr, hr, sampling_policy, is_align_corners_used); - break; - } - case InterpolationPolicy::AREA: - { - break; - } - default: - ARM_COMPUTE_ERROR("Unsupported interpolation mode"); - } - } -} - -void CpuScale::run(ITensorPack &tensors) -{ - ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); - prepare(tensors); - NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors); -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuScale.h b/src/runtime/cpu/operators/CpuScale.h deleted file mode 100644 index b83e04bc42..0000000000 --- a/src/runtime/cpu/operators/CpuScale.h +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited.
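To make the precomputation above concrete, a worked instance of the bilinear case (assumed values: a 4-to-8 upscale along x with CENTER sampling, so the sampling offset is 0.5):

    #include <cmath>

    // wr = 4/8 = 0.5 (src/dst ratio, as produced by calculate_resize_ratio without align-corners)
    const float wr   = 0.5f;
    const float in_x = (3 + 0.5f) * wr - 0.5f;              // dst column 3 maps to src x = 1.25
    const int   xi   = static_cast<int>(std::floor(in_x));  // offsets entry: 1 (left neighbour)
    const float dx   = in_x - xi;                           // dx entry: 0.25, the weight of the x+1 neighbour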
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_SCALE_H -#define ARM_COMPUTE_CPU_SCALE_H - -#include "arm_compute/core/ITensorInfo.h" -#include "arm_compute/core/KernelDescriptors.h" -#include "arm_compute/core/experimental/Types.h" -#include "src/core/cpu/ICpuKernel.h" -#include "src/runtime/cpu/ICpuOperator.h" - -#include <memory> - -namespace arm_compute -{ -namespace cpu -{ -/** Basic function to compute Scale */ -class CpuScale : public ICpuOperator -{ -public: - /** Initialize the function's source, destination, interpolation type and border_mode. - * - * @param[in, out] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED) - * @param[out] dst Destination tensor info. Data type supported: Same as @p src. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane. - * @param[in] info @ref ScaleKernelInfo to be used for configuration - */ - void configure(ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref CpuScale::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info); - - // Inherited methods overridden: - void prepare(ITensorPack &tensors) override; - void run(ITensorPack &tensors) override; - -private: - ScaleKernelInfo _scale_info{ InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::UNDEFINED }; - DataLayout _data_layout{ DataLayout::UNKNOWN }; - bool _is_prepared{ false }; -}; -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_SCALE_H */ diff --git a/src/runtime/cpu/operators/CpuSoftmax.cpp b/src/runtime/cpu/operators/CpuSoftmax.cpp deleted file mode 100644 index abbc539b19..0000000000 --- a/src/runtime/cpu/operators/CpuSoftmax.cpp +++ /dev/null @@ -1,221 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited.
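A hedged configuration sketch for the operator declared above (ScaleKernelInfo is brace-initialised with just the interpolation policy and border mode, matching the member initialiser in the header; the remaining fields keep their defaults):

    #include "arm_compute/core/KernelDescriptors.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "src/runtime/cpu/operators/CpuScale.h"

    using namespace arm_compute;

    TensorInfo src(TensorShape(32U, 32U, 3U), 1, DataType::F32);
    TensorInfo dst(TensorShape(64U, 64U, 3U), 1, DataType::F32); // scaling only within the XY-plane

    cpu::CpuScale scale;
    ScaleKernelInfo info{ InterpolationPolicy::BILINEAR, BorderMode::REPLICATE };
    ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuScale::validate(&src, &dst, info));
    scale.configure(&src, &dst, info);
    // run(tensors) calls prepare(tensors) internally, which fills the dx/dy/offsets
    // lookup tables (slots ACL_INT_0..2) before scheduling the kernel.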
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/cpu/operators/CpuSoftmax.h" - -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "src/core/cpu/kernels/CpuSoftmaxKernel.h" -#include "src/core/helpers/MemoryHelpers.h" -#include "src/core/helpers/SoftmaxHelpers.h" -#include "src/runtime/cpu/utils/CpuAuxTensorHandler.h" - -using namespace arm_compute::experimental; - -namespace arm_compute -{ -namespace cpu -{ -template <bool IS_LOG> -CpuSoftmaxGeneric<IS_LOG>::CpuSoftmaxGeneric() - : _permute_input(), - _permute_output(), - _max_kernel(), - _softmax_kernel(), - _max(), - _tmp(), - _input_permuted(), - _output_permuted(), - _needs_permute(false), - _aux_mem(InternalTensorIdx::COUNT) -{ -} - -template <bool IS_LOG> -void CpuSoftmaxGeneric<IS_LOG>::configure(const ITensorInfo *src, ITensorInfo *dst, float beta, int32_t axis) -{ - // Perform validation step - ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_ERROR_THROW_ON(CpuSoftmaxGeneric::validate(src, dst, beta, axis)); - - const unsigned int actual_axis = static_cast<unsigned int>(wrap_around(axis, static_cast<int32_t>(src->num_dimensions()))); - - _needs_permute = actual_axis > 0; - - if(_needs_permute) - { - _permute_input.configure(src, &_input_permuted, softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis)); - } - - // We want to deal with a 2D input. Either it is the permuted version of the original input (4D case) - // or it is the original input case (2D case) - const ITensorInfo *tmp_input = (_needs_permute ? &_input_permuted : src); - - // Create intermediate tensors shapes - TensorShape max_sum_shape = tmp_input->tensor_shape(); - max_sum_shape.set(0, 1); - const TensorInfo input_info = tmp_input->clone()->reset_padding().set_is_resizable(true); - DataType tmp_data_type = is_data_type_quantized_asymmetric(tmp_input->data_type()) ?
DataType::F32 : tmp_input->data_type(); - TensorInfo tensor_info_tmp(input_info.clone()->set_data_type(tmp_data_type)); - TensorInfo max_info(tmp_input->clone()->set_tensor_shape(max_sum_shape)); - - // Init intermediate tensors - _max = TensorInfo(max_info); - _tmp = TensorInfo(tensor_info_tmp); - - // Configure kernels - auto mk = std::make_unique<kernels::CpuLogits1DMaxKernel>(); - mk->configure(tmp_input, &_max); - _max_kernel = std::move(mk); - - auto sm = std::make_unique<kernels::CpuLogits1DSoftmaxKernel<IS_LOG>>(); - if(_needs_permute) - { - // The normalization kernel stores the result in a permuted output tensor - sm->configure(tmp_input, &_max, &_output_permuted, beta, &_tmp); - - // Re-permute the permuted output into the requested (4D) output - _permute_output.configure(&_output_permuted, dst, softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis)); - } - else - { - // Softmax 2D case - sm->configure(tmp_input, &_max, dst, beta, &_tmp); - } - _softmax_kernel = std::move(sm); - - _aux_mem[InternalTensorIdx::MAX] = MemoryInfo(offset_int_vec(InternalTensorIdx::MAX), MemoryLifetime::Temporary, _max.total_size()); - _aux_mem[InternalTensorIdx::TMP] = MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp.total_size()); - - _aux_mem[InternalTensorIdx::PERMUTED_SRC] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), MemoryLifetime::Temporary, _input_permuted.total_size()); - _aux_mem[InternalTensorIdx::PERMUTED_DST] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_DST), MemoryLifetime::Temporary, _output_permuted.total_size()); -} - -template <bool IS_LOG> -Status CpuSoftmaxGeneric<IS_LOG>::validate(const ITensorInfo *src, const ITensorInfo *dst, float beta, int32_t axis) -{ - // Perform validation step - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->num_dimensions() > 4, "Only up to 4 dimensions are supported"); - ARM_COMPUTE_UNUSED(beta); - ARM_COMPUTE_RETURN_ERROR_ON(axis < static_cast<int32_t>(-src->num_dimensions()) || static_cast<int32_t>(src->num_dimensions()) <= axis); - - // Create intermediate tensor info - DataType tmp_data_type = src->data_type(); - const TensorInfo tensor_info_tmp(src->clone()->set_data_type(tmp_data_type).set_is_resizable(true)); - - TensorShape max_sum_shape = src->tensor_shape(); - max_sum_shape.set(0, 1); - const TensorInfo tensor_info_max_sum(src->clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type).set_quantization_info(src->quantization_info()).set_is_resizable(true)); - const TensorInfo dont_care; - - const unsigned int actual_axis = static_cast<unsigned int>(wrap_around(axis, static_cast<int32_t>(src->num_dimensions()))); - - const bool needs_permute = actual_axis > 0; - - if(needs_permute) - { - const PermutationVector permutation_vector = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis); - const TensorShape permuted_shape = misc::shape_calculator::compute_permutation_output_shape(*src, permutation_vector); - TensorInfo input_permuted(src->clone()->set_tensor_shape(permuted_shape)); - ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(src, &input_permuted, permutation_vector)); - TensorInfo output_permuted(dst->clone()->set_tensor_shape(permuted_shape)); - ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(&output_permuted, dst, permutation_vector)); - } - - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuLogits1DMaxKernel::validate(src, &tensor_info_max_sum)); - ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuLogits1DSoftmaxKernel<IS_LOG>::validate(&tensor_info_tmp, &tensor_info_max_sum, dst, beta, &dont_care)); - - return Status{}; -} - -template <bool IS_LOG> -void
CpuSoftmaxGeneric<IS_LOG>::run(ITensorPack &tensors) -{ - ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); - - auto src = tensors.get_const_tensor(TensorType::ACL_SRC); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - CpuAuxTensorHandler tmp(offset_int_vec(InternalTensorIdx::TMP), _tmp, tensors, true); - CpuAuxTensorHandler max(offset_int_vec(InternalTensorIdx::MAX), _max, tensors, true); - - CpuAuxTensorHandler input_permuted(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), _input_permuted, tensors, true); - CpuAuxTensorHandler output_permuted(offset_int_vec(InternalTensorIdx::PERMUTED_DST), _output_permuted, tensors, true); - - ITensorPack max_pack; - ITensorPack softmax_pack; - - if(_needs_permute) - { - ITensorPack permute_in_pack = { { TensorType::ACL_SRC, src }, { TensorType::ACL_DST, input_permuted.get() } }; - _permute_input.run(permute_in_pack); - - max_pack = { { TensorType::ACL_SRC, input_permuted.get() }, { TensorType::ACL_DST, max.get() } }; - - softmax_pack = - { - { TensorType::ACL_SRC_0, input_permuted.get() }, - { TensorType::ACL_SRC_1, max.get() }, - { TensorType::ACL_DST_0, output_permuted.get() }, - { TensorType::ACL_DST_1, tmp.get() } - }; - } - else - { - max_pack = { { TensorType::ACL_SRC, src }, { TensorType::ACL_DST, max.get() } }; - - softmax_pack = - { - { TensorType::ACL_SRC_0, src }, - { TensorType::ACL_SRC_1, max.get() }, - { TensorType::ACL_DST_0, dst }, - { TensorType::ACL_DST_1, tmp.get() } - }; - } - - NEScheduler::get().schedule_op(_max_kernel.get(), Window::DimY, _max_kernel->window(), max_pack); - NEScheduler::get().schedule_op(_softmax_kernel.get(), Window::DimY, _softmax_kernel->window(), softmax_pack); - - if(_needs_permute) - { - ITensorPack permute_out_pack; - permute_out_pack.add_tensor(TensorType::ACL_SRC, output_permuted.get()); - permute_out_pack.add_tensor(TensorType::ACL_DST, dst); - _permute_output.run(permute_out_pack); - } -} - -template <bool IS_LOG> -experimental::MemoryRequirements CpuSoftmaxGeneric<IS_LOG>::workspace() const -{ - return _aux_mem; -} - -template class CpuSoftmaxGeneric<false>; -template class CpuSoftmaxGeneric<true>; -} // namespace cpu -} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuSoftmax.h b/src/runtime/cpu/operators/CpuSoftmax.h deleted file mode 100644 index a9ac803c09..0000000000 --- a/src/runtime/cpu/operators/CpuSoftmax.h +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE.
- */ -#ifndef ARM_COMPUTE_CPU_SOFTMAX_H -#define ARM_COMPUTE_CPU_SOFTMAX_H - -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/experimental/Types.h" -#include "src/core/cpu/ICpuKernel.h" -#include "src/runtime/cpu/ICpuOperator.h" -#include "src/runtime/cpu/operators/CpuPermute.h" -#include <memory> - -namespace arm_compute -{ -namespace cpu -{ -class CpuLogits1DMaxKernel; -template <bool IS_LOG> -class CpuLogits1DSoftmaxKernel; - -/** Basic function to compute a SoftmaxLayer and a Log SoftmaxLayer. - * - * Softmax is calculated by : - * @f[ out = exp((x - max(x)) * beta) / sum(exp((x - max(x)) * beta)) @f] - * - * Log Softmax is calculated by : - * @f[ out = (x - max(x) * beta) - log(\sum{e^{x - max(x) * beta}}) @f] - * - * This function runs the following function/kernels: - * -# If axis is not 0: - * -# @ref CpuPermute - * -# @ref kernels::CpuLogits1DMaxKernel - * -# @ref kernels::CpuLogits1DSoftmaxKernel - */ -template <bool IS_LOG> -class CpuSoftmaxGeneric : public ICpuOperator -{ -public: - CpuSoftmaxGeneric(); - /** Set the input and output tensors. - * - * @param[in,out] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. - * @param[out] dst Destination tensor info. Data types supported: same as @p src. - * @param[in] beta (Optional) A scaling factor for the exponent. - * @param[in] axis (Optional) The dimension in which to apply the function. E.g. for input of shape 4x5x6 and - * axis=1, softmax will be applied to 4x6=24 vectors of size 5. Defaults to 0 - */ - void configure(const ITensorInfo *src, ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref CpuSoftmaxGeneric::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0); - - // Inherited methods overridden: - void run(ITensorPack &tensors) override; - experimental::MemoryRequirements workspace() const override; - -private: - enum InternalTensorIdx - { - MAX = 0, - TMP, - PERMUTED_SRC, - PERMUTED_DST, - COUNT - }; - - CpuPermute _permute_input; - CpuPermute _permute_output; - std::unique_ptr<ICpuKernel> _max_kernel; - std::unique_ptr<ICpuKernel> _softmax_kernel; - - TensorInfo _max; - TensorInfo _tmp; - TensorInfo _input_permuted; - TensorInfo _output_permuted; - - bool _needs_permute; - experimental::MemoryRequirements _aux_mem{}; -}; -using CpuSoftmax = CpuSoftmaxGeneric<false>; -using CpuLogSoftmax = CpuSoftmaxGeneric<true>; - -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_SOFTMAX_H */ diff --git a/src/runtime/cpu/operators/CpuSub.cpp b/src/runtime/cpu/operators/CpuSub.cpp deleted file mode 100644 index 9baaaa9d67..0000000000 --- a/src/runtime/cpu/operators/CpuSub.cpp +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software.
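A short sketch of the axis convention documented above (a hedged example; CpuSoftmax is the alias declared in this header):

    #include "arm_compute/core/TensorInfo.h"
    #include "src/runtime/cpu/operators/CpuSoftmax.h"

    using namespace arm_compute;

    TensorInfo src(TensorShape(4U, 5U, 6U), 1, DataType::F32);
    TensorInfo dst(src);

    cpu::CpuSoftmax softmax;
    ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuSoftmax::validate(&src, &dst, 1.0f /* beta */, 1 /* axis */));
    softmax.configure(&src, &dst, 1.0f, 1); // axis=1: softmax over 4x6=24 vectors of length 5
    // With axis != 0, run() permutes into a scratch tensor, reduces and normalises along
    // dimension 0, then permutes back, hence the four workspace() entries.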
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/cpu/operators/CpuSub.h" - -#include "src/core/cpu/kernels/CpuSubKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -void CpuSub::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_UNUSED(act_info); - auto k = std::make_unique(); - k->configure(src0, src1, dst, policy); - _kernel = std::move(k); -} - -Status CpuSub::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); - return kernels::CpuSubKernel::validate(src0, src1, dst, policy); -} -} // namespace cpu -} // namespace arm_compute \ No newline at end of file diff --git a/src/runtime/cpu/operators/CpuSub.h b/src/runtime/cpu/operators/CpuSub.h deleted file mode 100644 index 07f5be89cd..0000000000 --- a/src/runtime/cpu/operators/CpuSub.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_SUB_H -#define ARM_COMPUTE_CPU_SUB_H - -#include "src/runtime/cpu/ICpuOperator.h" - -namespace arm_compute -{ -namespace cpu -{ -/** Basic function to run @ref kernels::CpuSubKernel */ -class CpuSub : public ICpuOperator -{ -public: - /** Initialise the kernel's inputs, dst and conversion policy. - * - * Valid configurations (src0,src1) -> dst : - * - * - (U8,U8) -> U8 - * - (QASYMM8, QASYMM8) -> QASYMM8 - * - (QASYMM8_SIGNED, QASYMM8_SIGNED) -> QASYMM8_SIGNED - * - (S16,S16) -> S16 - * - (S32,S32) -> S32 - * - (F16,F16) -> F16 - * - (F32,F32) -> F32 - * - * @param[in] src0 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 - * @param[in] src1 Second tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 - * @param[out] dst Output tensor info. 
Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 - * @param[in] policy Policy to use to handle overflow. Convert policy cannot be WRAP if datatype is quantized. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. - */ - void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref CpuSub::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); -}; -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_SUB_H */ \ No newline at end of file diff --git a/src/runtime/cpu/operators/CpuTranspose.cpp b/src/runtime/cpu/operators/CpuTranspose.cpp deleted file mode 100644 index 51eeb90b8b..0000000000 --- a/src/runtime/cpu/operators/CpuTranspose.cpp +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/runtime/cpu/operators/CpuTranspose.h" - -#include "src/core/cpu/kernels/CpuTransposeKernel.h" - -namespace arm_compute -{ -namespace cpu -{ -void CpuTranspose::configure(const ITensorInfo *src, ITensorInfo *dst) -{ - auto k = std::make_unique(); - k->configure(src, dst); - _kernel = std::move(k); -} - -Status CpuTranspose::validate(const ITensorInfo *src, const ITensorInfo *dst) -{ - return kernels::CpuTransposeKernel::validate(src, dst); -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/runtime/cpu/operators/CpuTranspose.h b/src/runtime/cpu/operators/CpuTranspose.h deleted file mode 100644 index 0735924839..0000000000 --- a/src/runtime/cpu/operators/CpuTranspose.h +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited.
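To make the ConvertPolicy note above concrete, a standalone sketch contrasting WRAP and SATURATE for S16 subtraction (illustrative scalar code, not the NEON kernel):

#include <cstdint>
#include <cstdio>
#include <limits>

// WRAP: the result overflows modulo 2^16 (computed in unsigned arithmetic).
int16_t sub_wrap(int16_t a, int16_t b)
{
    return static_cast<int16_t>(static_cast<uint16_t>(a) - static_cast<uint16_t>(b));
}

// SATURATE: the result is clamped to the representable S16 range.
int16_t sub_saturate(int16_t a, int16_t b)
{
    const int32_t r  = static_cast<int32_t>(a) - static_cast<int32_t>(b);
    const int32_t lo = std::numeric_limits<int16_t>::min();
    const int32_t hi = std::numeric_limits<int16_t>::max();
    return static_cast<int16_t>(r < lo ? lo : (r > hi ? hi : r));
}

int main()
{
    // -32768 - 1 wraps around to 32767 but saturates to -32768.
    std::printf("wrap=%d saturate=%d\n", sub_wrap(-32768, 1), sub_saturate(-32768, 1));
    return 0;
}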
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_TRANSPOSE_H -#define ARM_COMPUTE_CPU_TRANSPOSE_H - -#include "src/runtime/cpu/ICpuOperator.h" - -namespace arm_compute -{ -namespace cpu -{ -/** Basic function to run @ref kernels::CpuTransposeKernel */ -class CpuTranspose : public ICpuOperator -{ -public: - /** Configure operator for a given list of arguments - * - * @param[in] src Source tensor to permute. Data types supported: All - * @param[out] dst Destination tensor. Data types supported: Same as @p src - */ - void configure(const ITensorInfo *src, ITensorInfo *dst); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref CpuTranspose::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst); -}; -} // namespace cpu -} // namespace arm_compute -#endif /* ARM_COMPUTE_CPU_TRANSPOSE_H */ diff --git a/src/runtime/cpu/operators/CpuWinogradConv2d.cpp b/src/runtime/cpu/operators/CpuWinogradConv2d.cpp deleted file mode 100644 index 253280a951..0000000000 --- a/src/runtime/cpu/operators/CpuWinogradConv2d.cpp +++ /dev/null @@ -1,839 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE.
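The transpose implemented by the kernel pair above reduces to a single index swap; a naive reference (the production kernel is blocked and vectorised, but computes the same mapping):

#include <cstdio>
#include <vector>

// Reference transpose of a row-major w x h matrix: dst(x, y) = src(y, x).
std::vector<float> transpose(const std::vector<float> &src, int w, int h)
{
    std::vector<float> dst(src.size());
    for(int y = 0; y < h; ++y)
    {
        for(int x = 0; x < w; ++x)
        {
            dst[x * h + y] = src[y * w + x];
        }
    }
    return dst;
}

int main()
{
    // 2x3 input {{1,2,3},{4,5,6}} becomes 3x2 {{1,4},{2,5},{3,6}}.
    for(float v : transpose({ 1.f, 2.f, 3.f, 4.f, 5.f, 6.f }, 3, 2))
    {
        std::printf("%g ", v); // 1 4 2 5 3 6
    }
    std::printf("\n");
    return 0;
}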
- */ -#include "src/runtime/cpu/operators/CpuWinogradConv2d.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" -#include "arm_compute/runtime/FunctionDescriptors.h" -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/kernels/convolution/common/utils.hpp" -#include "src/core/NEON/kernels/convolution/winograd/winograd.hpp" -#include "src/core/cpu/kernels/CpuWinogradConv2dKernel.h" -#include "src/core/helpers/MemoryHelpers.h" -#include "src/runtime/cpu/operators/CpuActivation.h" -#include "src/runtime/cpu/operators/CpuPermute.h" -#include "src/runtime/cpu/operators/CpuWinogradConv2d.h" -#include "src/runtime/cpu/utils/CpuAuxTensorHandler.h" - -#include "support/Cast.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -using namespace arm_compute::experimental; -using namespace arm_compute::utils::cast; - -namespace -{ -arm_gemm::Activation arm_gemm_activation_from_acl_activation(const ActivationLayerInfo &act_info) -{ - switch(act_info.activation()) - { - case ActivationLayerInfo::ActivationFunction::RELU: - { - return arm_gemm::Activation(arm_gemm::Activation::Type::ReLU, act_info.a(), act_info.b()); - } - case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: - { - return arm_gemm::Activation(arm_gemm::Activation::Type::BoundedReLU, act_info.a(), act_info.b()); - } - default: - { - return arm_gemm::Activation(arm_gemm::Activation::Type::None); - } - } -} - -inline Status validate_kernel_3x3(const Size2D input_dims, const ITensorInfo *src, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output, - const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); - - if(src->data_type() == DataType::F32) - { - if(input_dims.width > 4 && input_dims.height > 4) - { - ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformInputKernel::validate(src, input0, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformWeightsKernel::validate(weights, input1, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformOutputKernel::validate(batched_mm_output, biases, dst, winograd_info))); - } - else - { - ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformInputKernel::validate(src, input0, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformWeightsKernel::validate(weights, input1, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformOutputKernel::validate(batched_mm_output, biases, dst, winograd_info))); - } - } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - else if(src->data_type() == DataType::F16) - { - ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformInputKernel<__fp16, 4, 4, 3, 3>::validate(src, input0, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformWeightsKernel<__fp16, 4, 4, 3, 3>::validate(weights, input1, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformOutputKernel<__fp16, 4, 4, 3, 3>::validate(batched_mm_output, biases, dst, winograd_info))); - } -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - - if(act_info.enabled()) - { - 
CpuActivation::validate(dst, nullptr, act_info); - } - return Status{}; -} - -inline Status validate_kernel_5x5(const ITensorInfo *src, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output, - const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformInputKernel::validate(src, input0, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformWeightsKernel::validate(weights, input1, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformOutputKernel::validate(batched_mm_output, biases, dst, winograd_info))); - if(act_info.enabled()) - { - CpuActivation::validate(dst, nullptr, act_info); - } - return Status{}; -} - -inline Status validate_kernel_3x1(const ITensorInfo *src, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output, - const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32); - ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformInputKernel::validate(src, input0, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformWeightsKernel::validate(weights, input1, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformOutputKernel::validate(batched_mm_output, biases, dst, winograd_info))); - if(act_info.enabled()) - { - CpuActivation::validate(dst, nullptr, act_info); - } - return Status{}; -} - -inline Status validate_kernel_1x3(const ITensorInfo *src, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output, - const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32); - ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformInputKernel::validate(src, input0, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformWeightsKernel::validate(weights, input1, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformOutputKernel::validate(batched_mm_output, biases, dst, winograd_info))); - - if(act_info.enabled()) - { - CpuActivation::validate(dst, nullptr, act_info); - } - return Status{}; -} - -inline Status validate_kernel_5x1(const ITensorInfo *src, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output, - const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32); - ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformInputKernel::validate(src, input0, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformWeightsKernel::validate(weights, input1, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformOutputKernel::validate(batched_mm_output, biases, dst, winograd_info))); - if(act_info.enabled()) - { - CpuActivation::validate(dst, nullptr, act_info); - } - return Status{}; -} -inline Status validate_kernel_1x5(const ITensorInfo *src, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output, - const ITensorInfo 
*weights, const ITensorInfo *biases, const ITensorInfo *dst, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32); - ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformInputKernel::validate(src, input0, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformWeightsKernel::validate(weights, input1, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformOutputKernel::validate(batched_mm_output, biases, dst, winograd_info))); - if(act_info.enabled()) - { - CpuActivation::validate(dst, nullptr, act_info); - } - return Status{}; -} - -inline Status validate_kernel_7x1(const ITensorInfo *src, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output, - const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32); - ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformInputKernel::validate(src, input0, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformWeightsKernel::validate(weights, input1, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformOutputKernel::validate(batched_mm_output, biases, dst, winograd_info))); - if(act_info.enabled()) - { - CpuActivation::validate(dst, nullptr, act_info); - } - return Status{}; -} - -inline Status validate_kernel_1x7(const ITensorInfo *src, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output, - const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32); - ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformInputKernel::validate(src, input0, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformWeightsKernel::validate(weights, input1, winograd_info))); - ARM_COMPUTE_RETURN_ON_ERROR((CpuWinogradConv2dTransformOutputKernel::validate(batched_mm_output, biases, dst, winograd_info))); - - if(act_info.enabled()) - { - CpuActivation::validate(dst, nullptr, act_info); - } - return Status{}; -} - -inline Tensor4DShape internal_get_input_shape(const ITensorInfo *src) -{ - const DataLayout data_layout = src->data_layout(); - const int in_width = src->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH)); - const int in_height = src->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT)); - const int in_channels = src->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL)); - const int in_batches = src->dimension(3); - - return Tensor4DShape{ in_batches, in_height, in_width, in_channels }; -} - -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info) -{ - ARM_COMPUTE_UNUSED(dst); - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd layer only supports unit strides."); - if(biases != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases); - ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); - } - return 
ICpuWinogradConv2dTransformWeightsKernel::validate(src, weights); -} -Size2D winograd_output_tile(const Size2D &input_dims, const Size2D &kernel_dims, DataType data_type) -{ - Size2D output_tile = Size2D{}; - if(kernel_dims == Size2D(3U, 3U)) - { - output_tile = (input_dims.width <= 4 || input_dims.height <= 4) ? Size2D(2U, 2U) : Size2D(4U, 4U); - if(data_type == DataType::F16) - { - output_tile = Size2D(4U, 4U); - } - } - else if(kernel_dims == Size2D(5U, 5U)) - { - output_tile = Size2D(2U, 2U); - } - else if(kernel_dims == Size2D(1U, 3U)) - { - output_tile = Size2D(1U, 6U); - } - else if(kernel_dims == Size2D(3U, 1U)) - { - output_tile = Size2D(6U, 1U); - } - else if(kernel_dims == Size2D(1U, 5U)) - { - output_tile = Size2D(1U, 4U); - } - else if(kernel_dims == Size2D(5U, 1U)) - { - output_tile = Size2D(4U, 1U); - } - else if(kernel_dims == Size2D(7U, 1U)) - { - output_tile = Size2D(2U, 1U); - } - else if(kernel_dims == Size2D(1U, 7U)) - { - output_tile = Size2D(1U, 2U); - } - return output_tile; -} - -bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_size, DataType data_type) -{ - // Check if we want to configure a Winograd configuration which requires fast math - using WinogradConfiguration = std::pair, std::pair>; - - const std::vector fast_math_winograd_f16 = - { - WinogradConfiguration(std::pair(4, 4), std::pair(3, 3)) - }; - - const std::vector fast_math_winograd_f32 = - { - WinogradConfiguration(std::pair(2, 2), std::pair(5, 5)), - WinogradConfiguration(std::pair(4, 4), std::pair(5, 5)) - }; - - auto p = std::make_pair(std::pair(output_tile.width, output_tile.height), - std::pair(kernel_size.width, kernel_size.height)); - - switch(data_type) - { - case DataType::F16: - return std::find(fast_math_winograd_f16.begin(), fast_math_winograd_f16.end(), p) != fast_math_winograd_f16.end(); - case DataType::F32: - return std::find(fast_math_winograd_f32.begin(), fast_math_winograd_f32.end(), p) != fast_math_winograd_f32.end(); - default: - return false; - } -} - -inline bool fuse_function_supported(const ActivationLayerInfo &act_info) -{ - return act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU || act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU; -} - -} // namespace - -CpuWinogradConv2d::CpuWinogradConv2d() - : _gemm_function(std::make_unique()), - _activation_func(std::make_unique()), - _permute_input(std::make_unique()), - _permute_output(std::make_unique()), - _permute_weights(std::make_unique()), - _transform_input_kernel(nullptr), - _transform_weights_kernel(nullptr), - _transform_output_kernel(nullptr), - _data_layout(), - _aux_mem(AuxTensorIdx::Count), - _input_nhwc(), - _output_nhwc(), - _input_workspace(), - _kernel_storage(), - _output_workspace(), - _input_transformed(), - _output_transformed(), - _weights_hwio(), - _run_activation(false), - _is_prepared(false) -{ -} - -CpuWinogradConv2d::~CpuWinogradConv2d() = default; - -void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, bool enable_fast_math) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, biases, dst, conv_info)); - - // Get indices for the width and height - _data_layout = src->data_layout(); - const unsigned int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); - const unsigned int height_idx 
= get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - const unsigned int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); - - const Size2D input_dims = Size2D(src->dimension(width_idx), src->dimension(height_idx)); - const Size2D kernel_size = Size2D(weights->dimension(width_idx), weights->dimension(height_idx)); - const DataType data_type = src->data_type(); - const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, data_type); - - // Check if the Winograd configuration requires fast math - if(!enable_fast_math) - { - ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size, data_type), - "This Winograd configuration requires enable_fast_math=true"); - } - - _is_prepared = false; - - std::unique_ptr transform_input_kernel; - std::unique_ptr transform_weights_kernel; - std::unique_ptr transform_output_kernel; - - int n_gemms = 1; - int N_BLOCK = 1; // Size of block used by GEMM. - if(data_type == DataType::F32) - { - if(kernel_size == Size2D(3, 3)) - { - if(src->dimension(width_idx) > 4 && src->dimension(height_idx) > 4) - { - using config = CpuWinogradConv2dConfiguration; - transform_input_kernel = std::make_unique(); - transform_weights_kernel = std::make_unique(); - transform_output_kernel = std::make_unique(); - n_gemms = config::WinogradBase::N_GEMMS; - N_BLOCK = config::WinogradConv::N_BLOCK; - } - else - { - using config = CpuWinogradConv2dConfiguration; - transform_input_kernel = std::make_unique(); - transform_weights_kernel = std::make_unique(); - transform_output_kernel = std::make_unique(); - n_gemms = config::WinogradBase::N_GEMMS; - N_BLOCK = config::WinogradConv::N_BLOCK; - } - } - else if(kernel_size == Size2D(5, 5)) - { - using config = CpuWinogradConv2dConfiguration; - transform_input_kernel = std::make_unique(); - transform_weights_kernel = std::make_unique(); - transform_output_kernel = std::make_unique(); - n_gemms = config::WinogradBase::N_GEMMS; - N_BLOCK = config::WinogradConv::N_BLOCK; - } - else if(kernel_size == Size2D(1, 3)) - { - using config = CpuWinogradConv2dConfiguration; - transform_input_kernel = std::make_unique(); - transform_weights_kernel = std::make_unique(); - transform_output_kernel = std::make_unique(); - n_gemms = config::WinogradBase::N_GEMMS; - N_BLOCK = config::WinogradConv::N_BLOCK; - } - else if(kernel_size == Size2D(3, 1)) - { - using config = CpuWinogradConv2dConfiguration; - transform_input_kernel = std::make_unique(); - transform_weights_kernel = std::make_unique(); - transform_output_kernel = std::make_unique(); - n_gemms = config::WinogradBase::N_GEMMS; - N_BLOCK = config::WinogradConv::N_BLOCK; - } - else if(kernel_size == Size2D(1, 5)) - { - using config = CpuWinogradConv2dConfiguration; - transform_input_kernel = std::make_unique(); - transform_weights_kernel = std::make_unique(); - transform_output_kernel = std::make_unique(); - n_gemms = config::WinogradBase::N_GEMMS; - N_BLOCK = config::WinogradConv::N_BLOCK; - } - else if(kernel_size == Size2D(5, 1)) - { - using config = CpuWinogradConv2dConfiguration; - transform_input_kernel = std::make_unique(); - transform_weights_kernel = std::make_unique(); - transform_output_kernel = std::make_unique(); - n_gemms = config::WinogradBase::N_GEMMS; - N_BLOCK = config::WinogradConv::N_BLOCK; - } - else if(kernel_size == Size2D(1, 7)) - { - using config = CpuWinogradConv2dConfiguration; - transform_input_kernel = std::make_unique(); - transform_weights_kernel = std::make_unique(); - 
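The branches above pair each supported kernel shape with the output tile chosen by winograd_output_tile() earlier; a compact sketch of that mapping (plain C++; small_input stands for the F32 3x3 case where either input dimension is at most 4, and F16 3x3 always uses the 4x4 tile):

#include <cstdio>
#include <utility>

// Kernel size -> Winograd output tile, mirroring winograd_output_tile() above.
std::pair<int, int> output_tile(int kw, int kh, bool small_input)
{
    if(kw == 3 && kh == 3) return small_input ? std::make_pair(2, 2) : std::make_pair(4, 4);
    if(kw == 5 && kh == 5) return { 2, 2 };
    if(kw == 1 && kh == 3) return { 1, 6 };
    if(kw == 3 && kh == 1) return { 6, 1 };
    if(kw == 1 && kh == 5) return { 1, 4 };
    if(kw == 5 && kh == 1) return { 4, 1 };
    if(kw == 7 && kh == 1) return { 2, 1 };
    if(kw == 1 && kh == 7) return { 1, 2 };
    return { 0, 0 }; // unsupported shape
}

int main()
{
    const auto t = output_tile(3, 3, false);
    std::printf("3x3 kernel -> %dx%d output tile\n", t.first, t.second);
    return 0;
}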
transform_output_kernel = std::make_unique(); - n_gemms = config::WinogradBase::N_GEMMS; - N_BLOCK = config::WinogradConv::N_BLOCK; - } - else if(kernel_size == Size2D(7, 1)) - { - using config = CpuWinogradConv2dConfiguration; - transform_input_kernel = std::make_unique(); - transform_weights_kernel = std::make_unique(); - transform_output_kernel = std::make_unique(); - n_gemms = config::WinogradBase::N_GEMMS; - N_BLOCK = config::WinogradConv::N_BLOCK; - } - else - { - ARM_COMPUTE_ERROR("Not supported."); - } - } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - else if(data_type == DataType::F16) - { - if(kernel_size == Size2D(3, 3)) - { - using config = CpuWinogradConv2dConfiguration<__fp16, __fp16, 4, 4, 3, 3>; - transform_input_kernel = std::make_unique(); - transform_weights_kernel = std::make_unique(); - transform_output_kernel = std::make_unique(); - n_gemms = config::WinogradBase::N_GEMMS; - N_BLOCK = config::WinogradConv::N_BLOCK; - } - else - { - ARM_COMPUTE_ERROR("Not supported."); - } - } -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - else - { - ARM_COMPUTE_ERROR("Not supported."); - } - - const PaddingType use_padding_type = (conv_info.pad_top() != 0u || conv_info.pad_left() != 0) ? PADDING_SAME : PADDING_VALID; - const bool use_same_padding = use_padding_type == PADDING_SAME; - - // Get convolved dimensions - const int in_channels = src->dimension(channel_idx); - const int out_channels = dst->dimension(channel_idx); - - const Tensor4DShape in_shape(internal_get_input_shape(src)); - const size_t data_type_size = src->element_size(); - // Get the memory required to instantiate a new Winograd operator. - constexpr size_t storage_alignment = 64; - - // Kernel Storage - const size_t kernel_storage_size = transform_weights_kernel->get_weight_storage_size(out_channels, in_channels) * data_type_size; - - // Input storage - const size_t input_storage_size = transform_input_kernel->get_input_storage_size(in_shape.n_batches, in_shape.n_channels, in_shape.n_rows, in_shape.n_cols, use_same_padding) * data_type_size; - - // Output storage - const size_t output_storage_size = transform_output_kernel->get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels) * data_type_size; - const int kernel_matrix_stride = transform_weights_kernel->get_matrix_stride(out_channels, in_channels); - const int output_matrix_stride = transform_output_kernel->get_matrix_stride(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels); - const auto output_shape = transform_output_kernel->get_output_shape(in_shape.n_rows, in_shape.n_cols, use_padding_type == PADDING_SAME); - const int input_matrix_stride = transform_input_kernel->get_matrix_stride(in_shape.n_batches, in_channels, in_shape.n_rows, in_shape.n_cols, use_padding_type == PADDING_SAME); - - // Configure GEMM - const int tile_rows = iceildiv(output_shape.first, output_tile.height); - const int tile_cols = iceildiv(output_shape.second, output_tile.width); - const int m = in_shape.n_batches * tile_rows * tile_cols; - const int k = in_shape.n_channels; - const int n = out_channels; - const int kernel_matrix_row_stride = roundup(out_channels, N_BLOCK); - const int output_matrix_row_stride = kernel_matrix_row_stride; - - TensorShape a_shape(k, m, 1, n_gemms); - Strides a_strides(data_type_size); - a_strides.set(1, a_strides[0] * k); - //a_strides.set(2, data_type_size * input_matrix_stride / n_gemms); FIXME: This is the real batch size, but RSH's code crashes if it's not 0. 
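The GEMM sizing at the end of the block above can be checked with a small worked example; iceildiv and roundup are re-implemented locally, and the concrete numbers (56x56 output, 64 -> 128 channels, a block size of 4) are illustrative only, since N_BLOCK really comes from the selected GEMM configuration:

#include <cstdio>

int iceildiv(int a, int b) { return (a + b - 1) / b; }
int roundup(int a, int b) { return iceildiv(a, b) * b; }

int main()
{
    const int out_w = 56, out_h = 56, batches = 1, cin = 64, cout = 128;
    const int tile_w = 4, tile_h = 4, n_block = 4; // 4x4 output tiles, assumed block size

    const int tile_rows = iceildiv(out_h, tile_h); // 14
    const int tile_cols = iceildiv(out_w, tile_w); // 14
    const int m = batches * tile_rows * tile_cols; // 196: one GEMM row per output tile
    const int k = cin;                             // 64: reduction over input channels
    const int n = cout;                            // 128: one column per output channel
    const int kernel_row_stride = roundup(cout, n_block);

    std::printf("m=%d k=%d n=%d kernel_row_stride=%d\n", m, k, n, kernel_row_stride);
    return 0;
}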
- a_strides.set(2, 0); - a_strides.set(3, data_type_size * input_matrix_stride); - - TensorShape b_shape(n, k, n_gemms); - Strides b_strides(data_type_size); - b_strides.set(1, data_type_size * kernel_matrix_row_stride); - b_strides.set(2, data_type_size * kernel_matrix_stride); - - TensorShape d_shape(n, m, 1, n_gemms); - Strides d_strides(data_type_size); - d_strides.set(1, data_type_size * output_matrix_row_stride); - //d_strides.set(2, data_type_size * output_matrix_stride / n_gemms); FIXME: This is the real batch size, but RSH's code crashes if it's not 0. - d_strides.set(2, 0); - d_strides.set(3, data_type_size * output_matrix_stride); - - TensorInfo a_info{}; - TensorInfo b_info{}; - TensorInfo d_info{}; - a_info.init(a_shape, 1, data_type, a_strides, 0, input_storage_size); - b_info.init(b_shape, 1, data_type, b_strides, 0, kernel_storage_size); - d_info.init(d_shape, 1, data_type, d_strides, 0, output_storage_size); - - _input_transformed = a_info; - _kernel_storage = b_info; - _output_transformed = d_info; - - const ITensorInfo *input_to_use = src; - ITensorInfo *output_to_use = dst; - PermutationVector weights_permutation_vector(3U, 0U, 1U, 2U); - const unsigned int max_num_threads = NEScheduler::get().num_threads(); - - // Configure the kernel to transform the input tensor from NCHW -> NHWC - if(_data_layout == DataLayout::NCHW) - { - _permute_input->configure(src, &_input_nhwc, PermutationVector(2U, 0U, 1U)); - input_to_use = &_input_nhwc; - weights_permutation_vector = PermutationVector(3U, 2U, 0U, 1U); - } - - // Configure input transform kernel - transform_input_kernel->configure(input_to_use, in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type, - &_input_transformed, input_matrix_stride, &_input_workspace); - const size_t input_workspace_size = transform_input_kernel->get_working_space_size(max_num_threads); - TensorInfo input_workspace_info(TensorShape(input_workspace_size), 1, DataType::U8); - _input_workspace = input_workspace_info; - - // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map] - _permute_weights->configure(weights, &_weights_hwio, weights_permutation_vector); - transform_weights_kernel->configure(&_weights_hwio, &_kernel_storage, kernel_matrix_stride, out_channels, in_channels); - - // Configure GEMM function - _gemm_function->configure(&_input_transformed, &_kernel_storage, nullptr, &_output_transformed, 1.0f, 0.f); - - // Configure output transform function - // The biases tensor has not been allocated at this point in time, the output transform will add the biases to the final result in the run() method - if(_data_layout == DataLayout::NCHW) - { - // configure and allocate dst tensor to be used to convert from winograd domain to spatial domain when calling to reshape_output() - TensorInfo info(TensorShape(dst->dimension(2), dst->dimension(0), - dst->dimension(1), dst->dimension(3)), - 1, dst->data_type()); - _output_nhwc = info; - output_to_use = &_output_nhwc; - } - const arm_gemm::Activation activation = arm_gemm_activation_from_acl_activation(act_info); - - transform_output_kernel->configure(biases, - &_output_transformed, - output_matrix_stride, - output_to_use, - in_shape.n_batches, - output_shape.first, - output_shape.second, - out_channels, - &_output_workspace, - activation); - - const size_t output_workspace_size = transform_output_kernel->get_working_space_size(max_num_threads); - TensorInfo 
output_workspace_info(TensorShape(output_workspace_size), 1, DataType::U8); - _output_workspace = output_workspace_info; - - // Reorder the convoluted output to ACL's ordering NCHW - if(_data_layout == DataLayout::NCHW) - { - _permute_output->configure(&_output_nhwc, dst, PermutationVector(1U, 2U, 0U)); - } - - _transform_input_kernel = std::move(transform_input_kernel); - _transform_weights_kernel = std::move(transform_weights_kernel); - _transform_output_kernel = std::move(transform_output_kernel); - - //Configure Activation Layer - _run_activation = act_info.enabled() && !fuse_function_supported(act_info); - if(_run_activation) - { - _activation_func->configure(dst, nullptr, act_info); - } - - auto asm_mem_req = _gemm_function->workspace(); - _aux_mem[GemmWorkspace] = asm_mem_req[GemmWorkspace]; - _aux_mem[Pretranspose] = asm_mem_req[Pretranspose]; - _aux_mem[InterleavedLHS] = asm_mem_req[InterleavedLHS]; - _aux_mem[TransposedRHS] = asm_mem_req[TransposedRHS]; - _aux_mem[TempResult] = asm_mem_req[TempResult]; - - // Request temporary memory. Overlap memory needed for Input/Output transformations as they run on different non-overlapping time-steps. - _aux_mem[TransformedInput] = MemoryInfo(offset_int_vec(TransformedInput), MemoryLifetime::Temporary, input_storage_size, storage_alignment); - _aux_mem[TransformedOutput] = MemoryInfo(offset_int_vec(TransformedOutput), MemoryLifetime::Temporary, output_storage_size, storage_alignment); - _aux_mem[WorkspaceIO] = MemoryInfo(offset_int_vec(WorkspaceIO), MemoryLifetime::Temporary, std::max(input_workspace_size, output_workspace_size)); - _aux_mem[PermutedWeights] = MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, _weights_hwio.total_size()); - _aux_mem[TransformedWeights] = MemoryInfo(offset_int_vec(TransformedWeights), MemoryLifetime::Persistent, kernel_storage_size, storage_alignment); - if(_data_layout == DataLayout::NCHW) - { - _aux_mem[PermutedInput].merge(offset_int_vec(PermutedInput), src->total_size()); - _aux_mem[PermutedOutput].merge(offset_int_vec(PermutedOutput), dst->total_size()); - } -} - -Status CpuWinogradConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, bool enable_fast_math) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info)); - - // Get indices for the width and height - const size_t idx_width = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH); - const size_t idx_height = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT); - - // Input shape, kernel size and output tile - const Size2D input_dims = Size2D(src->dimension(idx_width), src->dimension(idx_height)); - const Size2D kernel_size = Size2D(weights->dimension(idx_width), weights->dimension(idx_height)); - const DataType data_type = src->data_type(); - const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, data_type); - - // Check if the Winograd configuration requires fast math - if(!enable_fast_math) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size, data_type), - "This Winograd configuration requires enable_fast_math=true"); - } - - const WinogradInfo winograd_info = WinogradInfo(output_tile, - kernel_size, - input_dims, - conv_info, - src->data_layout()); - - // Validate input transform - const 
TensorShape input0_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*src, winograd_info); - const TensorInfo input0 = src->clone()->set_tensor_shape(input0_shape); - // Validate filter transform - const TensorShape input1_shape = misc::shape_calculator::compute_winograd_filter_transform_shape(*weights, winograd_info); - const TensorInfo input1 = weights->clone()->set_tensor_shape(input1_shape); - // Validate batched matrix multiply - TensorShape batched_mm_output_shape = input0.tensor_shape(); - batched_mm_output_shape[0] = input1.tensor_shape()[0]; - const TensorInfo batched_mm_output = input0.clone()->set_tensor_shape(batched_mm_output_shape); - - if(kernel_size == Size2D(3, 3)) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 1, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 1, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 1, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 1, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != conv_info.pad_left(), "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != conv_info.pad_bottom(), "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != conv_info.pad_left(), "Only SAME or VALID padding supported"); - return validate_kernel_3x3(input_dims, src, &input0, &input1, &batched_mm_output, weights, biases, dst, winograd_info, act_info); - } - else if(kernel_size == Size2D(5, 5)) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 2, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 2, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 2, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 2, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != conv_info.pad_left(), "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != conv_info.pad_bottom(), "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != conv_info.pad_left(), "Only SAME or VALID padding supported"); - return validate_kernel_5x5(src, &input0, &input1, &batched_mm_output, weights, biases, dst, winograd_info, act_info); - } - if(kernel_size == Size2D(3, 1)) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 1, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 1, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_bottom() != 0, "Only SAME or VALID padding supported"); - return validate_kernel_3x1(src, &input0, &input1, &batched_mm_output, weights, biases, dst, winograd_info, act_info); - } - else if(kernel_size == Size2D(1, 3)) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 1, "Only SAME or VALID padding supported"); - 
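Every branch in this validation ladder enforces the same per-side rule: a pad of 0 (VALID) or half the kernel dimension (SAME), applied symmetrically. A compact restatement as a hypothetical helper (not part of the ACL code):

#include <cstdio>

// pad must be 0 (VALID) or kernel_dim / 2 (SAME) in the padded dimension.
bool is_same_or_valid_pad(unsigned int pad, unsigned int kernel_dim)
{
    return pad == 0u || pad == kernel_dim / 2;
}

int main()
{
    // 5x5 kernel: pads of 0 or 2 pass, a pad of 1 is rejected.
    std::printf("%d %d %d\n",
                is_same_or_valid_pad(0u, 5u),
                is_same_or_valid_pad(2u, 5u),
                is_same_or_valid_pad(1u, 5u)); // 1 1 0
    return 0;
}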
ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 1, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_right() != 0, "Only SAME or VALID padding supported"); - return validate_kernel_1x3(src, &input0, &input1, &batched_mm_output, weights, biases, dst, winograd_info, act_info); - } - else if(kernel_size == Size2D(5, 1)) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 2, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 2, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_bottom() != 0, "Only SAME or VALID padding supported"); - return validate_kernel_5x1(src, &input0, &input1, &batched_mm_output, weights, biases, dst, winograd_info, act_info); - } - else if(kernel_size == Size2D(1, 5)) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 2, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 2, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_right() != 0, "Only SAME or VALID padding supported"); - return validate_kernel_1x5(src, &input0, &input1, &batched_mm_output, weights, biases, dst, winograd_info, act_info); - } - else if(kernel_size == Size2D(7, 1)) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 3, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 3, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_bottom() != 0, "Only SAME or VALID padding supported"); - return validate_kernel_7x1(src, &input0, &input1, &batched_mm_output, weights, biases, dst, winograd_info, act_info); - } - else if(kernel_size == Size2D(1, 7)) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 3, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 3, "Only SAME or VALID padding supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_right() != 0, "Only SAME or VALID padding supported"); - return validate_kernel_1x7(src, &input0, &input1, &batched_mm_output, weights, biases, dst, winograd_info, act_info); - } - else - { - ARM_COMPUTE_RETURN_ERROR_MSG("Kernel shape not supported"); - } -} - -void CpuWinogradConv2d::run(ITensorPack &tensors) -{ - prepare(tensors); - - auto a = tensors.get_const_tensor(ACL_SRC_0); - auto c = tensors.get_const_tensor(ACL_SRC_2); - auto d = tensors.get_tensor(ACL_DST); - - CpuAuxTensorHandler input_nhwc(offset_int_vec(PermutedInput), _input_nhwc, tensors, true); - CpuAuxTensorHandler input_transformed(offset_int_vec(TransformedInput), _input_transformed, tensors, true); - CpuAuxTensorHandler input_workspace(offset_int_vec(WorkspaceIO), _input_workspace, tensors, true); - - const bool is_nchw = _data_layout == DataLayout::NCHW; - if(is_nchw) - { - //Bring channels to the front as Winograd code expects the tensor to be in the format NHWC - ITensorPack pack{ { ACL_SRC, a }, { ACL_DST, input_nhwc.get() } }; - _permute_input->run(pack); - } - - // Transform input 
tensor to the winograd domain - ITensorPack transform_input_pack{ { ACL_SRC, is_nchw ? input_nhwc.get() : a }, { ACL_DST, input_transformed.get() }, { ACL_INT, input_workspace.get() } }; - NEScheduler::get().schedule_op(_transform_input_kernel.get(), Window::DimX, _transform_input_kernel->window(), transform_input_pack); - - CpuAuxTensorHandler output_transformed(offset_int_vec(TransformedOutput), _output_transformed, tensors, true); - CpuAuxTensorHandler weights_transformed(offset_int_vec(TransformedWeights), _kernel_storage, tensors, true); - - // Run 16 GEMMs in multiple threads, each kernel runs one or more GEMMs - ITensorPack gemm_pack = tensors; - gemm_pack.add_const_tensor(ACL_SRC, input_transformed.get()); - gemm_pack.add_const_tensor(ACL_SRC_1, weights_transformed.get()); - gemm_pack.add_const_tensor(ACL_BIAS, nullptr); - gemm_pack.add_tensor(ACL_DST, output_transformed.get()); - _gemm_function->run(gemm_pack); - - // Transform output tensor to the spatial domain - CpuAuxTensorHandler output_workspace(offset_int_vec(WorkspaceIO), _output_workspace, tensors, true); - CpuAuxTensorHandler output_nhwc(offset_int_vec(PermutedOutput), _output_nhwc, tensors, true); - ITensorPack transform_output_pack{ { ACL_SRC_0, c }, { ACL_SRC_1, output_transformed.get() }, { ACL_DST, is_nchw ? output_nhwc.get() : d }, { ACL_INT, output_workspace.get() } }; - NEScheduler::get().schedule_op(_transform_output_kernel.get(), Window::DimX, _transform_output_kernel->window(), transform_output_pack); - - if(is_nchw) - { - // Reorder the convoluted output to ACL's ordering NCHW - ITensorPack pack{ { ACL_SRC, output_nhwc.get() }, { ACL_DST, d } }; - _permute_output->run(pack); - } - - if(_run_activation) - { - ITensorPack pack{ { ACL_SRC, d }, { ACL_DST, d } }; - _activation_func->run(pack); - } -} - -void CpuWinogradConv2d::prepare(ITensorPack &tensors) -{ - if(!_is_prepared) - { - // Permute weights - const ITensor *weights = tensors.get_const_tensor(ACL_SRC_1); - ITensor *weights_aux = utils::cast::polymorphic_cast(tensors.get_tensor(offset_int_vec(PermutedWeights))); - ARM_COMPUTE_ERROR_ON_NULLPTR(weights, weights_aux); - - CpuAuxTensorHandler permuted_weights(_weights_hwio, *weights_aux); - ITensorPack permute_tensors{ { ACL_SRC, weights }, { ACL_DST, permuted_weights.get() } }; - _permute_weights->run(permute_tensors); - - // Transform weights - ITensor *weights_transf = utils::cast::polymorphic_cast(tensors.get_tensor(offset_int_vec(TransformedWeights))); - ARM_COMPUTE_ERROR_ON_NULLPTR(weights_transf); - - CpuAuxTensorHandler transformed_weights(_kernel_storage, *weights_transf); - ITensorPack transform_tensors{ { ACL_SRC, permuted_weights.get() }, { ACL_DST, transformed_weights.get() } }; - NEScheduler::get().schedule_op(_transform_weights_kernel.get(), Window::DimX, _transform_weights_kernel->window(), transform_tensors); - - ITensorPack gemm_pack = tensors; - gemm_pack.add_const_tensor(ACL_SRC_1, transformed_weights.get()); - _gemm_function->prepare(gemm_pack); - - _is_prepared = true; - } -} - -experimental::MemoryRequirements CpuWinogradConv2d::workspace() const -{ - return _aux_mem; -} -} // namespace cpu -} // namespace arm_compute \ No newline at end of file diff --git a/src/runtime/cpu/operators/CpuWinogradConv2d.h b/src/runtime/cpu/operators/CpuWinogradConv2d.h deleted file mode 100644 index b5b9c3f2e3..0000000000 --- a/src/runtime/cpu/operators/CpuWinogradConv2d.h +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. 
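The prepare()/run() split above rests on a run-once guard: weight permutation and the Winograd weight transform happen a single time, and later run() calls reuse the cached result. A minimal model of that contract (stand-in code, not the ACL classes):

#include <cstdio>

// One-off weight preparation guarded by a flag, as in CpuWinogradConv2d::prepare().
struct WinogradLikeOperator
{
    bool prepared = false;

    void prepare()
    {
        if(prepared)
        {
            return; // idempotent: weights are only transformed once
        }
        std::puts("permute weights, transform weights to the Winograd domain");
        prepared = true;
    }

    void run()
    {
        prepare();
        std::puts("transform input -> batched GEMM -> transform output");
    }
};

int main()
{
    WinogradLikeOperator op;
    op.run();
    op.run(); // second call skips the weight preparation
    return 0;
}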
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_CPU_WINOGRAD_CONV2D_KERNEL_H -#define ARM_COMPUTE_CPU_WINOGRAD_CONV2D_KERNEL_H - -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/runtime/FunctionDescriptors.h" -#include "src/core/common/Macros.h" -#include "src/core/cpu/kernels/CpuWinogradConv2dKernel.h" -#include "src/runtime/cpu/ICpuOperator.h" -#include "src/runtime/cpu/operators/CpuActivation.h" -#include "src/runtime/cpu/operators/CpuGemm.h" -#include "src/runtime/cpu/operators/CpuPermute.h" -#include "src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h" - -namespace arm_compute -{ -namespace cpu -{ -class CpuWinogradConv2d : public ICpuOperator -{ -public: - /** Constructor */ - CpuWinogradConv2d(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuWinogradConv2d); - /** Destructor */ - ~CpuWinogradConv2d(); - - /** Set the input and output tensors. - * - * Valid data layouts: - * - NHWC - * - NCHW - * - * Valid data type configurations: - * |src0 |src1 |src2 |dst | - * |:--------------|:--------------|:------|:--------------| - * |F16 |F16 |F16 |F16 | - * |F32 |F32 |F32 |F32 | - * - * @param[in] src Source tensor info. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. - * Data types supported: F16/F32. - * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input. - * Currently only 3x3 and 5x5 kernels are supported. - * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights. - * @param[out] dst Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. - * Data types supported: Same as @p input. - * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. Currently only unit strides are supported. - * @param[in] act_info (Optional) Activation layer information in case of a fused activation. - * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation - * available which may introduce a drop of accuracy as well. 
Default is false - */ - void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info = ActivationLayerInfo(), - bool enable_fast_math = false); - /** Static function to check if given info will lead to a valid configuration of @ref CpuWinogradConv2d - * - * Similar to CpuWinogradConv2d::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info = ActivationLayerInfo(), - bool enable_fast_math = false); - - // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &constants) override; - experimental::MemoryRequirements workspace() const override; - -private: - enum AuxTensorIdx - { - GemmWorkspace = 0, - Pretranspose = 1, - InterleavedLHS = 2, - TransposedRHS = 3, - TempResult = 4, - TransformedInput = 5, - TransformedOutput = 6, - WorkspaceIO = 7, - TransformedWeights = 8, - PermutedWeights = 9, - PermutedInput = TransformedOutput, - PermutedOutput = TransformedInput, - Count = 10 - }; - - std::unique_ptr _gemm_function; - std::unique_ptr _activation_func; - std::unique_ptr _permute_input; - std::unique_ptr _permute_output; - std::unique_ptr _permute_weights; - std::unique_ptr _transform_input_kernel; - std::unique_ptr _transform_weights_kernel; - std::unique_ptr _transform_output_kernel; - - DataLayout _data_layout; - experimental::MemoryRequirements _aux_mem{ Count }; - TensorInfo _input_nhwc; - TensorInfo _output_nhwc; - TensorInfo _input_workspace; - TensorInfo _kernel_storage; - TensorInfo _output_workspace; - TensorInfo _input_transformed; - TensorInfo _output_transformed; - TensorInfo _weights_hwio; - bool _run_activation; - bool _is_prepared; -}; -} // namespace cpu -} // namespace arm_compute - -#endif /* ARM_COMPUTE_CPU_WINOGRAD_CONV2D_KERNEL_H */ diff --git a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp deleted file mode 100644 index 9786161dee..0000000000 --- a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp +++ /dev/null @@ -1,721 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
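The slot aliases in the AuxTensorIdx enum above (PermutedInput = TransformedOutput, PermutedOutput = TransformedInput) are deliberate: the permuted input is consumed by the input transform before the GEMM writes the transformed output, so the aliased buffers are never live at the same time and can share storage. A simplified model of the lifetime annotations that make such reuse safe (stand-in types; sizes invented for illustration):

#include <cstddef>
#include <cstdio>
#include <vector>

// Temporary buffers live within one run(), Prepare buffers only until
// prepare() finishes, Persistent buffers (e.g. transformed weights) for
// the lifetime of the operator.
enum class Lifetime { Temporary, Prepare, Persistent };

struct AuxSlot
{
    const char *name;
    Lifetime    lifetime;
    std::size_t bytes;
};

int main()
{
    const std::vector<AuxSlot> slots = {
        { "TransformedInput / PermutedOutput", Lifetime::Temporary, 1u << 20 },
        { "TransformedOutput / PermutedInput", Lifetime::Temporary, 1u << 20 },
        { "PermutedWeights", Lifetime::Prepare, 256u << 10 },      // released after prepare()
        { "TransformedWeights", Lifetime::Persistent, 512u << 10 } // kept across runs
    };
    for(const auto &s : slots)
    {
        std::printf("%-36s %zu bytes\n", s.name, s.bytes);
    }
    return 0;
}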
- */ -#include "src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h" - -#include "arm_compute/runtime/NEON/NEScheduler.h" -#include "src/core/CPP/Validate.h" -#include "src/core/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h" -#include "src/core/cpu/kernels/assembly/arm_gemm.hpp" -#include "src/core/helpers/MemoryHelpers.h" -#include "src/core/utils/AssemblyUtils.h" -#include "src/runtime/cpu/utils/CpuAuxTensorHandler.h" - -#include - -namespace arm_compute -{ -namespace cpu -{ -using namespace arm_compute::experimental; - -namespace -{ -struct free_delete -{ - void operator()(void *x) - { - free(x); - } -}; - -struct Params -{ - unsigned int M; - unsigned int N; - unsigned int K; - unsigned int batches; - unsigned int multis; - unsigned int sections; - bool indirect; -}; - -Params extract_parameters(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, const AsmGemmInfo &info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d); - Params p; - p.M = d->tensor_shape().y(); - p.K = a->tensor_shape().x(); - p.N = d->tensor_shape().x(); - p.batches = 1; - p.multis = 1; - p.sections = 1; - p.indirect = false; - - if(info.method == AsmConvMethod::Conv || info.method == AsmConvMethod::Indirect) - { - p.indirect = true; - p.sections = b->tensor_shape()[2] * b->tensor_shape()[3]; - } - else - { - p.multis = b->tensor_shape().z(); - p.batches = d->tensor_shape().total_size_upper(2) / p.multis; - } - - // Update M in case of GEMM3D for output - if(info.depth_output_gemm3d != 0) - { - p.M = d->tensor_shape().y() * d->tensor_shape().z(); - p.batches = d->tensor_shape().total_size_upper(3) / p.multis; - } - - return p; -} - -IScheduler::Hints scheduling_hint_heuristic(arm_gemm::GemmMethod method, DataType data_type) -{ - // Schedule assembly kernel - const int granule_threshold = 200; - IScheduler::Hints scheduling_hint = IScheduler::Hints(Window::DimX); - if(method == arm_gemm::GemmMethod::GEMM_INTERLEAVED && data_type == DataType::F32) - { - scheduling_hint = IScheduler::Hints(Window::DimX, IScheduler::StrategyHint::DYNAMIC, granule_threshold); - } - else if(method == arm_gemm::GemmMethod::GEMM_INTERLEAVED_2D && (data_type == DataType::F32 || data_type == DataType::F16 || data_type == DataType::U8 || data_type == DataType::S8)) - { - //GEMM_INTERLEAVED supports 2D parallelism, IScheduler::split_dimensions_all signals to parallelise over all window dimensions - scheduling_hint = IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold); - } - else if(method == arm_gemm::GemmMethod::QUANTIZE_WRAPPER_2D && (data_type == DataType::QASYMM8 || data_type == DataType::QASYMM8_SIGNED)) - { - //special case for QASYMM8 to support 2D parallelism, scheduler here may be tweaked differently compared to FP32 case - scheduling_hint = IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold); - } - - return scheduling_hint; -} - -/** Fallback in case ACL doesn't have a function */ -template -class Fallback : public CpuGemmAssemblyDispatch::IFallback -{ -public: - /** Destructor */ - ~Fallback() = default; - - /** Initialise the functions's input and output. - * - * @param[in] a Input tensor containing the Matrix A. - * @param[in] b Input tensor containing the Matrix B. - * @param[in] c Input tensor containing the Matrix C. - * @param[out] d Output tensor to store the result of matrix multiplication. - * @param[in] args Matrix multiplication information. 
-
-/** Fallback in case ACL doesn't have a function */
-template <typename TypeInput, typename TypeOutput, class OutputStage = arm_gemm::Nothing>
-class Fallback : public CpuGemmAssemblyDispatch::IFallback
-{
-public:
-    /** Destructor */
-    ~Fallback() = default;
-
-    /** Initialise the function's input and output.
-     *
-     * @param[in]  a         Input tensor containing the Matrix A.
-     * @param[in]  b         Input tensor containing the Matrix B.
-     * @param[in]  c         Input tensor containing the Matrix C.
-     * @param[out] d         Output tensor to store the result of matrix multiplication.
-     * @param[in]  args      Matrix multiplication information.
-     * @param[in]  gemm_info GEMM meta-data
-     * @param[in]  os        Output stage meta-data.
-     */
-    void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d,
-                   arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info,
-                   const OutputStage &os = {});
-
-    /** Set requantization data to be used
-     *
-     * @param[in] shifts      Requantization shifts
-     * @param[in] multipliers Requantization multipliers
-     *
-     * @return A tuple with a flag indicating whether left shifts are needed, plus the pointers to the shift and multiplier data respectively
-     */
-    std::tuple<bool, const int32_t *, const int32_t *, const int32_t *> set_requantize_data(const std::vector<int32_t> &shifts,
-                                                                                            const std::vector<int32_t> &multipliers);
-
-    // Inherited methods overridden:
-    void run(ITensorPack &tensors) override;
-    void prepare(ITensorPack &tensors) override;
-    bool is_configured() const override;
-    experimental::MemoryRequirements workspace() const override;
-
-private:
-    enum AuxTensorIdx
-    {
-        AsmGemmWorkspace = 0,
-        Pretranspose,
-        Count
-    };
-
-    /** Configure the indirect buffer
-     *
-     * @param[in]  a    Input tensor containing the Matrix A.
-     * @param[in]  b    Input tensor containing the Matrix B.
-     * @param[out] d    Output tensor to store the result of matrix multiplication.
-     * @param[in]  info GEMM meta-data
-     */
-    void configure_indirect(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, const AsmGemmInfo &info);
-    /** Prepare the indirect buffer */
-    void prepare_indirect_buffer(ITensorPack &tensors);
-
-    /** Assembly Gemm kernel */
-    std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> _gemm_kernel_asm{ nullptr };
-    /** Optimised Arm® Neon™ kernel */
-    std::unique_ptr<INEKernel> _optimised_kernel{ nullptr };
-    /** Assembly GEMM workspace tensor info */
-    TensorInfo _workspace_info{};
-    /** Pre-transpose tensor info */
-    TensorInfo _pretranspose_info{};
-    /** Prepared flag */
-    bool _is_prepared{ false };
-    /** GEMM meta-data */
-    AsmGemmInfo _gemm_info{};
-    /** GEMM kernel description */
-    arm_gemm::KernelDescription _kernel_info{};
-    /** Per channel quantization shifts */
-    std::vector<int32_t> _shifts{};
-    std::vector<int32_t> right_shifts{};
-    std::vector<int32_t> left_shifts{};
-    /** Per channel quantization multipliers */
-    std::vector<int32_t> _multipliers{};
-    /** Indirect buffer */
-    std::unique_ptr<const TypeInput *const *, free_delete> _indirect_arg{};
-    std::unique_ptr<const TypeInput *, free_delete>        _indirect_buf{};
-    std::vector<TypeInput>                                 _indirect_pad{};
-    arm_gemm::ConvolutionParameters                        _cp{};
-    experimental::MemoryRequirements                       _aux_mem{ Count };
-};
-
-template <typename TypeInput, typename TypeOutput, class OutputStage>
-std::tuple<bool, const int32_t *, const int32_t *, const int32_t *>
-Fallback<TypeInput, TypeOutput, OutputStage>::set_requantize_data(const std::vector<int32_t> &shifts, const std::vector<int32_t> &multipliers)
-{
-    _multipliers   = multipliers;
-    _shifts        = shifts;
-    bool need_left = false;
-    for(const auto s : _shifts)
-    {
-        left_shifts.push_back(std::max(-s, int32_t(0)));
-        right_shifts.push_back(std::min(-s, int32_t(0)));
-        if(s < 0 && !need_left)
-        {
-            need_left = true;
-        }
-    }
-    return std::make_tuple(need_left, left_shifts.data(), right_shifts.data(), _multipliers.data());
-}
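
A note on the shift-splitting rule implemented by set_requantize_data() above: arm_gemm takes separate left- and right-shift arrays, so a stored per-channel shift s becomes max(-s, 0) (left) and min(-s, 0) (right), and the left array is only reported as needed if any s was negative. A standalone sketch with hypothetical values (not ACL code):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main()
    {
        const std::vector<int32_t> shifts{ -2, 0, 3 }; // hypothetical per-channel shifts
        std::vector<int32_t> left, right;
        bool need_left = false;
        for(const auto s : shifts)
        {
            left.push_back(std::max(-s, int32_t(0)));
            right.push_back(std::min(-s, int32_t(0)));
            need_left = need_left || (s < 0);
        }
        std::printf("need_left=%d left={%d,%d,%d} right={%d,%d,%d}\n",
                    need_left, left[0], left[1], left[2], right[0], right[1], right[2]);
        // -> need_left=1 left={2,0,0} right={0,0,-3}
    }
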
-
-template <typename TypeInput, typename TypeOutput, class OutputStage>
-void Fallback<TypeInput, TypeOutput, OutputStage>::prepare_indirect_buffer(ITensorPack &tensors)
-{
-    auto             a              = tensors.get_const_tensor(TensorType::ACL_SRC_0);
-    const TypeInput *A_ptr          = reinterpret_cast<TypeInput *>(a->buffer());
-    const int        multis         = 1;
-    const int        batches        = a->info()->tensor_shape().total_size_upper(3);
-    const size_t     stride_A       = a->info()->strides_in_bytes().y() / sizeof(TypeInput);
-    const size_t     batch_stride_A = a->info()->strides_in_bytes()[3] / sizeof(TypeInput);
-    const size_t     multi_stride_A = a->info()->strides_in_bytes()[4] / sizeof(TypeInput);
-
-    const size_t output_hw    = _cp.output_height * _cp.output_width;
-    const int    batch_size   = _cp.kernel_height * _cp.kernel_width * output_hw * sizeof(TypeInput);
-    const size_t batch_stride = batch_size / sizeof(TypeInput);
-    const int    multi_size   = batch_size * batches;
-    const size_t multi_stride = multi_size / sizeof(TypeInput);
-
-    for(int64_t m = 0; m < multis; m++)
-    {
-        for(int64_t b = 0; b < batches; b++)
-        {
-            for(int64_t output_y = 0; output_y < _cp.output_height; output_y++)
-            {
-                for(int64_t output_x = 0; output_x < _cp.output_width; output_x++)
-                {
-                    int64_t output_xy = (output_y * _cp.output_width) + output_x;
-
-                    for(int64_t kernel_y = 0; kernel_y < _cp.kernel_height; kernel_y++)
-                    {
-                        for(int64_t kernel_x = 0; kernel_x < _cp.kernel_width; kernel_x++)
-                        {
-                            int64_t input_x   = (output_x * _cp.output_stride_w) + kernel_x - _cp.padding_left;
-                            int64_t input_y   = (output_y * _cp.output_stride_h) + kernel_y - _cp.padding_top;
-                            int64_t kernel_xy = (kernel_y * _cp.kernel_width) + kernel_x;
-                            int64_t input_xy  = (input_y * _cp.input_width) + input_x;
-
-                            if(input_x < 0 || input_x >= _cp.input_width || input_y < 0 || input_y >= _cp.input_height)
-                            {
-                                _indirect_buf.get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] = _indirect_pad.data();
-                            }
-                            else
-                            {
-                                _indirect_buf.get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] =
-                                    A_ptr + (m * multi_stride_A + b * batch_stride_A + input_xy * stride_A);
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
-template <typename TypeInput, typename TypeOutput, class OutputStage>
-void Fallback<TypeInput, TypeOutput, OutputStage>::configure_indirect(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, const AsmGemmInfo &info)
-{
-    ARM_COMPUTE_ERROR_ON(!(info.method == AsmConvMethod::Conv || info.method == AsmConvMethod::Indirect));
-
-    float zeropad = 0.f;
-    if(is_data_type_quantized(a->data_type()))
-    {
-        zeropad = a->quantization_info().uniform().offset;
-    }
-
-    const int64_t input_width    = static_cast<int64_t>(a->tensor_shape()[1]);
-    const int64_t input_height   = static_cast<int64_t>(a->tensor_shape()[2]);
-    const int64_t input_channels = static_cast<int64_t>(a->tensor_shape()[0]);
-    const int64_t kernel_width   = static_cast<int64_t>(b->tensor_shape()[2]);
-    const int64_t kernel_height  = static_cast<int64_t>(b->tensor_shape()[3]);
-    const int64_t output_width   = static_cast<int64_t>(d->tensor_shape()[1]);
-    const int64_t output_height  = static_cast<int64_t>(d->tensor_shape()[2]);
-
-    _cp = { input_width, input_height, input_channels, kernel_width, kernel_height, output_width, output_height,
-            info.ps_info.stride().first, info.ps_info.stride().second, info.padding_top, info.padding_left, zeropad
-          };
-
-    if(info.method == AsmConvMethod::Conv)
-    {
-        _gemm_kernel_asm->set_convolution_parameters(_cp);
-    }
-
-    if(info.method == AsmConvMethod::Indirect)
-    {
-        const unsigned int multis    = 1;
-        const unsigned int batches   = a->tensor_shape().total_size_upper(3);
-        const unsigned int kernel_hw = _cp.kernel_width * _cp.kernel_height;
-        const unsigned int output_hw = _cp.output_width * _cp.output_height;
-
-        using TypeInputPtr        = TypeInput *;
-        const int    batch_size   = kernel_hw * output_hw * sizeof(TypeInputPtr);
-        const size_t batch_stride = batch_size / sizeof(TypeInputPtr);
-        const int    multi_size   = batch_size * batches;
-        const size_t multi_stride = multi_size / sizeof(TypeInputPtr);
-
-        _indirect_buf = std::unique_ptr<const TypeInput *, free_delete>(reinterpret_cast<const TypeInput **>(malloc(multi_size * multis)));
-        _indirect_arg = std::unique_ptr<const TypeInput *const *, free_delete>(reinterpret_cast<const TypeInput *const **>(malloc(sizeof(TypeInput **) * kernel_hw * multis * batches)));
-        _indirect_pad = std::vector<TypeInput>(_cp.input_channels, TypeInput(zeropad));
-
-        // Set indirect argument
-        int64_t pos = 0;
-        for(int64_t m = 0; m < multis; m++)
-        {
-            for(int64_t b = 0; b < batches; b++)
-            {
-                for(int64_t kernel_xy = 0; kernel_xy < kernel_hw; kernel_xy++)
-                {
-                    (_indirect_arg.get())[pos++] = _indirect_buf.get() + m * multi_stride + b * batch_stride + kernel_xy * output_hw;
-                }
-            }
-        }
-
-        _gemm_kernel_asm->set_indirect_parameters(a->tensor_shape()[0], _indirect_arg.get());
-    }
-}
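
The indirection scheme above builds, for every kernel tap and output position, a table of pointers into the input tensor; taps that fall outside the input point at a shared zero-pad buffer instead, so the assembly kernel never has to branch on bounds. A minimal standalone sketch of the same idea for a 1-D convolution, with hypothetical sizes (not ACL code):

    #include <cstdio>
    #include <vector>

    int main()
    {
        const int width = 4, kernel_w = 3, pad = 1;
        const std::vector<float> input{ 1.f, 2.f, 3.f, 4.f };
        const std::vector<float> zero_pad(1, 0.f); // shared padding element
        std::vector<const float *> table(kernel_w * width);
        for(int out_x = 0; out_x < width; ++out_x)
        {
            for(int k = 0; k < kernel_w; ++k)
            {
                const int in_x           = out_x + k - pad;
                table[k * width + out_x] = (in_x < 0 || in_x >= width) ? zero_pad.data() : &input[in_x];
            }
        }
        std::printf("tap0/out0 -> %.0f (padding)\n", *table[0]); // out-of-bounds tap reads 0
    }
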
-
-template <typename TypeInput, typename TypeOutput, class OutputStage>
-void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d,
-                                                             arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info,
-                                                             const OutputStage &os)
-{
-    ARM_COMPUTE_UNUSED(c);
-    arm_gemm::GemmConfig gemm_cfg;
-    _kernel_info = arm_gemm::get_gemm_method<TypeInput, TypeOutput, OutputStage>(args, os);
-    if(_kernel_info.method != arm_gemm::GemmMethod::GEMV_BATCHED)
-    {
-        gemm_cfg.filter = _kernel_info.name;
-        args._cfg       = &gemm_cfg;
-    }
-    _gemm_kernel_asm = arm_gemm::gemm<TypeInput, TypeOutput, OutputStage>(args, os);
-    if(_gemm_kernel_asm == nullptr)
-    {
-        //configuration not supported: Leave function unconfigured:
-        return;
-    }
-
-    // arm_compute wrapper for the Gemm object (see above)
-    auto acl_gemm_wrapper = std::make_unique<kernel::CpuGemmAssemblyWrapperKernel<TypeInput, TypeOutput>>();
-    ARM_COMPUTE_ERROR_ON(acl_gemm_wrapper == nullptr);
-    acl_gemm_wrapper->configure(_gemm_kernel_asm.get(), gemm_cfg.filter);
-    const size_t       workspace_size = _gemm_kernel_asm->get_working_size();
-    const unsigned int alignment      = 4096;
-    _workspace_info                   = TensorInfo(TensorShape(workspace_size), 1, DataType::U8);
-    _aux_mem[AsmGemmWorkspace]        = MemoryInfo(offset_int_vec(AsmGemmWorkspace), MemoryLifetime::Temporary, workspace_size, alignment);
-
-    //if we disable this code below in brackets then ConvLayer deadlocks when threads > 1 and
-    //the shapes are In=1x1x1024 Weights=1x1x1024x1001 Biases=1001 Out=1x1x1001
-    {
-        const unsigned int window_size = _gemm_kernel_asm->get_window_size().total_size();
-        if(window_size < static_cast<unsigned int>(args._maxthreads))
-        {
-            _gemm_kernel_asm->set_nthreads(window_size);
-        }
-    }
-
-    _optimised_kernel = std::move(acl_gemm_wrapper);
-    _gemm_info        = gemm_info;
-    // Check for pre-transposed support
-    if(_gemm_kernel_asm->B_pretranspose_required())
-    {
-        // Forcing 128-byte alignment (required by 32-bit kernels)
-        const unsigned int alignment           = 128;
-        const size_t       B_pretranspose_size = _gemm_kernel_asm->get_B_pretransposed_array_size();
-        _pretranspose_info                     = TensorInfo(TensorShape(B_pretranspose_size), 1, DataType::U8);
-        _aux_mem[Pretranspose]                 = MemoryInfo(offset_int_vec(Pretranspose), MemoryLifetime::Persistent, B_pretranspose_size, alignment);
-    }
-
-    // Handle indirect GEMM convolution
-    if(gemm_info.method == AsmConvMethod::Conv || gemm_info.method == AsmConvMethod::Indirect)
-    {
-        configure_indirect(a, b, d, gemm_info);
-    }
-}
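
configure() above does not allocate its scratch memory directly; it registers two auxiliary slots whose lifetime and alignment a memory manager keys on: the GEMM workspace is temporary (alive only during run()) with 4096-byte alignment, while the pretransposed B is persistent (filled once in prepare(), reused across runs) with 128-byte alignment. A standalone sketch of that bookkeeping as plain data, with hypothetical sizes (not the ACL MemoryInfo type):

    #include <cstddef>
    #include <cstdio>

    enum class Lifetime { Temporary, Persistent };

    struct SlotInfo
    {
        int         slot;
        Lifetime    lifetime;
        std::size_t size;
        std::size_t alignment;
    };

    int main()
    {
        const SlotInfo workspace{ 0, Lifetime::Temporary, 128 * 1024, 4096 };   // hypothetical size
        const SlotInfo pretranspose{ 1, Lifetime::Persistent, 512 * 1024, 128 }; // hypothetical size
        std::printf("slot %d: %zu bytes, align %zu\n", workspace.slot, workspace.size, workspace.alignment);
        std::printf("slot %d: %zu bytes, align %zu\n", pretranspose.slot, pretranspose.size, pretranspose.alignment);
    }
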
-
-template <typename TypeInput, typename TypeOutput, class OutputStage>
-void Fallback<TypeInput, TypeOutput, OutputStage>::prepare(ITensorPack &tensors)
-{
-    if(!_is_prepared)
-    {
-        auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1);
-        auto c = tensors.get_const_tensor(TensorType::ACL_SRC_2);
-
-        // Set up matrix bias in the assembly kernel, it's just a pointer to matrix C.
-        if(c && c->info()->data_type() == DataType::S32)
-        {
-            _gemm_kernel_asm->set_quantized_bias(reinterpret_cast<const int32_t *>(c->buffer() + c->info()->offset_first_element_in_bytes()), 0);
-        }
-
-        // Pretranspose B if required
-        if(_gemm_kernel_asm->B_pretranspose_required())
-        {
-            const int  ldb            = b->info()->strides_in_bytes().y() / sizeof(TypeInput);
-            const auto in1_ptr        = reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes());
-            const int  multi_stride_b = b->info()->strides_in_bytes().z() / sizeof(TypeInput);
-
-            CpuAuxTensorHandler pretranspose(offset_int_vec(Pretranspose), _pretranspose_info, tensors, false);
-            ARM_COMPUTE_ERROR_ON(pretranspose.get()->buffer() == nullptr);
-            _gemm_kernel_asm->pretranspose_B_array(pretranspose.get()->buffer(), in1_ptr, ldb, multi_stride_b);
-
-            b->mark_as_unused();
-        }
-
-        if(_gemm_info.method == AsmConvMethod::Indirect)
-        {
-            prepare_indirect_buffer(tensors);
-        }
-
-        _is_prepared = true;
-    }
-}
-
-template <typename TypeInput, typename TypeOutput, class OutputStage>
-bool Fallback<TypeInput, TypeOutput, OutputStage>::is_configured() const
-{
-    return _optimised_kernel != nullptr;
-}
-
-template <typename TypeInput, typename TypeOutput, class OutputStage>
-experimental::MemoryRequirements Fallback<TypeInput, TypeOutput, OutputStage>::workspace() const
-{
-    return _aux_mem;
-}
-
-template <typename TypeInput, typename TypeOutput, class OutputStage>
-void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors)
-{
-    auto a = tensors.get_const_tensor(TensorType::ACL_SRC_0);
-    auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1);
-    auto c = tensors.get_const_tensor(TensorType::ACL_SRC_2);
-    auto d = tensors.get_tensor(TensorType::ACL_DST);
-
-    int       lda = a->info()->strides_in_bytes().y() / sizeof(TypeInput);
-    int       ldb = 0;
-    const int ldd = d->info()->strides_in_bytes().y() / sizeof(TypeOutput);
-
-    const size_t a_batch_idx = _gemm_info.reinterpret_input_as_3d != 0 ? 3 : 2;
-    const size_t a_multi_idx = a_batch_idx + 1;
-    const size_t d_batch_idx = _gemm_info.depth_output_gemm3d != 0 ? 3 : 2;
-    const size_t d_multi_idx = d_batch_idx + 1;
-
-    int       batch_stride_a = a->info()->strides_in_bytes()[a_batch_idx] / sizeof(TypeInput);
-    const int batch_stride_d = d->info()->strides_in_bytes()[d_batch_idx] / sizeof(TypeOutput);
-
-    int       multi_stride_a = a->info()->strides_in_bytes()[a_multi_idx] / sizeof(TypeInput);
-    int       multi_stride_b = 0;
-    const int multi_stride_d = d->info()->strides_in_bytes()[d_multi_idx] / sizeof(TypeOutput);
-
-    auto             in0_ptr = reinterpret_cast<const TypeInput *>(a->buffer() + a->info()->offset_first_element_in_bytes());
-    const TypeInput *in1_ptr = nullptr;
-    auto             out_ptr = reinterpret_cast<TypeOutput *>(d->buffer() + d->info()->offset_first_element_in_bytes());
-
-    // Check if B is pre-transposed and de-reference it if not
-    if(!_gemm_kernel_asm->B_is_pretransposed())
-    {
-        ldb            = b->info()->strides_in_bytes().y() / sizeof(TypeInput);
-        multi_stride_b = b->info()->strides_in_bytes().z() / sizeof(TypeInput);
-        in1_ptr        = reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes());
-    }
-
-    const auto scheduling_hint = scheduling_hint_heuristic(_kernel_info.method, d->info()->data_type());
-
-    // Set workspace if needed and reset number of threads as buffer manager gets re-created with max_threads
-    CpuAuxTensorHandler workspace(offset_int_vec(AsmGemmWorkspace), _workspace_info, tensors, false);
-    if(workspace.get()->buffer() != nullptr)
-    {
-        _gemm_kernel_asm->set_working_space(reinterpret_cast<void *>(workspace.get()->buffer()));
-        const unsigned int split_dim   = scheduling_hint.split_dimension();
-        const unsigned int window_size = _gemm_kernel_asm->get_window_size().total_size();
-        unsigned int       num_threads = NEScheduler::get().num_threads();
-        if(window_size < num_threads)
-        {
-            num_threads = window_size;
-        }
-        if(split_dim != IScheduler::split_dimensions_all)
-        {
-            // Make sure the kernel does not expect more threads than we can actually spawn
-            const unsigned int num_iterations = _optimised_kernel.get()->window().num_iterations(split_dim);
-            num_threads                       = std::min(num_iterations, num_threads);
-        }
-        _gemm_kernel_asm->set_nthreads(num_threads);
-    }
-
-    // Prepare assembly kernel
-    prepare(tensors);
-
-    // Set up matrix bias in the assembly kernel, it's just a pointer to matrix C.
-    TypeOutput *bias = nullptr;
-    if(c && c->info()->data_type() != DataType::S32)
-    {
-        bias = reinterpret_cast<TypeOutput *>(c->buffer() + c->info()->offset_first_element_in_bytes());
-    }
-
-    if(_gemm_info.method == AsmConvMethod::Indirect)
-    {
-        in0_ptr        = nullptr;
-        lda            = 0;
-        batch_stride_a = 0;
-        multi_stride_a = 0;
-    }
-
-    // Set gemm parameters
-    _gemm_kernel_asm->set_arrays(in0_ptr, lda, batch_stride_a, multi_stride_a,
-                                 in1_ptr, ldb, multi_stride_b,
-                                 out_ptr, ldd, batch_stride_d, multi_stride_d,
-                                 bias, 0);
-    // Schedule
-    NEScheduler::get().schedule(_optimised_kernel.get(), scheduling_hint);
-}
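
One detail worth noting in run() above: arm_gemm leading dimensions (lda, ldb, ldd) and batch/multi strides are expressed in elements, so every byte stride from the tensor info is divided by the element size, which also absorbs any row padding. A minimal standalone sketch, assuming a hypothetical F32 matrix whose rows are padded from 5 to 8 elements (not ACL code):

    #include <cstdio>

    int main()
    {
        const unsigned row_stride_bytes = 8 * sizeof(float); // padded row pitch in bytes
        const int      lda              = row_stride_bytes / sizeof(float);
        std::printf("lda = %d elements (not the logical width of 5)\n", lda);
    }
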
-
-template <typename TypeInput, typename TypeOutput>
-void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm,
-                     const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d,
-                     arm_gemm::Activation activation, const AsmGemmInfo &info)
-{
-    Params         p           = extract_parameters(a, b, d, info);
-    const CPUInfo &ci          = NEScheduler::get().cpu_info();
-    unsigned int   num_threads = NEScheduler::get().num_threads();
-
-    arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads, info.fast_mode);
-
-    // Create arm_gemm fallback
-    auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput>>();
-    fallback->configure(a, b, c, d, args, info);
-    arm_gemm = std::move(fallback);
-}
-
-template <typename TypeInput, typename TypeOutput>
-void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm,
-                           const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d,
-                           arm_gemm::Activation activation, const AsmGemmInfo &info)
-{
-    ARM_COMPUTE_UNUSED(activation);
-    Params             p           = extract_parameters(a, b, d, info);
-    const CPUInfo     &ci          = NEScheduler::get().cpu_info();
-    const unsigned int num_threads = NEScheduler::get().num_threads();
-
-    arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads, info.fast_mode);
-
-    // Create arm_gemm fallback
-    auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput, arm_gemm::Requantize32>>();
-
-    // Configure requantization info
-    const int32_t                 negation = info.negated_offsets ? 1 : -1;
-    const int32_t                 a_offset = -a->quantization_info().uniform().offset * negation;
-    const int32_t                 b_offset = -b->quantization_info().uniform().offset * negation;
-    const GEMMLowpOutputStageInfo os_info  = info.output_stage;
-
-    arm_gemm::Requantize32 gemm_requant_info{};
-    if(os_info.gemmlowp_shifts.size() > 1)
-    {
-        const auto requantize_data = fallback->set_requantize_data(os_info.gemmlowp_shifts, os_info.gemmlowp_multipliers);
-        gemm_requant_info          = arm_gemm::Requantize32(nullptr, 0,
-                                                            a_offset, b_offset, os_info.gemmlowp_offset,
-                                                            (std::get<0>(requantize_data)) ? std::get<1>(requantize_data) : nullptr,
-                                                            std::get<2>(requantize_data),
-                                                            std::get<3>(requantize_data),
-                                                            os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound);
-    }
-    else
-    {
-        gemm_requant_info = arm_gemm::Requantize32(nullptr, 0,
-                                                   a_offset, b_offset, os_info.gemmlowp_offset,
-                                                   -os_info.gemmlowp_shift, os_info.gemmlowp_multiplier,
-                                                   os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound);
-    }
-
-    // Configure fallback
-    fallback->configure(a, b, c, d, args, info, gemm_requant_info);
-    arm_gemm = std::move(fallback);
-}
-} //namespace
-
-CpuGemmAssemblyDispatch::CpuGemmAssemblyDispatch()
-    : _arm_gemm(nullptr)
-{
-}
-
-Status CpuGemmAssemblyDispatch::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info)
-{
-    ARM_COMPUTE_UNUSED(c, info);
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(a, b, d);
-    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(a);
-    ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(a);
-
-#ifndef __aarch64__
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->element_size() == 1, "8bit integer types only supported for aarch64");
-#endif /* __aarch64__ */
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S8,
-                                                         DataType::BFLOAT16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::S8,
-                                                         DataType::BFLOAT16, DataType::F16, DataType::F32);
-    if(is_data_type_quantized_per_channel(b->data_type()))
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8_SIGNED, DataType::S8);
-    }
-    else
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
-    }
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F32 && d->data_type() != DataType::F32, "Only F32 output supported for F32 input");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F16 && d->data_type() != DataType::F16, "Only F16 output supported for F16 input");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::BFLOAT16 && d->data_type() != DataType::F32, "Only F32 output supported for BFLOAT16 input");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::U8 && d->data_type() != DataType::U32, "Only U32 output supported for U8 input");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::S8 && d->data_type() != DataType::S32, "Only S32 output supported for S8 input");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::QASYMM8 && d->data_type() != DataType::QASYMM8, "Only QASYMM8 output supported for QASYMM8 input");
-    return Status{};
-}
-
-bool CpuGemmAssemblyDispatch::is_activation_supported(const ActivationLayerInfo &activation)
-{
-    arm_gemm::Activation act = assembly_utils::map_to_arm_gemm_activation(activation);
-    return act.type != arm_gemm::Activation::Type::None;
-}
-
-void CpuGemmAssemblyDispatch::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d);
-    arm_gemm::Activation act = assembly_utils::map_to_arm_gemm_activation(info.activation_info);
-
-    //If we don't support a combination of data types, silently return: it is the caller's responsibility to check if configure() was successful via is_configured()
-    if(!CpuGemmAssemblyDispatch::validate(a, b, c, d, info))
-    {
-        return;
-    }
-
-    switch(a->data_type())
-    {
-        case DataType::F32:
-            create_arm_gemm<float, float>(_arm_gemm, a, b, c, d, act, info);
-            break;
-#ifdef __aarch64__
-        case DataType::U8:
-        case DataType::QASYMM8:
-            if(d->data_type() == DataType::S32)
-            {
-                create_arm_gemm<uint8_t, uint32_t>(_arm_gemm, a, b, c, d, act, info);
-            }
-            else
-            {
-                create_arm_gemm_quant<uint8_t, uint8_t>(_arm_gemm, a, b, c, d, act, info);
-            }
-            break;
-        case DataType::S8:
-        case DataType::QASYMM8_SIGNED:
-            if(d->data_type() == DataType::S32)
-            {
-                create_arm_gemm<int8_t, int32_t>(_arm_gemm, a, b, c, d, act, info);
-            }
-            else
-            {
-                create_arm_gemm_quant<int8_t, int8_t>(_arm_gemm, a, b, c, d, act, info);
-            }
-            break;
-#endif /* __aarch64__ */
-#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)
-        case DataType::BFLOAT16:
-            create_arm_gemm<bfloat16, float>(_arm_gemm, a, b, c, d, act, info);
-            break;
-#endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        case DataType::F16:
-            create_arm_gemm<float16_t, float16_t>(_arm_gemm, a, b, c, d, act, info);
-            break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-        default:
-            break;
-    }
-}
-
-void CpuGemmAssemblyDispatch::prepare(ITensorPack &tensors)
-{
-    ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr);
-    _arm_gemm->prepare(tensors);
-}
-
-bool CpuGemmAssemblyDispatch::is_configured() const
-{
-    return _arm_gemm != nullptr && _arm_gemm->is_configured();
-}
-
-void CpuGemmAssemblyDispatch::run(ITensorPack &tensors)
-{
-    ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr);
-    _arm_gemm->run(tensors);
-}
-
-experimental::MemoryRequirements CpuGemmAssemblyDispatch::workspace() const
-{
-    ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr);
-    return _arm_gemm->workspace();
-}
-} // namespace cpu
-} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h
deleted file mode 100644
index 88cfed002a..0000000000
--- a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_INTERNAL_CPU_GEMM_ASSEMBLY_DISPATCH_H
-#define ARM_COMPUTE_CPU_INTERNAL_CPU_GEMM_ASSEMBLY_DISPATCH_H
-
-#include "src/core/common/Macros.h"
-#include "src/runtime/cpu/ICpuOperator.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-/* Convolution method supported by the assembly gemm interface */
-enum class AsmConvMethod
-{
-    Im2Col,
-    Indirect,
-    Conv
-};
-
-struct AsmGemmInfo
-{
-    AsmConvMethod           method{ AsmConvMethod::Im2Col };
-    PadStrideInfo           ps_info{};
-    ActivationLayerInfo     activation_info{};
-    GEMMLowpOutputStageInfo output_stage{};
-    bool                    negated_offsets{ true };
-    bool                    reinterpret_input_as_3d{ false };
-    bool                    depth_output_gemm3d{ false };
-    int64_t                 padding_top{ 0 };
-    int64_t                 padding_left{ 0 };
-    float                   padding_value{ 0.f };
-    bool                    fast_mode{ false };
-};
-
-/** Assembly kernel glue */
-class CpuGemmAssemblyDispatch : public ICpuOperator
-{
-public:
-    /** Constructor */
-    CpuGemmAssemblyDispatch();
-    /** Default destructor */
-    ~CpuGemmAssemblyDispatch() = default;
-
-    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmAssemblyDispatch);
-
-    class IFallback
-    {
-    public:
-        virtual void                             run(ITensorPack &tensors)     = 0;
-        virtual void                             prepare(ITensorPack &tensors) = 0;
-        virtual experimental::MemoryRequirements workspace() const             = 0;
-        virtual bool                             is_configured() const         = 0;
-        virtual ~IFallback()                                                   = default;
-    };
-
-public:
-    /** If supported, create a Compute Library function, else fall back to the arm_gemm function.
-     *
-     * @param[in]  a    Input tensor (Matrix A)
-     * @param[in]  b    Input tensor (Matrix B)
-     * @param[in]  c    Input tensor (Matrix C) used to pass the bias for quantized calculations
-     * @param[out] d    Output tensor to store the result of matrix multiplication. Data type supported: same as @p a.
-     * @param[in]  info GEMM meta-data
-     */
-    void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info);
-
-    /** Indicates whether or not this function can be used to process the given parameters.
-     *
-     * @param[in] a    Input tensor info (Matrix A)
-     * @param[in] b    Input tensor info (Matrix B)
-     * @param[in] c    Input tensor info (Matrix C) used to pass the bias for quantized calculations
-     * @param[in] d    Output tensor to store the result of matrix multiplication. Data type supported: same as @p a.
-     * @param[in] info GEMM meta-data
-     *
-     * @return a status.
-     */
-    static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info);
-    /** Checks if activation is supported by the gemm assembly dispatcher
-     *
-     * @param[in] activation Activation to check
-     *
-     * @return True if activation is supported else false
-     */
-    static bool is_activation_supported(const ActivationLayerInfo &activation);
-    /** Was the function successfully configured ?
-     *
-     * @return True if the function is configured and ready to run
-     */
-    bool is_configured() const;
-
-    // Inherited methods overridden:
-    void prepare(ITensorPack &tensors) override;
-    void run(ITensorPack &tensors) override;
-    experimental::MemoryRequirements workspace() const override;
-
-private:
-    std::unique_ptr<IFallback> _arm_gemm; /**< Interface for the arm_gemm fallback */
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_INTERNAL_CPU_GEMM_ASSEMBLY_DISPATCH_H */
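
The header above documents a contract worth spelling out: configure() returns silently when the type combination is unsupported, so callers must check is_configured() before dispatching to this path. An illustrative caller-side sketch of that flow (assumed usage written against this interface, not code from the patch; it only compiles inside the library tree, and the include uses the post-move path):

    #include "src/cpu/operators/internal/CpuGemmAssemblyDispatch.h"

    using namespace arm_compute;

    // Hypothetical helper: returns false if the assembly path cannot be used.
    bool try_assembly_gemm(cpu::CpuGemmAssemblyDispatch &dispatch,
                           const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c,
                           ITensorInfo *d, const cpu::AsmGemmInfo &info, ITensorPack &tensors)
    {
        if(!cpu::CpuGemmAssemblyDispatch::validate(a, b, c, d, info))
        {
            return false; // fall back to a generic kernel
        }
        dispatch.configure(a, b, c, d, info);
        if(!dispatch.is_configured())
        {
            return false; // configure() can still bail out silently
        }
        dispatch.prepare(tensors); // one-off work, e.g. pretransposing B
        dispatch.run(tensors);
        return true;
    }
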
diff --git a/src/runtime/cpu/utils/CpuAuxTensorHandler.h b/src/runtime/cpu/utils/CpuAuxTensorHandler.h
deleted file mode 100644
index ae1cffb659..0000000000
--- a/src/runtime/cpu/utils/CpuAuxTensorHandler.h
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPU_UTILS_CPU_AUX_TENSOR_HANDLER_H
-#define ARM_COMPUTE_CPU_UTILS_CPU_AUX_TENSOR_HANDLER_H
-
-#include "arm_compute/core/ITensorPack.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/runtime/Tensor.h"
-
-#include "src/common/utils/Log.h"
-#include "support/Cast.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-/* Tensor handler to wrap and handle tensor allocations on workspace buffers */
-class CpuAuxTensorHandler
-{
-public:
-    CpuAuxTensorHandler(int slot_id, TensorInfo &info, ITensorPack &pack, bool pack_inject = false, bool bypass_alloc = false)
-        : _tensor()
-    {
-        if(info.total_size() == 0)
-        {
-            return;
-        }
-        _tensor.allocator()->soft_init(info);
-
-        ITensor *packed_tensor = utils::cast::polymorphic_downcast<ITensor *>(pack.get_tensor(slot_id));
-        if((packed_tensor == nullptr) || (info.total_size() > packed_tensor->info()->total_size()))
-        {
-            if(!bypass_alloc)
-            {
-                _tensor.allocator()->allocate();
-                ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Allocating auxiliary tensor");
-            }
-
-            if(pack_inject)
-            {
-                pack.add_tensor(slot_id, &_tensor);
-                _injected_tensor_pack = &pack;
-                _injected_slot_id     = slot_id;
-            }
-        }
-        else
-        {
-            _tensor.allocator()->import_memory(packed_tensor->buffer());
-        }
-    }
-
-    CpuAuxTensorHandler(TensorInfo &info, ITensor &tensor)
-        : _tensor()
-    {
-        _tensor.allocator()->soft_init(info);
-        if(info.total_size() <= tensor.info()->total_size())
-        {
-            _tensor.allocator()->import_memory(tensor.buffer());
-        }
-    }
-
-    CpuAuxTensorHandler(const CpuAuxTensorHandler &) = delete;
-    CpuAuxTensorHandler &operator=(const CpuAuxTensorHandler) = delete;
-
-    ~CpuAuxTensorHandler()
-    {
-        if(_injected_tensor_pack)
-        {
-            _injected_tensor_pack->remove_tensor(_injected_slot_id);
-        }
-    }
-
-    ITensor *get()
-    {
-        return &_tensor;
-    }
-
-    ITensor *operator()()
-    {
-        return &_tensor;
-    }
-
-private:
-    Tensor       _tensor{};
-    ITensorPack *_injected_tensor_pack{ nullptr };
-    int          _injected_slot_id{ TensorType::ACL_UNKNOWN };
-};
-} // namespace cpu
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_UTILS_CPU_AUX_TENSOR_HANDLER_H */
\ No newline at end of file
-- 
cgit v1.2.1