Diffstat (limited to 'src/cpu/operators')
77 files changed, 12807 insertions, 0 deletions
diff --git a/src/cpu/operators/CpuActivation.cpp b/src/cpu/operators/CpuActivation.cpp new file mode 100644 index 0000000000..44d70cf503 --- /dev/null +++ b/src/cpu/operators/CpuActivation.cpp @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/operators/CpuActivation.h" + +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/IOperator.h" +#include "src/common/utils/LegacySupport.h" +#include "src/common/utils/Log.h" +#include "src/cpu/CpuContext.h" +#include "src/cpu/kernels/CpuActivationKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +void CpuActivation::configure(const ITensorInfo *input, ITensorInfo *output, const ActivationLayerInfo &activation_info) +{ + ARM_COMPUTE_LOG_PARAMS(input, output, activation_info); + auto k = std::make_unique<kernels::CpuActivationKernel>(); + k->configure(input, output, activation_info); + _kernel = std::move(k); +} + +Status +CpuActivation::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &activation_info) +{ + return kernels::CpuActivationKernel::validate(input, output, activation_info); +} + +void CpuActivation::run(ITensorPack &tensors) +{ + ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); + auto split_dimension = static_cast<kernels::CpuActivationKernel *>(_kernel.get())->get_split_dimension_hint(); + NEScheduler::get().schedule_op(_kernel.get(), split_dimension, _kernel->window(), tensors); +} + +std::tuple<IOperator *, StatusCode> CpuContext::create_activation(const AclTensorDescriptor &src, + const AclTensorDescriptor &dst, + const AclActivationDescriptor &act, + bool is_validate) +{ + TensorInfo src_info = detail::convert_to_legacy_tensor_info(src); + TensorInfo dst_info = detail::convert_to_legacy_tensor_info(dst); + auto info = detail::convert_to_activation_info(act); + + if (is_validate && + !bool(CpuActivation::validate(&src_info.set_is_resizable(false), &dst_info.set_is_resizable(false), info))) + { + return std::make_tuple(nullptr, StatusCode::UnsupportedConfig); + } + + auto act_op = std::make_unique<cpu::CpuActivation>(); + act_op->configure(&src_info, &dst_info, info); + + auto op = new arm_compute::IOperator(static_cast<IContext *>(this)); + if (op == nullptr) + { + ARM_COMPUTE_LOG_ERROR_ACL("Couldn't allocate internal resources"); + return std::make_tuple(nullptr, StatusCode::OutOfMemory); + } + 
op->set_internal_operator(std::move(act_op)); + + return std::make_tuple(op, StatusCode::Success); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuActivation.h b/src/cpu/operators/CpuActivation.h new file mode 100644 index 0000000000..ec442f92c8 --- /dev/null +++ b/src/cpu/operators/CpuActivation.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2021-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_ACTIVATION_H +#define ARM_COMPUTE_CPU_ACTIVATION_H + +#include "arm_compute/function_info/ActivationLayerInfo.h" + +#include "src/cpu/ICpuOperator.h" + +namespace arm_compute +{ +namespace cpu +{ +/** Basic function to run @ref kernels::CpuActivationKernel */ +class CpuActivation : public ICpuOperator +{ +public: + /** Configure operator for a given list of arguments + * + * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32. + * @param[out] output Destination tensor info. Data type supported: same as @p input + * @param[in] activation_info Activation layer parameters. + */ + void configure(const ITensorInfo *input, ITensorInfo *output, const ActivationLayerInfo &activation_info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuActivation::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_ACTIVATION_H */ diff --git a/src/cpu/operators/CpuAdd.cpp b/src/cpu/operators/CpuAdd.cpp new file mode 100644 index 0000000000..53cd7fa1b7 --- /dev/null +++ b/src/cpu/operators/CpuAdd.cpp @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2021-2022 Arm Limited.
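CpuActivation above follows the stateless operator pattern used throughout src/cpu/operators: configure() and validate() consume only ITensorInfo metadata, while run() receives the backing memory through an ITensorPack. A minimal usage sketch of driving the internal API directly, with hypothetical shapes (the public entry point that wraps this operator is NEActivationLayer):

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/Tensor.h"
    #include "src/cpu/operators/CpuActivation.h"

    using namespace arm_compute;

    // Metadata only: no buffers are involved at configure time.
    TensorInfo src_info(TensorShape(27U, 13U), 1, DataType::F32);
    TensorInfo dst_info(TensorShape(27U, 13U), 1, DataType::F32);
    const ActivationLayerInfo act(ActivationLayerInfo::ActivationFunction::RELU);

    cpu::CpuActivation op;
    if (bool(cpu::CpuActivation::validate(&src_info, &dst_info, act)))
    {
        op.configure(&src_info, &dst_info, act);

        // Bind real memory only at run time, through an ITensorPack.
        Tensor src, dst;
        src.allocator()->init(src_info);
        dst.allocator()->init(dst_info);
        src.allocator()->allocate();
        dst.allocator()->allocate();

        ITensorPack pack = {{TensorType::ACL_SRC, &src}, {TensorType::ACL_DST, &dst}};
        op.run(pack);
    }

This mirrors what CpuContext::create_activation does above: validate against the infos first, configure once, then run against packed tensors.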
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/operators/CpuAdd.h" + +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuAddKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +void CpuAdd::configure(const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_UNUSED(act_info); + ARM_COMPUTE_LOG_PARAMS(src0, src1, dst, policy, act_info); + auto k = std::make_unique<kernels::CpuAddKernel>(); + k->configure(src0, src1, dst, policy); + _kernel = std::move(k); +} + +Status CpuAdd::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); + return kernels::CpuAddKernel::validate(src0, src1, dst, policy); +} + +void CpuAdd::run(ITensorPack &tensors) +{ + const auto split_dimension = static_cast<kernels::CpuAddKernel *>(_kernel.get())->get_split_dimension(); + + NEScheduler::get().schedule_op(_kernel.get(), split_dimension, _kernel->window(), tensors); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuAdd.h b/src/cpu/operators/CpuAdd.h new file mode 100644 index 0000000000..5f60102de2 --- /dev/null +++ b/src/cpu/operators/CpuAdd.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2021-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_ADD_H +#define ARM_COMPUTE_CPU_ADD_H + +#include "arm_compute/function_info/ActivationLayerInfo.h" + +#include "src/cpu/ICpuOperator.h" + +namespace arm_compute +{ +namespace cpu +{ +/** Basic function to run @ref kernels::CpuAddKernel */ +class CpuAdd : public ICpuOperator +{ +public: + /** Initialise the kernel's input, dst and border mode. + * + * Valid configurations (src0,src1) -> dst : + * + * - (U8,U8) -> U8 + * - (S16,S16) -> S16 + * - (S32,S32) -> S32 + * - (F16,F16) -> F16 + * - (F32,F32) -> F32 + * - (QASYMM8,QASYMM8) -> QASYMM8 + * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED + * - (QSYMM16,QSYMM16) -> QSYMM16 + * + * @param[in] src0 First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32 + * @param[in] src1 Second input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32 + * @param[out] dst The dst tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32. + * @param[in] policy Overflow policy. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. + * + */ + void configure(const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuAdd::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_ADD_H */ diff --git a/src/cpu/operators/CpuAddMulAdd.cpp b/src/cpu/operators/CpuAddMulAdd.cpp new file mode 100644 index 0000000000..2f19f2f842 --- /dev/null +++ b/src/cpu/operators/CpuAddMulAdd.cpp @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
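Note that CpuAdd::validate() above explicitly rejects a fused activation (ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled())), so act_info is a placeholder for future fusion. A short sketch with hypothetical F32 inputs, following the same pattern as the CpuActivation example; the run-time pack uses the ACL_SRC_0/ACL_SRC_1/ACL_DST slots:

    TensorInfo a_info(TensorShape(16U, 16U), 1, DataType::F32);
    TensorInfo b_info(TensorShape(16U, 16U), 1, DataType::F32);
    TensorInfo sum_info(TensorShape(16U, 16U), 1, DataType::F32);

    cpu::CpuAdd add;
    // Leave act_info at its default: an enabled activation fails validation.
    if (bool(cpu::CpuAdd::validate(&a_info, &b_info, &sum_info, ConvertPolicy::SATURATE)))
    {
        add.configure(&a_info, &b_info, &sum_info, ConvertPolicy::SATURATE);
        // run time: ITensorPack pack = {{TensorType::ACL_SRC_0, &a},
        //                               {TensorType::ACL_SRC_1, &b},
        //                               {TensorType::ACL_DST, &sum}};
        // add.run(pack);
    }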
+ */ +#include "src/cpu/operators/CpuAddMulAdd.h" + +#include "arm_compute/core/experimental/Types.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/cpu/kernels/CpuAddMulAddKernel.h" +#include "src/cpu/utils/CpuAuxTensorHandler.h" + +namespace arm_compute +{ +namespace cpu +{ +void CpuAddMulAdd::configure(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *bn_mul, + const ITensorInfo *bn_add, + ITensorInfo *add_output, + ITensorInfo *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_LOG_PARAMS(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info); + + auto k = std::make_unique<kernels::CpuAddMulAddKernel>(); + + const DataType data_type = input1->data_type(); + if (is_data_type_quantized(data_type)) + { + _dequantize_bn_mul.configure(bn_mul, &_dequantized_bn_mul); + _dequantize_bn_add.configure(bn_add, &_dequantized_bn_add); + + k->configure(input1, input2, &_dequantized_bn_mul, &_dequantized_bn_add, add_output, final_output, policy, + act_info); + + // Save auxiliary memory requirements after configuration + _aux_mem[DequantizedBnMul] = + experimental::MemoryInfo(offset_int_vec(DequantizedBnMul), experimental::MemoryLifetime::Temporary, + _dequantized_bn_mul.total_size()); + _aux_mem[DequantizedBnAdd] = + experimental::MemoryInfo(offset_int_vec(DequantizedBnAdd), experimental::MemoryLifetime::Temporary, + _dequantized_bn_add.total_size()); + } + else + { + k->configure(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info); + } + + _kernel = std::move(k); +} + +Status CpuAddMulAdd::validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *bn_mul, + const ITensorInfo *bn_add, + const ITensorInfo *add_output, + const ITensorInfo *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) +{ + const DataType data_type = input1->data_type(); + if (is_data_type_quantized(data_type)) + { + TensorInfo dequantized_bn_mul = bn_mul->clone()->set_data_type(DataType::F32); + TensorInfo dequantized_bn_add = bn_add->clone()->set_data_type(DataType::F32); + + ARM_COMPUTE_RETURN_ON_ERROR(CpuDequantize::validate(bn_mul, &dequantized_bn_mul)); + ARM_COMPUTE_RETURN_ON_ERROR(CpuDequantize::validate(bn_add, &dequantized_bn_add)); + + return kernels::CpuAddMulAddKernel::validate(input1, input2, &dequantized_bn_mul, &dequantized_bn_add, + add_output, final_output, policy, act_info); + } + else + { + return kernels::CpuAddMulAddKernel::validate(input1, input2, bn_mul, bn_add, add_output, final_output, policy, + act_info); + } +} + +void CpuAddMulAdd::run(ITensorPack &tensors) +{ + const DataType data_type = tensors.get_const_tensor(TensorType::ACL_SRC_0)->info()->data_type(); + + if (is_data_type_quantized(data_type)) + { + const ITensor *bn_mul = tensors.get_const_tensor(TensorType::ACL_SRC_2); + const ITensor *bn_add = tensors.get_const_tensor(TensorType::ACL_SRC_3); + + CpuAuxTensorHandler dequantized_bn_mul_handler(offset_int_vec(DequantizedBnMul), _dequantized_bn_mul, tensors, + true); + CpuAuxTensorHandler dequantized_bn_add_handler(offset_int_vec(DequantizedBnAdd), _dequantized_bn_add, tensors, + true); + + ITensorPack dequantize_mul_pack = {{TensorType::ACL_SRC_0, bn_mul}, + {TensorType::ACL_DST_0, dequantized_bn_mul_handler.get()}}; + + ITensorPack dequantize_add_pack = {{TensorType::ACL_SRC_0, bn_add}, + {TensorType::ACL_DST_0,
dequantized_bn_add_handler.get()}}; + + _dequantize_bn_mul.run(dequantize_mul_pack); + _dequantize_bn_add.run(dequantize_add_pack); + + ITensorPack add_mul_add_pack = { + {TensorType::ACL_SRC_0, tensors.get_const_tensor(TensorType::ACL_SRC_0)}, + {TensorType::ACL_SRC_1, tensors.get_const_tensor(TensorType::ACL_SRC_1)}, + {TensorType::ACL_SRC_2, dequantized_bn_mul_handler.get()}, + {TensorType::ACL_SRC_3, dequantized_bn_add_handler.get()}, + {TensorType::ACL_DST_0, tensors.get_tensor(TensorType::ACL_DST_0)}, + {TensorType::ACL_DST_1, tensors.get_tensor(TensorType::ACL_DST_1)}, + }; + + NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), add_mul_add_pack); + } + else + { + NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors); + } +} + +experimental::MemoryRequirements CpuAddMulAdd::workspace() const +{ + return _aux_mem; +} + +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuAddMulAdd.h b/src/cpu/operators/CpuAddMulAdd.h new file mode 100644 index 0000000000..47db75c37e --- /dev/null +++ b/src/cpu/operators/CpuAddMulAdd.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CPU_OPERATORS_CPUADDMULADD +#define SRC_CPU_OPERATORS_CPUADDMULADD + +#include "arm_compute/core/TensorInfo.h" + +#include "src/cpu/ICpuOperator.h" +#include "src/cpu/operators/CpuDequantize.h" + +namespace arm_compute +{ +namespace cpu +{ +/** Basic function to run @ref kernels::CpuAddMulAddKernel */ +class CpuAddMulAdd : public ICpuOperator +{ +public: + /** Initialize the operator's inputs and outputs. 
+ * + * Similar to @ref NEAddMulAdd::configure() + * + */ + void configure(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *bn_mul, + const ITensorInfo *bn_add, + ITensorInfo *add_output, + ITensorInfo *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuAddMulAdd::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *bn_mul, + const ITensorInfo *bn_add, + const ITensorInfo *add_output, + const ITensorInfo *final_output, + ConvertPolicy policy, + const ActivationLayerInfo &act_info); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + + // We need auxiliary memory to dequantize batchnorm coefficients + experimental::MemoryRequirements workspace() const override; + +private: + enum AuxTensorIdx + { + DequantizedBnMul = 0, + DequantizedBnAdd, + Count + }; + + CpuDequantize _dequantize_bn_mul{}; + CpuDequantize _dequantize_bn_add{}; + + TensorInfo _dequantized_bn_mul{}; + TensorInfo _dequantized_bn_add{}; + + experimental::MemoryRequirements _aux_mem{Count}; +}; +} // namespace cpu +} // namespace arm_compute +#endif /* SRC_CPU_OPERATORS_CPUADDMULADD */ diff --git a/src/cpu/operators/CpuCast.cpp b/src/cpu/operators/CpuCast.cpp new file mode 100644 index 0000000000..55b9204d71 --- /dev/null +++ b/src/cpu/operators/CpuCast.cpp @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
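CpuAddMulAdd's workspace() advertises the Temporary scratch buffers that run() later resolves from the pack via CpuAuxTensorHandler and offset_int_vec(). A hedged caller-side sketch of that generic contract, assuming the slot/size fields of experimental::MemoryInfo and an op/pack pair set up as in the earlier sketches:

    // Satisfy the workspace() contract: one U8 scratch tensor per requirement,
    // registered in the pack under the requirement's slot id.
    std::vector<std::unique_ptr<Tensor>> scratch;
    for (const experimental::MemoryInfo &req : op.workspace())
    {
        if (req.size == 0)
        {
            continue;
        }
        auto t = std::make_unique<Tensor>();
        t->allocator()->init(TensorInfo(TensorShape(req.size), 1, DataType::U8));
        t->allocator()->allocate();
        pack.add_tensor(req.slot, t.get()); // run() finds it via CpuAuxTensorHandler
        scratch.push_back(std::move(t));
    }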
+ */ +#include "src/cpu/operators/CpuCast.h" + +#include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuCastKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +void CpuCast::configure(const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy) +{ + ARM_COMPUTE_LOG_PARAMS(src, dst, policy); + auto k = std::make_unique<kernels::CpuCastKernel>(); + k->configure(src, dst, policy); + _kernel = std::move(k); +} + +Status CpuCast::validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy) +{ + return kernels::CpuCastKernel::validate(src, dst, policy); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuCast.h b/src/cpu/operators/CpuCast.h new file mode 100644 index 0000000000..1f4da6e2a0 --- /dev/null +++ b/src/cpu/operators/CpuCast.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2021, 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_OPERATORS_CPUCAST_H +#define ACL_SRC_CPU_OPERATORS_CPUCAST_H + +#include "src/cpu/ICpuOperator.h" + +namespace arm_compute +{ +namespace cpu +{ +/** Basic function to run @ref kernels::CpuCastKernel */ +class CpuCast : public ICpuOperator +{ +public: + /** Configure operator for a given list of arguments + * + * Input data type must be different than output data type. + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:-----------------------------------------------| + * |QASYMM8_SIGNED | S16, S32, F32, F16 | + * |QASYMM8 | U16, S16, S32, F32, F16 | + * |U8 | U16, S16, S32, F32, F16 | + * |U16 | U8, U32 | + * |S16 | QASYMM8_SIGNED, U8, S32 | + * |F16 | QASYMM8_SIGNED, QASYMM8, F32, S32, U8 | + * |S32 | QASYMM8_SIGNED, QASYMM8, F16, F32, U8 | + * |F32 | QASYMM8_SIGNED, QASYMM8, F16, S32, U8| + * |S64 | F32 | + * + * @param[in] src The source tensor to convert. Data types supported: U8/S8/U16/S16/U32/S32/S64/F16/F32. + * @param[out] dst The destination tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32. + * @param[in] policy Conversion policy. 
+ * + * + */ + void configure(const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuCast::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy); +}; +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_OPERATORS_CPUCAST_H diff --git a/src/cpu/operators/CpuConcatenate.cpp b/src/cpu/operators/CpuConcatenate.cpp new file mode 100644 index 0000000000..5f517a8fcb --- /dev/null +++ b/src/cpu/operators/CpuConcatenate.cpp @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
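CpuCast is the thinnest of these wrappers: it only validates and configures the kernel and inherits the default run() from ICpuOperator, and the table above constrains which conversions are legal. A sketch of a U8-to-F32 conversion, one of the listed configurations (shapes hypothetical):

    TensorInfo u8_info(TensorShape(32U), 1, DataType::U8);
    TensorInfo f32_info(TensorShape(32U), 1, DataType::F32);

    cpu::CpuCast cast;
    if (bool(cpu::CpuCast::validate(&u8_info, &f32_info, ConvertPolicy::SATURATE)))
    {
        cast.configure(&u8_info, &f32_info, ConvertPolicy::SATURATE);
        // run() is inherited from ICpuOperator; feed {{ACL_SRC, &src}, {ACL_DST, &dst}}.
    }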
+ */ +#include "src/cpu/operators/CpuConcatenate.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/cpu/kernels/CpuConcatenateBatchKernel.h" +#include "src/cpu/kernels/CpuConcatenateDepthKernel.h" +#include "src/cpu/kernels/CpuConcatenateHeightKernel.h" +#include "src/cpu/kernels/CpuConcatenateWidthKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +void CpuConcatenate::configure(const std::vector<const ITensorInfo *> &srcs_vector, ITensorInfo *dst, size_t axis) +{ + ARM_COMPUTE_ERROR_ON(dst == nullptr); + ARM_COMPUTE_LOG_PARAMS(srcs_vector, dst, axis); + + _axis = axis; + _num_srcs = srcs_vector.size(); + + TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(srcs_vector, axis); + + // Output auto initialization if not yet initialized + auto_init_if_empty(*dst, dst_shape, 1, srcs_vector[0]->data_type()); + ARM_COMPUTE_ERROR_THROW_ON(CpuConcatenate::validate(srcs_vector, dst, axis)); + + unsigned int offset = 0; + + for (unsigned int i = 0; i < _num_srcs; ++i) + { + switch (axis) + { + case Window::DimX: + { + auto kernel = std::make_unique<kernels::CpuConcatenateWidthKernel>(); + kernel->configure(srcs_vector.at(i), offset, dst); + _concat_kernels.emplace_back(std::move(kernel)); + break; + } + case Window::DimY: + { + auto kernel = std::make_unique<kernels::CpuConcatenateHeightKernel>(); + kernel->configure(srcs_vector.at(i), offset, dst); + _concat_kernels.emplace_back(std::move(kernel)); + break; + } + case Window::DimZ: + { + auto kernel = std::make_unique<kernels::CpuConcatenateDepthKernel>(); + kernel->configure(srcs_vector.at(i), offset, dst); + _concat_kernels.emplace_back(std::move(kernel)); + break; + } + case 3: + { + auto kernel = std::make_unique<kernels::CpuConcatenateBatchKernel>(); + kernel->configure(srcs_vector.at(i), offset, dst); + _concat_kernels.emplace_back(std::move(kernel)); + break; + } + default: + ARM_COMPUTE_ERROR("Axis not supported"); + } + offset += srcs_vector.at(i)->dimension(axis); + } +} + +Status +CpuConcatenate::validate(const std::vector<const ITensorInfo *> &srcs_vector, const ITensorInfo *dst, size_t axis) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst); + ARM_COMPUTE_RETURN_ERROR_ON(srcs_vector.size() < 2); + + unsigned int offset = 0; + for (const auto &src : srcs_vector) + { + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); + switch (axis) + { + case Window::DimX: + { + ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConcatenateWidthKernel::validate(src, offset, dst)); + break; + } + case Window::DimY: + { + ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConcatenateHeightKernel::validate(src, offset, dst)); + break; + } + case Window::DimZ: + { + ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConcatenateDepthKernel::validate(src, offset, dst)); + break; + } + case 3: + { + ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConcatenateBatchKernel::validate(src, offset, dst)); + break; + } + default: + ARM_COMPUTE_ERROR("Axis not supported"); + } + offset += src->dimension(axis); + } + + if (dst->total_size() != 0) + { + TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(srcs_vector, axis); + ARM_COMPUTE_RETURN_ERROR_ON(dst_shape.total_size() !=
dst->tensor_shape().total_size()); + } + + return Status{}; +} + +void CpuConcatenate::run(ITensorPack &tensors) +{ + if (tensors.empty()) + { + ARM_COMPUTE_ERROR("No inputs provided"); + } + + if (static_cast<int>(tensors.size() - 1) != static_cast<int>(_num_srcs)) + { + ARM_COMPUTE_ERROR("Configured with a different number of inputs"); + } + + int i = 0; + for (auto &k : _concat_kernels) + { + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(ACL_SRC_VEC + i)); + pack.add_tensor(TensorType::ACL_DST, tensors.get_tensor(ACL_DST)); + NEScheduler::get().schedule_op(k.get(), Window::DimY, k->window(), pack); + ++i; + } +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuConcatenate.h b/src/cpu/operators/CpuConcatenate.h new file mode 100644 index 0000000000..c36977c70f --- /dev/null +++ b/src/cpu/operators/CpuConcatenate.h @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_CONCATENATE_H +#define ARM_COMPUTE_CPU_CONCATENATE_H + +#include "src/cpu/ICpuKernel.h" +#include "src/cpu/ICpuOperator.h" + +#include <vector> + +namespace arm_compute +{ +namespace cpu +{ +/** Basic function to concatenate tensors along a given axis. This function calls the following kernels: + * + * -# @ref kernels::CpuConcatenateWidthKernel (if underlying concatenation axis is 0). + * -# @ref kernels::CpuConcatenateHeightKernel (if underlying concatenation axis is 1). + * -# @ref kernels::CpuConcatenateDepthKernel (if underlying concatenation axis is 2). + * -# @ref kernels::CpuConcatenateBatchKernel (if underlying concatenation axis is 3). + */ +class CpuConcatenate : public ICpuOperator +{ +public: + CpuConcatenate() = default; + /** Configure operator for a given list of arguments + * + * @note Input and output tensor dimension preconditions differ depending on the concatenation axis. + * @note Preconditions can be found respectively at @ref kernels::CpuConcatenateWidthKernel, @ref kernels::CpuConcatenateHeightKernel, + * @ref kernels::CpuConcatenateDepthKernel and @ref kernels::CpuConcatenateBatchKernel. + * + * @param[in,out] srcs_vector The vector containing all the tensors to concatenate. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[out] dst Output tensor. Data types supported: Same as @p srcs_vector. + * @param[in] axis Concatenation axis.
Supported underlying concatenation axes are 0, 1, 2 and 3. + */ + void configure(const std::vector<const ITensorInfo *> &srcs_vector, ITensorInfo *dst, size_t axis); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuConcatenate::configure() + * + * @return a status + */ + static Status validate(const std::vector<const ITensorInfo *> &srcs_vector, const ITensorInfo *dst, size_t axis); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + +private: + std::vector<std::unique_ptr<ICPPKernel>> _concat_kernels{}; + unsigned int _num_srcs{0}; + unsigned int _axis{0}; +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_CONCATENATE_H */ diff --git a/src/cpu/operators/CpuConv2d.cpp b/src/cpu/operators/CpuConv2d.cpp new file mode 100644 index 0000000000..26ca2ee783 --- /dev/null +++ b/src/cpu/operators/CpuConv2d.cpp @@ -0,0 +1,309 @@ +/* + * Copyright (c) 2017-2021, 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
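CpuConcatenate above builds one kernel per input, each writing at a running offset along the chosen axis, and run() fetches input i under the slot ACL_SRC_VEC + i. A sketch concatenating two hypothetical tensors along the width axis; leaving dst empty lets configure() auto-initialize its shape:

    TensorInfo in0(TensorShape(8U, 4U), 1, DataType::F32);
    TensorInfo in1(TensorShape(8U, 4U), 1, DataType::F32);
    TensorInfo out{}; // auto-initialized by configure() to (16, 4)

    cpu::CpuConcatenate concat;
    concat.configure({&in0, &in1}, &out, 0);

    // run-time pack, one slot per input:
    // pack.add_const_tensor(TensorType::ACL_SRC_VEC + 0, &t0);
    // pack.add_const_tensor(TensorType::ACL_SRC_VEC + 1, &t1);
    // pack.add_tensor(TensorType::ACL_DST, &t_out);
    // concat.run(pack);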
+ */ +#include "src/cpu/operators/CpuConv2d.h" + +#include "arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/cpu/operators/CpuDirectConv2d.h" +#include "src/cpu/operators/CpuGemm.h" +#include "src/cpu/operators/CpuGemmConv2d.h" +#include "src/cpu/operators/CpuGemmDirectConv2d.h" +#include "src/cpu/operators/CpuWinogradConv2d.h" + +namespace arm_compute +{ +namespace cpu +{ +CpuConv2d::CpuConv2d() : _function() +{ +} + +CpuConv2d::~CpuConv2d() = default; + +void CpuConv2d::configure(ITensorInfo *input, + ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) +{ + // Perform validate step + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_UNUSED(num_groups); + ARM_COMPUTE_ERROR_THROW_ON(CpuConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, + act_info, enable_fast_math, num_groups)); + + ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info, + enable_fast_math, num_groups); + + const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups); + switch (CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, + enable_fast_math)) + { + case ConvolutionMethod::WINOGRAD: + { + auto f = std::make_unique<CpuWinogradConv2d>(); + f->configure(input, weights, biases, output, conv_info, act_info, enable_fast_math); + _function = std::move(f); + break; + } + case ConvolutionMethod::GEMM: + { + auto f = std::make_unique<CpuGemmConv2d>(); + f->configure(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math); + _function = std::move(f); + break; + } + case ConvolutionMethod::GEMM_CONV2D: + { + auto f = std::make_unique<CpuGemmDirectConv2d>(); + f->configure(input, weights, biases, output, info); + _function = std::move(f); + break; + } + case ConvolutionMethod::DIRECT: + { + auto f = std::make_unique<CpuDirectConv2d>(); + f->configure(input, weights, biases, output, conv_info, act_info); + _function = std::move(f); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported."); + break; + } + + _aux_mem = _function->workspace(); +} + +Status CpuConv2d::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) +{ + ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1), "Grouping (num_groups != 1) is not supported on Neon"); + + const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups); + switch (CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, + enable_fast_math)) + { + case ConvolutionMethod::WINOGRAD: + ARM_COMPUTE_RETURN_ON_ERROR( + CpuWinogradConv2d::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math)); + break; + case ConvolutionMethod::GEMM: + ARM_COMPUTE_RETURN_ON_ERROR(CpuGemmConv2d::validate(input, weights, biases, output, conv_info, weights_info, + dilation, act_info, enable_fast_math)); + break; + case ConvolutionMethod::GEMM_CONV2D: + 
ARM_COMPUTE_RETURN_ON_ERROR(CpuGemmDirectConv2d::validate(input, weights, biases, output, info)); + break; + case ConvolutionMethod::DIRECT: + ARM_COMPUTE_RETURN_ON_ERROR(CpuDirectConv2d::validate(input, weights, biases, output, conv_info, act_info)); + break; + default: + ARM_COMPUTE_ERROR("Not supported."); + break; + } + + return Status{}; +} + +ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, weights); + ARM_COMPUTE_UNUSED(weights_info); + + const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); + const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); + const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL); + + const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, 1); + + /* Input spatial dims, kernel size, IFM/OFM, conv info*/ + using ConvolutionConfiguration = std::tuple<Size2D, Size2D, Size2D, PadStrideInfo>; + using ConfigurationMethod = std::pair<ConvolutionConfiguration, ConvolutionMethod>; + + const std::vector<ConfigurationMethod> known_configs = { + // Alexnet + ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U), + PadStrideInfo(1U, 1U, 2U, 2U)), + ConvolutionMethod::GEMM), + // VGG16 / VGG19 + ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U), + PadStrideInfo(1U, 1U, 1U, 1U)), + ConvolutionMethod::GEMM), + // Mobilenet 224 + ConfigurationMethod( + ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), + PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)), + ConvolutionMethod::GEMM), + // Mobilenet 160 + ConfigurationMethod( + ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), + PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)), + ConvolutionMethod::GEMM)}; + + const auto find_config = [&](ConfigurationMethod c) + { + const ConvolutionConfiguration config = c.first; + const PadStrideInfo info = std::get<3>(config); + + return std::get<0>(config) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) && + std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) && + std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && + info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right() && + info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && + info.stride() == conv_info.stride(); + }; + + std::vector<ConfigurationMethod>::const_iterator found; + if ((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end()) + { + return (*found).second; + } + + if (dilation != Size2D(1U, 1U)) + { + return ConvolutionMethod::GEMM; + } + else + { + const bool gemmDirectConv2d_validates = + bool(CpuGemmDirectConv2d::validate(input, weights, nullptr, output, info)); + + // SRGAN + // Output might not be initialized when it is an internal tensor of the layer using the convolution + if (input->total_size() > 1e7 && weights->dimension(idx_h) > 7) + { + // This configuration is memory demanding for GEMM method. 
GEMM_CONV2D which uses indirect convolution + // kernels underneath is the best option. + if (gemmDirectConv2d_validates) + { + return ConvolutionMethod::GEMM_CONV2D; + } + else if (bool(CpuDirectConv2d::validate(input, weights, nullptr, output, conv_info, act_info))) + { + // NCHW data layout is not supported by GEMM_CONV2D + return ConvolutionMethod::DIRECT; + } + } + if (input->dimension(idx_c) < 16) + { + return ConvolutionMethod::GEMM; + } + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + // This heuristic only applies to the F16 data type on A55r1 + if (NEScheduler::get().cpu_info().get_cpu_model() == CPUModel::A55r1 && enable_fast_math && + input->data_type() == DataType::F16) + { + // Exclude known bad winograd configs (and default to GEMM) + const std::vector<ConvolutionConfiguration> known_bad_winograd_f16_with_fastmath_configs = { + // Squeezenet_V1_1 fire2 and fire3 + ConvolutionConfiguration(Size2D(56U, 56U), Size2D(3U, 3U), Size2D(16U, 64U), + PadStrideInfo(1U, 1U, 1U, 1U)), + // Squeezenet_V1_1 fire6 and fire7 + ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(48U, 192U), + PadStrideInfo(1U, 1U, 1U, 1U)), + // Squeezenet_V1_1 fire8 and fire9 + ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(64U, 256U), + PadStrideInfo(1U, 1U, 1U, 1U)), + }; + const auto find_conv_config = [&](ConvolutionConfiguration c) + { + const PadStrideInfo info = std::get<3>(c); + + return std::get<0>(c) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) && + std::get<1>(c) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) && + std::get<2>(c) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && + info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right() && + info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && + info.stride() == conv_info.stride(); + }; + + bool found_bad = std::find_if(known_bad_winograd_f16_with_fastmath_configs.begin(), + known_bad_winograd_f16_with_fastmath_configs.end(), + find_conv_config) != known_bad_winograd_f16_with_fastmath_configs.end(); + if (found_bad) + { + return ConvolutionMethod::GEMM; + } + } +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + + // For 1x1 convolutions run the default GEMM + if (weights->dimension(idx_w) == 1 && weights->dimension(idx_h) == 1) + { + return ConvolutionMethod::GEMM; + } + + if (bool(CpuWinogradConv2d::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math))) + { + return ConvolutionMethod::WINOGRAD; + } + if (gemmDirectConv2d_validates) + { + return ConvolutionMethod::GEMM_CONV2D; + } + return ConvolutionMethod::GEMM; + } +} + +void CpuConv2d::run(ITensorPack &tensors) +{ + prepare(tensors); + _function->run(tensors); +} + +void CpuConv2d::prepare(ITensorPack &tensors) +{ + _function->prepare(tensors); +} + +experimental::MemoryRequirements CpuConv2d::workspace() const +{ + return _aux_mem; +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuConv2d.h b/src/cpu/operators/CpuConv2d.h new file mode 100644 index 0000000000..71b9e15dc1 --- /dev/null +++ b/src/cpu/operators/CpuConv2d.h @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2017-2021, 2023 Arm Limited.
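The heuristic above is worth restating in order: known-config table lookup, dilated convolutions to GEMM, very large SRGAN-style inputs to GEMM_CONV2D or DIRECT, fewer than 16 input channels to GEMM, an A55r1 F16 fast-math Winograd denylist, 1x1 kernels to GEMM, then Winograd, GEMM_CONV2D and finally GEMM as fallbacks. Because get_convolution_method() is static and side-effect free, the choice can be probed before configuring; a sketch using the VGG16-style entry from the known_configs table (hypothetical infos, default NCHW layout assumed):

    // Shapes matching the VGG16 entry: 224x224x3 input, 3x3 kernel, 64 OFM, pad 1, stride 1.
    TensorInfo src(TensorShape(224U, 224U, 3U), 1, DataType::F32);
    TensorInfo wei(TensorShape(3U, 3U, 3U, 64U), 1, DataType::F32);
    TensorInfo dst(TensorShape(224U, 224U, 64U), 1, DataType::F32);
    const PadStrideInfo conv_info(1, 1, 1, 1);

    const ConvolutionMethod m = cpu::CpuConv2d::get_convolution_method(
        &src, &wei, &dst, conv_info, WeightsInfo(), Size2D(1U, 1U), ActivationLayerInfo(), false);
    // Hits the known-config table above, so m == ConvolutionMethod::GEMM.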
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/function_info/ActivationLayerInfo.h" + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuOperator.h" + +namespace arm_compute +{ +namespace cpu +{ +/** Basic function to simulate a convolution layer. This function calls one of the following functions: + * -# @ref CpuGemm (executed only in case GEMM is required for the operation) + * -# @ref CpuWinogradConv2d (executed only in case Winograd is required for the operation) + * -# @ref CpuDirectConv2d (executed only in case Direct Convolution is required for the operation) + * + * + * The function selects one of the algorithms mentioned above based on: + * - The size of the kernel + * - Number of input/output feature maps + * - Amount of memory needed + * + * Generally GEMM-based convolution is executed when neither Winograd nor FFT nor Direct convolution can be performed. + * + * FP32 Algorithm| Filter Size | Input/Output feature maps | + * --------------|----------------------------------------------------|-------------------------------------------| + * Winograd | 3x3 1x3 3x1 5x1 1x5 5x5(fast maths) 7x1 1x7 | Input channels is greater than 3 | + * FFT | Squared kernels and greater than 9x9 | Input feature maps > Output feature maps | + * DirectConv | 9x9 | | + * GEMM | Any size | | + * + * Winograd 5x5 requires fast maths enabled. + * + * FP16 Algorithm| Filter Size | + * --------------|------------------| + * Winograd | Not supported | + * FFT | Not supported | + * DirectConv | 9x9 | + * GEMM | Any size | + * + * + */ +class CpuConv2d : public ICpuOperator +{ +public: + /** Constructor */ + CpuConv2d(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuConv2d); + /** Default destructor */ + ~CpuConv2d(); + /** Set the input and output tensors. + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:------------------|:------|:--------------| + * |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 | + * |QASYMM8 |QASYMM8 |S32 |QASYMM8 | + * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | + * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED | + * + * @param[in] src Source tensor info. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. 
+ * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. + * Data type supported: Same as @p src, also could be QSYMM8_PER_CHANNEL if input is QASYMM8/QASYMM8_SIGNED. + * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. + * Data type supported: Same as @p src, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type. + * @param[out] dst Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. + * Data types supported: Same as @p src. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + * @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights + * tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p input. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. + * @param[in] enable_fast_math (Optional) Enable fast math computation. If this flag is set, the function may dispatch the fastest implementation + * available, which may introduce a drop in accuracy as well. Default is false + * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported + */ + void configure(ITensorInfo *src, + ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false, + unsigned int num_groups = 1); + /** Static function to check if given info will lead to a valid configuration of @ref CpuConv2d + * + * Similar to CpuConv2d::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false, + unsigned int num_groups = 1); + /** Static function to check if given info will return the convolution called by @ref CpuConv2d + * + * @param[in] src Source tensor info. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. + * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. + * Data type supported: Same as @p src, also could be QSYMM8_PER_CHANNEL if input is QASYMM8/QASYMM8_SIGNED. + * @param[in] dst Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. + * Data types supported: Same as @p src. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights + * tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p input. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + * @param[in] enable_fast_math (Optional) Enable fast math computation. If this flag is set, the function may dispatch the fastest implementation + * available, which may introduce a drop in accuracy as well. Default is false + * + * @return the Convolution Method Hint + */ + static ConvolutionMethod get_convolution_method(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info = WeightsInfo(), + const Size2D &dilation = Size2D(1U, 1U), + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false); + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &constants) override; + experimental::MemoryRequirements workspace() const override; + +private: + std::unique_ptr<ICpuOperator> _function; + experimental::MemoryRequirements _aux_mem{}; +}; +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuConvertFullyConnectedWeights.cpp b/src/cpu/operators/CpuConvertFullyConnectedWeights.cpp new file mode 100644 index 0000000000..49e31926e3 --- /dev/null +++ b/src/cpu/operators/CpuConvertFullyConnectedWeights.cpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
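Putting CpuConv2d together: configure() instantiates exactly one of the wrapped operators and copies its workspace() requirements into _aux_mem, while run() calls prepare() before every execution, so the wrapped operators can make their one-shot constant-weight transformations on the first run. A hedged end-to-end sketch reusing the src/wei/dst infos and conv_info from the probe above (the tensor objects src_t/wei_t/dst_t are hypothetical):

    cpu::CpuConv2d conv;
    conv.configure(&src, &wei, nullptr, &dst, conv_info); // no bias in this sketch

    ITensorPack pack = {{TensorType::ACL_SRC_0, &src_t},  // input
                        {TensorType::ACL_SRC_1, &wei_t},  // weights
                        {TensorType::ACL_DST, &dst_t}};   // output
    // Scratch buffers from conv.workspace() must be added to the pack as in the
    // CpuAddMulAdd sketch earlier. The first run() also triggers prepare(), which
    // the inner operator typically uses to transform constant weights once.
    conv.run(pack);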
+ */
+#include "src/cpu/operators/CpuConvertFullyConnectedWeights.h"
+
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void CpuConvertFullyConnectedWeights::configure(const ITensorInfo *src,
+ ITensorInfo *dst,
+ const TensorShape &original_src_shape,
+ DataLayout data_layout)
+{
+ ARM_COMPUTE_LOG_PARAMS(src, dst, original_src_shape, data_layout);
+ auto k = std::make_unique<kernels::CpuConvertFullyConnectedWeightsKernel>();
+ k->configure(src, dst, original_src_shape, data_layout);
+ _kernel = std::move(k);
+}
+
+Status CpuConvertFullyConnectedWeights::validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const TensorShape &original_src_shape,
+ DataLayout data_layout)
+{
+ return kernels::CpuConvertFullyConnectedWeightsKernel::validate(src, dst, original_src_shape, data_layout);
+}
+
+void CpuConvertFullyConnectedWeights::run(ITensorPack &tensors)
+{
+ NEScheduler::get().schedule_op(_kernel.get(), Window::DimZ, _kernel->window(), tensors);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuConvertFullyConnectedWeights.h b/src/cpu/operators/CpuConvertFullyConnectedWeights.h
new file mode 100644
index 0000000000..e208cca3a0
--- /dev/null
+++ b/src/cpu/operators/CpuConvertFullyConnectedWeights.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_CONVERT_FULLYCONNECTED_WEIGHTS_H
+#define ARM_COMPUTE_CPU_CONVERT_FULLYCONNECTED_WEIGHTS_H
+
+#include "src/cpu/ICpuOperator.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to run @ref kernels::CpuConvertFullyConnectedWeightsKernel */
+class CpuConvertFullyConnectedWeights : public ICpuOperator
+{
+public:
+ /** Configure operator for a given list of arguments
+ *
+ * @param[in] src Source tensor info to permute. Data types supported: All
+ * @param[out] dst Destination tensor info. Data types supported: Same as @p src
+ * @param[in] original_src_shape Shape of the original src tensor (the one entering the fully connected layer).
+ * @param[in] data_layout The data layout the weights have been trained in.
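+ *
+ * A minimal usage sketch (hypothetical shapes, shown for illustration only): fully connected
+ * weights trained against 8x8x64 NCHW feature maps are re-ordered so they can be used with
+ * NHWC-laid-out input.
+ * @code
+ * TensorInfo fc_weights(TensorShape(4096U, 1000U), 1, DataType::F32); // 8*8*64 = 4096 inputs
+ * TensorInfo converted(TensorShape(4096U, 1000U), 1, DataType::F32);
+ * CpuConvertFullyConnectedWeights convert;
+ * convert.configure(&fc_weights, &converted, TensorShape(8U, 8U, 64U), DataLayout::NCHW);
+ * @endcode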
+ */ + void + configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuConvertFullyConnectedWeights::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + const TensorShape &original_src_shape, + DataLayout data_layout); + // Inherited methods overridden: + void run(ITensorPack &tensors) override; +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_CONVERT_FULLYCONNECTED_WEIGHTS_H */ diff --git a/src/cpu/operators/CpuCopy.cpp b/src/cpu/operators/CpuCopy.cpp new file mode 100644 index 0000000000..92c19d4df2 --- /dev/null +++ b/src/cpu/operators/CpuCopy.cpp @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/operators/CpuCopy.h" + +#include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuCopyKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +void CpuCopy::configure(const ITensorInfo *src, ITensorInfo *dst) +{ + ARM_COMPUTE_LOG_PARAMS(src, dst); + auto k = std::make_unique<kernels::CpuCopyKernel>(); + k->configure(src, dst); + _kernel = std::move(k); +} + +Status CpuCopy::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + return kernels::CpuCopyKernel::validate(src, dst); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuCopy.h b/src/cpu/operators/CpuCopy.h new file mode 100644 index 0000000000..9ffde4e781 --- /dev/null +++ b/src/cpu/operators/CpuCopy.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_COPY_H +#define ARM_COMPUTE_CPU_COPY_H + +#include "src/cpu/ICpuOperator.h" + +namespace arm_compute +{ +namespace cpu +{ +/** Basic function to run @ref kernels::CpuCopyKernel */ +class CpuCopy : public ICpuOperator +{ +public: + /** Configure operator for a given list of arguments + * + * @param[in] src Source tensor info. Data type supported: All + * @param[out] dst Destination info. Data type supported: Same as @p src + */ + void configure(const ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuCopy::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_COPY_H */ diff --git a/src/cpu/operators/CpuDepthwiseConv2d.cpp b/src/cpu/operators/CpuDepthwiseConv2d.cpp new file mode 100644 index 0000000000..54075f2afa --- /dev/null +++ b/src/cpu/operators/CpuDepthwiseConv2d.cpp @@ -0,0 +1,568 @@ +/* + * Copyright (c) 2021-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/operators/CpuDepthwiseConv2d.h" + +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/utils/misc/InfoHelpers.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace +{ +Status validate_arguments_optimized(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ConvolutionInfo &info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); + if (!is_data_type_quantized_per_channel(weights->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); + } + ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); + ARM_COMPUTE_RETURN_ERROR_ON(info.dilation.x() < 1 || info.dilation.y() < 1); + const size_t idx_w = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH); + const size_t idx_h = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (info.dilation.x() - 1) > + src->dimension(idx_w) + info.pad_stride_info.pad_left() + + info.pad_stride_info.pad_right()); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (info.dilation.y() - 1) > + src->dimension(idx_h) + info.pad_stride_info.pad_top() + + info.pad_stride_info.pad_bottom()); + + if (biases != nullptr) + { + const unsigned int channel_idx = + get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(channel_idx)); + } + + ARM_COMPUTE_RETURN_ON_ERROR(CpuDepthwiseConv2dAssemblyDispatch::validate(src, weights, biases, dst, info)); + + // Validate Activation Layer + if (info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info)) + { + ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, info.act_info)); + } + return Status{}; +} +} // namespace + +void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::configure(ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ConvolutionInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON( + CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, (biases == nullptr) ? 
nullptr : biases, dst, info)); + + _is_quantized = is_data_type_quantized_asymmetric(src->data_type()); + _has_bias = biases != nullptr; + _is_nchw = src->data_layout() == DataLayout::NCHW; + _permute = _is_nchw; + _is_prepared = false; + _are_weights_const = weights->are_values_constant(); + + // Configure pipeline + _is_activationlayer_enabled = + info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info); + + _dwc_optimized_func = std::make_unique<CpuDepthwiseConv2dAssemblyDispatch>(); + if (_is_nchw) + { + _permute_input = std::make_unique<cpu::CpuPermute>(); + _permute_weights = std::make_unique<cpu::CpuPermute>(); + _permute_output = std::make_unique<cpu::CpuPermute>(); + + auto input_perm = std::make_unique<TensorInfo>(); + auto weights_perm = std::make_unique<TensorInfo>(); + auto output_perm = std::make_unique<TensorInfo>(); + + // Configure the function to transform the input tensor from NCHW -> NHWC + _permute_input->configure(src, input_perm.get(), PermutationVector(2U, 0U, 1U)); + input_perm->set_data_layout(DataLayout::NHWC); + + // Configure the function to transform the weights tensor from IHW -> HWI + _permute_weights->configure(weights, weights_perm.get(), PermutationVector(2U, 0U, 1U)); + weights_perm->set_data_layout(DataLayout::NHWC); + + output_perm->set_data_layout(DataLayout::NHWC); + output_perm->set_quantization_info(dst->quantization_info()); + + // Configure optimized depthwise + _dwc_optimized_func->configure(input_perm.get(), weights_perm.get(), biases, output_perm.get(), info); + + // Configure the function to transform the convoluted output to ACL's native ordering format NCHW + output_perm->set_data_layout(DataLayout::NHWC); + _permute_output->configure(output_perm.get(), dst, PermutationVector(1U, 2U, 0U)); + } + else + { + _dwc_optimized_func->configure(src, weights, biases, dst, info); + } + + // Configure activation + if (_is_activationlayer_enabled) + { + _activationlayer_function = std::make_unique<cpu::CpuActivation>(); + _activationlayer_function->configure(dst, nullptr, info.act_info); + } +} + +Status CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ConvolutionInfo &info) +{ + return validate_arguments_optimized(src, weights, biases, dst, info); +} + +void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::run(ITensorPack &tensors) +{ + ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); + prepare(tensors); + + auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2); + auto dst = tensors.get_tensor(TensorType::ACL_DST_0); + auto workspace = tensors.get_tensor(TensorType::ACL_INT_3); + auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_4); + + // Permute input + if (_permute) + { + ITensorPack pack; + auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); + auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0); + pack.add_tensor(TensorType::ACL_SRC, src); + pack.add_tensor(TensorType::ACL_DST, src_perm); + _permute_input->run(pack); + } + + // Run assembly function + if (_is_nchw) + { + auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0); + auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1); + auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2); + + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, src_perm); + pack.add_tensor(TensorType::ACL_SRC_1, weights_perm); + 
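// Bias plus the outer pack's workspace (ACL_INT_3) and packed-weights (ACL_INT_4) buffers
+ // are re-registered below under the slot ids the assembly dispatch expects:
+ // ACL_SRC_2, ACL_INT_0 and ACL_INT_1 respectively. +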
pack.add_tensor(TensorType::ACL_SRC_2, bias); + pack.add_tensor(TensorType::ACL_INT_0, workspace); + pack.add_tensor(TensorType::ACL_INT_1, packed_weights); + pack.add_tensor(TensorType::ACL_DST, dst_perm); + _dwc_optimized_func->run(pack); + } + else + { + auto src = tensors.get_tensor(TensorType::ACL_SRC_0); + auto weights = tensors.get_tensor(TensorType::ACL_SRC_1); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, src); + pack.add_tensor(TensorType::ACL_SRC_1, weights); + pack.add_tensor(TensorType::ACL_SRC_2, bias); + pack.add_tensor(TensorType::ACL_INT_0, workspace); + pack.add_tensor(TensorType::ACL_INT_1, packed_weights); + pack.add_tensor(TensorType::ACL_DST, dst); + _dwc_optimized_func->run(pack); + } + + // Permute output + if (_is_nchw) + { + ITensorPack pack; + auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2); + pack.add_tensor(TensorType::ACL_SRC, dst_perm); + pack.add_tensor(TensorType::ACL_DST, dst); + _permute_output->run(pack); + } + + // Run activation + if (_is_activationlayer_enabled) + { + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, dst); + pack.add_tensor(TensorType::ACL_DST, dst); + _activationlayer_function->run(pack); + } +} + +void CpuDepthwiseConv2d::CpuDepthwiseConv2dOptimizedInternal::prepare(ITensorPack &tensors) +{ + // if weights are not constant then we need to repack so that weights + // can be updated in-place + if (!_are_weights_const) + { + auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2); + auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_4); + + ITensorPack pack_opt; + pack_opt.add_tensor(TensorType::ACL_SRC_1, weights); + pack_opt.add_tensor(TensorType::ACL_SRC_2, bias); + pack_opt.add_tensor(TensorType::ACL_INT_1, packed_weights); + + // Prepare optimized function + _dwc_optimized_func->prepare(pack_opt); + + return; + } + + if (!_is_prepared) + { + auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2); + auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_4); + + // Permute weights + if (_permute) + { + auto permuted_weights = tensors.get_tensor(TensorType::ACL_INT_1); + + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, weights); + pack.add_tensor(TensorType::ACL_DST, permuted_weights); + _permute_weights->run(pack); + + weights->mark_as_unused(); + + ITensorPack pack_opt; + pack_opt.add_const_tensor(TensorType::ACL_SRC_1, permuted_weights); + pack_opt.add_tensor(TensorType::ACL_SRC_2, bias); + pack_opt.add_tensor(TensorType::ACL_INT_1, packed_weights); + + // Prepare optimized function + _dwc_optimized_func->prepare(pack_opt); + } + else + { + ITensorPack pack_opt; + pack_opt.add_tensor(TensorType::ACL_SRC_1, weights); + pack_opt.add_tensor(TensorType::ACL_SRC_2, bias); + pack_opt.add_tensor(TensorType::ACL_INT_1, packed_weights); + + // Prepare optimized function + _dwc_optimized_func->prepare(pack_opt); + } + + _is_prepared = true; + } +} + +void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::configure(ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ConvolutionInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); + ARM_COMPUTE_ERROR_THROW_ON( + CpuDepthwiseConv2d::validate(src, weights, (biases == nullptr) ? 
nullptr : biases, dst, info)); + + _is_nchw = src->data_layout() == DataLayout::NCHW; + _is_prepared = !_is_nchw; + + ITensorInfo *input_to_use = src; + const ITensorInfo *weights_to_use = weights; + ITensorInfo *output_to_use = dst; + + auto input_perm = std::make_unique<TensorInfo>(); + auto weights_perm = std::make_unique<TensorInfo>(); + auto output_perm = std::make_unique<TensorInfo>( + dst->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape())); + + if (_is_nchw) + { + _permute_input = std::make_unique<cpu::CpuPermute>(); + _permute_weights = std::make_unique<cpu::CpuPermute>(); + + _permute_input->configure(src, input_perm.get(), PermutationVector(2U, 0U, 1U)); + input_perm->set_data_layout(DataLayout::NHWC); + input_to_use = input_perm.get(); + + _permute_weights->configure(weights, weights_perm.get(), PermutationVector(2U, 0U, 1U)); + weights_perm->set_data_layout(DataLayout::NHWC); + weights_to_use = weights_perm.get(); + + output_to_use = output_perm.get(); + } + + _depthwise_conv_kernel = std::make_unique<cpu::kernels::CpuDepthwiseConv2dNativeKernel>(); + _depthwise_conv_kernel->configure(input_to_use, weights_to_use, biases, output_to_use, info); + + if (_is_nchw) + { + _permute_output = std::make_unique<cpu::CpuPermute>(); + _permute_output->configure(output_perm.get(), dst, PermutationVector(1U, 2U, 0U)); + output_perm->set_data_layout(DataLayout::NHWC); + } + + //Configure Activation Layer + _is_activationlayer_enabled = info.act_info.enabled(); + if (_is_activationlayer_enabled) + { + _activationlayer_function = std::make_unique<cpu::CpuActivation>(); + _activationlayer_function->configure(dst, nullptr, info.act_info); + } +} + +Status CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ConvolutionInfo &info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); + if (src->data_layout() == DataLayout::NCHW) + { + TensorShape permuted_input_shape = src->tensor_shape(); + TensorShape permuted_weights_shape = weights->tensor_shape(); + TensorShape permuted_output_shape = + misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info); + permute(permuted_input_shape, PermutationVector(2U, 0U, 1U)); + permute(permuted_weights_shape, PermutationVector(2U, 0U, 1U)); + permute(permuted_output_shape, PermutationVector(2U, 0U, 1U)); + + const TensorInfo permuted_input = TensorInfo(src->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(permuted_input_shape) + .set_data_layout(DataLayout::NHWC)); + const TensorInfo permuted_weights = TensorInfo(weights->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(permuted_weights_shape) + .set_data_layout(DataLayout::NHWC)); + const TensorInfo permuted_output = TensorInfo(dst->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(permuted_output_shape) + .set_data_layout(DataLayout::NCHW)); + + ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(src, &permuted_input, PermutationVector(2U, 0U, 1U))); + ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(weights, &permuted_weights, PermutationVector(2U, 0U, 1U))); + ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(&permuted_output, dst, PermutationVector(1U, 2U, 0U))); + + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate( + &permuted_input, &permuted_weights, biases, &permuted_output, info)); + } + else + { + 
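// NHWC case: no permutes are involved, so the native kernel is validated on the
+ // tensor infos exactly as they were provided. +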
ARM_COMPUTE_RETURN_ON_ERROR( + cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate(src, weights, biases, dst, info)); + } + + // Validate Activation Layer + if (info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info)) + { + ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, info.act_info)); + } + + return Status{}; +} + +void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::run(ITensorPack &tensors) +{ + auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); + auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto biases = tensors.get_const_tensor(TensorType::ACL_SRC_2); + auto dst = tensors.get_tensor(TensorType::ACL_DST_0); + + if (_is_nchw) + { + prepare(tensors); + auto src_perm = tensors.get_tensor(TensorType::ACL_INT_0); + auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1); + auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2); + + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, src); + pack.add_tensor(TensorType::ACL_DST, src_perm); + _permute_input->run(pack); + + ITensorPack pack_depth; + pack_depth.add_const_tensor(TensorType::ACL_SRC_0, src_perm); + pack_depth.add_const_tensor(TensorType::ACL_SRC_1, weights_perm); + pack_depth.add_tensor(TensorType::ACL_SRC_2, biases); + pack_depth.add_tensor(TensorType::ACL_DST, dst_perm); + NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), + pack_depth); + } + else + { + ITensorPack pack_depth; + pack_depth.add_tensor(TensorType::ACL_SRC_0, src); + pack_depth.add_tensor(TensorType::ACL_SRC_1, weights); + pack_depth.add_tensor(TensorType::ACL_SRC_2, biases); + pack_depth.add_tensor(TensorType::ACL_DST, dst); + NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), + pack_depth); + } + + if (_is_nchw) + { + ITensorPack pack; + auto dst_perm = tensors.get_tensor(TensorType::ACL_INT_2); + pack.add_tensor(TensorType::ACL_SRC, dst_perm); + pack.add_tensor(TensorType::ACL_DST, dst); + _permute_output->run(pack); + } + + if (_is_activationlayer_enabled) + { + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, dst); + pack.add_tensor(TensorType::ACL_DST, dst); + _activationlayer_function->run(pack); + } +} + +void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::prepare(ITensorPack &tensors) +{ + if (!_is_prepared) + { + auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1); + + ARM_COMPUTE_ERROR_ON(!weights->is_used()); + + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, weights); + pack.add_tensor(TensorType::ACL_DST, weights_perm); + + _permute_weights->run(pack); + weights->mark_as_unused(); + _is_prepared = true; + } +} + +void CpuDepthwiseConv2d::configure(ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ConvolutionInfo &info) +{ + ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, info); + + _depth_conv_func = + get_depthwiseconvolution_function(src, weights, (biases != nullptr) ? 
biases : nullptr, dst, info); + switch (_depth_conv_func) + { + case DepthwiseConvolutionFunction::OPTIMIZED: + _func_optimized.configure(src, weights, biases, dst, info); + break; + case DepthwiseConvolutionFunction::GENERIC: + _func_generic.configure(src, weights, biases, dst, info); + break; + default: + ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction"); + } +} + +Status CpuDepthwiseConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ConvolutionInfo &info) +{ + DepthwiseConvolutionFunction depth_conv_func = get_depthwiseconvolution_function(src, weights, biases, dst, info); + switch (depth_conv_func) + { + case DepthwiseConvolutionFunction::OPTIMIZED: + return CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, biases, dst, info); + break; + case DepthwiseConvolutionFunction::GENERIC: + return CpuDepthwiseConv2dGeneric::validate(src, weights, biases, dst, info); + break; + default: + ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction"); + } +} + +DepthwiseConvolutionFunction CpuDepthwiseConv2d::get_depthwiseconvolution_function(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ConvolutionInfo &info) +{ + if (bool(CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, biases, dst, info))) + { + return DepthwiseConvolutionFunction::OPTIMIZED; + } + else + { + return DepthwiseConvolutionFunction::GENERIC; + } +} + +void CpuDepthwiseConv2d::run(ITensorPack &tensors) +{ + switch (_depth_conv_func) + { + case DepthwiseConvolutionFunction::OPTIMIZED: + _func_optimized.run(tensors); + break; + case DepthwiseConvolutionFunction::GENERIC: + _func_generic.run(tensors); + break; + default: + ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured"); + } +} + +void CpuDepthwiseConv2d::prepare(ITensorPack &tensors) +{ + switch (_depth_conv_func) + { + case DepthwiseConvolutionFunction::OPTIMIZED: + _func_optimized.prepare(tensors); + break; + case DepthwiseConvolutionFunction::GENERIC: + _func_generic.prepare(tensors); + break; + default: + ARM_COMPUTE_ERROR("DepthwiseConvolutionFunction not properly configured"); + } +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuDepthwiseConv2d.h b/src/cpu/operators/CpuDepthwiseConv2d.h new file mode 100644 index 0000000000..7eaa0df857 --- /dev/null +++ b/src/cpu/operators/CpuDepthwiseConv2d.h @@ -0,0 +1,238 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_H
+#define ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_H
+
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/core/ITensorInfo.h"
+
+#include "src/cpu/ICpuKernel.h"
+#include "src/cpu/ICpuOperator.h"
+#include "src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h"
+#include "src/cpu/operators/CpuActivation.h"
+#include "src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h"
+#include "src/cpu/operators/CpuPermute.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Function to execute a depthwise convolution.
+ */
+class CpuDepthwiseConv2d : public ICpuOperator
+{
+public:
+ /** Default constructor */
+ CpuDepthwiseConv2d() = default;
+ /** Initialize the function's source, destination, weights and convolution information.
+ *
+ * @param[in, out] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32
+ * @param[in] weights Weights tensor info. These are 3D tensor infos with shape [kernel_x, kernel_y, IFM].
+ * Data type supported: Same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED.
+ * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+ * Data type supported: Same as @p src, or S32 when @p src is QASYMM8/QASYMM8_SIGNED.
+ * @param[out] dst Destination tensor info. Data type supported: same as @p src.
+ * @param[in] info Depthwise convolution meta-data.
+ */
+ void configure(ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ConvolutionInfo &info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuDepthwiseConv2d::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const ConvolutionInfo &info);
+ /** Static function to choose the best depthwise convolution function for @ref CpuDepthwiseConv2d
+ *
+ * @param[in] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32
+ * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM].
+ * Data type supported: Same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED.
+ * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+ * Data type supported: Same as @p src, or S32 when @p src is QASYMM8/QASYMM8_SIGNED.
+ * @param[in] dst Destination tensor info. Data type supported: same as @p src.
+ * @param[in] info Depthwise convolution meta-data.
+ *
+ * @return a Depthwise Convolution Function
+ */
+ static DepthwiseConvolutionFunction get_depthwiseconvolution_function(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const ConvolutionInfo &info);
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &tensors) override;
+
+private:
+ /** Basic function to execute optimized depthwise convolution routines. This function calls the following kernels:
+ *
+ * @note At the moment only 3x3 and 5x5 convolutions with stride 1 or 2 are supported
+ *
+ * -# @ref NEFillBorderKernel (if pad_x or pad_y > 0) when no assembly kernel implementation is present
+ * -# @ref CpuDepthwiseConv2d3x3Kernel if the kernel is 3x3 and no assembly kernel implementation is present
+ * -# @ref CpuDepthwiseConv2dAssemblyDispatch if an assembly kernel implementation is present
+ * -# @ref CpuActivation if a fused activation is required
+ *
+ */
+ class CpuDepthwiseConv2dOptimizedInternal : public ICpuOperator
+ {
+ public:
+ /** Default constructor */
+ CpuDepthwiseConv2dOptimizedInternal() = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CpuDepthwiseConv2dOptimizedInternal(const CpuDepthwiseConv2dOptimizedInternal &) = delete;
+ /** Default move constructor */
+ CpuDepthwiseConv2dOptimizedInternal(CpuDepthwiseConv2dOptimizedInternal &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CpuDepthwiseConv2dOptimizedInternal &operator=(const CpuDepthwiseConv2dOptimizedInternal &) = delete;
+ /** Default move assignment operator */
+ CpuDepthwiseConv2dOptimizedInternal &operator=(CpuDepthwiseConv2dOptimizedInternal &&) = default;
+ /** Default destructor */
+ ~CpuDepthwiseConv2dOptimizedInternal() = default;
+ /** Initialize the function's source, destination, kernels and border_size.
+ *
+ * @param[in, out] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling).
+ * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p src.
+ * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+ * Data type supported: Same as @p src, or S32 when @p src is QASYMM8/QASYMM8_SIGNED.
+ * @param[out] dst Destination tensor info. Data type supported: same as @p src.
+ * @param[in] info Depthwise convolution meta-data.
+ */
+ void configure(ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *dst,
+ const ConvolutionInfo &info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuDepthwiseConv2dOptimizedInternal::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const ConvolutionInfo &info);
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &tensors) override;
+
+ private:
+ std::unique_ptr<CpuDepthwiseConv2dAssemblyDispatch> _dwc_optimized_func{nullptr};
+ std::unique_ptr<CpuPermute> _permute_input{nullptr};
+ std::unique_ptr<CpuPermute> _permute_weights{nullptr};
+ std::unique_ptr<CpuPermute> _permute_output{nullptr};
+ std::unique_ptr<CpuActivation> _activationlayer_function{nullptr};
+ bool _has_bias{false};
+ bool _is_quantized{false};
+ bool _is_nchw{true};
+ bool _permute{false};
+ bool _is_activationlayer_enabled{false};
+ bool _is_prepared{false};
+ bool _are_weights_const{true};
+ };
+
+ /** Basic function to execute a generic depthwise convolution.
This function calls the following kernel: + * + * -# @ref CpuDepthwiseConv2dNativeKernel + * + */ + class CpuDepthwiseConv2dGeneric : public ICpuOperator + { + public: + /** Default constructor */ + CpuDepthwiseConv2dGeneric() = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CpuDepthwiseConv2dGeneric(const CpuDepthwiseConv2dGeneric &) = delete; + /** Default move constructor */ + CpuDepthwiseConv2dGeneric(CpuDepthwiseConv2dGeneric &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CpuDepthwiseConv2dGeneric &operator=(const CpuDepthwiseConv2dGeneric &) = delete; + /** Default move assignment operator */ + CpuDepthwiseConv2dGeneric &operator=(CpuDepthwiseConv2dGeneric &&) = default; + /** Default destructor */ + ~CpuDepthwiseConv2dGeneric() = default; + /** Initialize the function's source, destination, weights and convolution information. + * + * @param[in, out] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling). + * @param[out] dst Destination tensor info. Data type supported: same as @p src. + * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. + * Data type supported: Same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED. + * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. + * Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED. + * @param[in] info Depthwise convolution meta-data. + */ + void configure(ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ConvolutionInfo &info); + + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuDepthwiseConv2dGeneric::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ConvolutionInfo &info); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; + + private: + std::unique_ptr<kernels::CpuDepthwiseConv2dNativeKernel> _depthwise_conv_kernel{nullptr}; + std::unique_ptr<CpuPermute> _permute_input{nullptr}; + std::unique_ptr<CpuPermute> _permute_weights{nullptr}; + std::unique_ptr<CpuPermute> _permute_output{nullptr}; + std::unique_ptr<CpuActivation> _activationlayer_function{nullptr}; + bool _is_nchw{true}; + bool _is_prepared{false}; + bool _is_activationlayer_enabled{false}; + }; + + DepthwiseConvolutionFunction _depth_conv_func{DepthwiseConvolutionFunction::GENERIC}; + CpuDepthwiseConv2dOptimizedInternal _func_optimized{}; + CpuDepthwiseConv2dGeneric _func_generic{}; +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_H */ diff --git a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp new file mode 100644 index 0000000000..7fe9011da1 --- /dev/null +++ b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2019-2024 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h" + +#include "arm_compute/core/ITensorInfo.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/utils/AssemblyUtils.h" +#include "src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +struct CpuDepthwiseConv2dAssemblyDispatch::LocalImpl +{ + std::unique_ptr<kernels::CpuDepthwiseConv2dAssemblyWrapperKernel> asm_kernel{nullptr}; + bool is_prepared{false}; + bool are_weights_const{true}; + experimental::MemoryRequirements mem_req{}; +}; + +#ifndef DOXYGEN_SKIP_THIS +CpuDepthwiseConv2dAssemblyDispatch::CpuDepthwiseConv2dAssemblyDispatch() : _pImpl(std::make_unique<LocalImpl>()) +{ +} +#endif /* DOXYGEN_SKIP_THIS */ + +CpuDepthwiseConv2dAssemblyDispatch::~CpuDepthwiseConv2dAssemblyDispatch() = default; + +void CpuDepthwiseConv2dAssemblyDispatch::configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *bias, + ITensorInfo *dst, + const ConvolutionInfo &info) +{ + ARM_COMPUTE_LOG_PARAMS(src, weights, bias, dst, info); + const CPUInfo &ci = NEScheduler::get().cpu_info(); + const unsigned int num_threads = NEScheduler::get().num_threads(); + _pImpl->is_prepared = false; + _pImpl->are_weights_const = weights->are_values_constant(); + + // If we don't support a combination of data types, silently return: it is the caller's responsibility to check if configure() was successful via is_configured() + if (!CpuDepthwiseConv2dAssemblyDispatch::validate(src, weights, bias, dst, info)) + { + return; + } + + auto dwc_wrapper = std::make_unique<kernels::CpuDepthwiseConv2dAssemblyWrapperKernel>(); + ARM_COMPUTE_ERROR_ON(dwc_wrapper == nullptr); + dwc_wrapper->configure(src, weights, bias, dst, info, ci); + + // Compute memory requirements for assembly kernels + constexpr size_t alignment = 4096; + _pImpl->mem_req.push_back({TensorType::ACL_INT_0, dwc_wrapper->get_working_size(num_threads), alignment}); + _pImpl->mem_req.push_back({TensorType::ACL_INT_1, dwc_wrapper->get_storage_size(), alignment}); + _pImpl->asm_kernel = std::move(dwc_wrapper); +} + +Status CpuDepthwiseConv2dAssemblyDispatch::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo 
*dst, + const ConvolutionInfo &info) +{ + return kernels::CpuDepthwiseConv2dAssemblyWrapperKernel::validate(src, weights, bias, dst, info); +} + +experimental::MemoryRequirements CpuDepthwiseConv2dAssemblyDispatch::workspace() const +{ + return _pImpl->mem_req; +} + +bool CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(const ActivationLayerInfo &activation) +{ + arm_gemm::Activation act = assembly_utils::map_to_arm_gemm_activation(activation); + return act.type != arm_gemm::Activation::Type::None; +} + +void CpuDepthwiseConv2dAssemblyDispatch::run(ITensorPack &tensors) +{ + ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); + + prepare(tensors); + + // Split over rows (z) if there's more than 1, otherwise batches (w). This logic + // corresponds to the threading strategy in DepthFirstDriver::execute_internal + auto split_dimension = _pImpl->asm_kernel->window().num_iterations(Window::DimZ) != 1 ? Window::DimZ : Window::DimW; + + NEScheduler::get().schedule_op(_pImpl->asm_kernel.get(), split_dimension, _pImpl->asm_kernel->window(), tensors); +} + +void CpuDepthwiseConv2dAssemblyDispatch::prepare(ITensorPack &tensors) +{ + const ITensor *weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); + + if ((!_pImpl->are_weights_const && weights != nullptr) || !_pImpl->is_prepared) + { + // Pack weights and bias + const ITensor *bias = tensors.get_const_tensor(TensorType::ACL_SRC_2); + ITensor *storage = tensors.get_tensor(TensorType::ACL_INT_1); + + const auto weights_ptr = weights->buffer() + weights->info()->offset_first_element_in_bytes(); + const auto bias_ptr = (bias) ? bias->buffer() + bias->info()->offset_first_element_in_bytes() : nullptr; + auto parameters_ptr = storage->buffer() + storage->info()->offset_first_element_in_bytes(); + + const auto weights_shape = weights->info()->tensor_shape(); + const auto weights_padding = weights->info()->padding(); + + const size_t ld_weights_col = weights_shape[0] + weights_padding.left + weights_padding.right; + const size_t ld_weights_row = + ld_weights_col * (weights_shape[1] + weights_padding.top + weights_padding.bottom); + _pImpl->asm_kernel->pack_parameters(parameters_ptr, bias_ptr, weights_ptr, ld_weights_col, ld_weights_row); + + weights->mark_as_unused(); + if (bias != nullptr) + { + bias->mark_as_unused(); + } + _pImpl->is_prepared = true; + } +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h new file mode 100644 index 0000000000..f1816625d2 --- /dev/null +++ b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2019-2021, 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_DISPATCH_H +#define ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_DISPATCH_H + +#include "arm_compute/function_info/ActivationLayerInfo.h" + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuOperator.h" + +namespace arm_compute +{ +struct ConvolutionInfo; + +namespace cpu +{ +/** Depthwise convolution assembly kernel glue */ +class CpuDepthwiseConv2dAssemblyDispatch : public ICpuOperator +{ +public: + CpuDepthwiseConv2dAssemblyDispatch(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDepthwiseConv2dAssemblyDispatch); + ~CpuDepthwiseConv2dAssemblyDispatch(); + /** Initialize the function's source, destination, kernels and border_size. + * + * @note Supports only NHWC format + * + * @param[in] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] weights Weights tensor info. These are 3D tensors with shape [W, H, IFM]. + * Data type supported: same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED. + * @param[in] bias (Optional) Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed. + * Data type supported: same as @p src or S32 if @p src is quantized. + * @param[out] dst Destination tensor info. Data type supported: same as @p src. + * @param[in] info Depthwise convolution meta-data. + */ + void configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *bias, + ITensorInfo *dst, + const ConvolutionInfo &info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuDepthwiseConv2dAssemblyDispatch::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *dst, + const ConvolutionInfo &info); + /** Checks if activation is supported by the assembly kernels + * + * @param[in] activation Activation to check + * + * @return True if activation is supported else false + */ + static bool is_activation_supported(const ActivationLayerInfo &activation); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; + experimental::MemoryRequirements workspace() const override; + +private: + struct LocalImpl; + std::unique_ptr<LocalImpl> _pImpl; +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_DISPATCH_H */ diff --git a/src/cpu/operators/CpuDequantize.cpp b/src/cpu/operators/CpuDequantize.cpp new file mode 100644 index 0000000000..c05a23f3a7 --- /dev/null +++ b/src/cpu/operators/CpuDequantize.cpp @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/operators/CpuDequantize.h" + +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuDequantizeKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +void CpuDequantize::configure(const ITensorInfo *src, ITensorInfo *dst) +{ + ARM_COMPUTE_LOG_PARAMS(src, dst); + auto k = std::make_unique<kernels::CpuDequantizeKernel>(); + k->configure(src, dst); + _kernel = std::move(k); +} + +Status CpuDequantize::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + return kernels::CpuDequantizeKernel::validate(src, dst); +} + +void CpuDequantize::run(ITensorPack &tensors) +{ + ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); + prepare(tensors); + NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuDequantize.h b/src/cpu/operators/CpuDequantize.h new file mode 100644 index 0000000000..dbfc0c612a --- /dev/null +++ b/src/cpu/operators/CpuDequantize.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ARM_COMPUTE_CPU_DEQUANTIZE_H +#define ARM_COMPUTE_CPU_DEQUANTIZE_H + +#include "src/cpu/ICpuOperator.h" + +namespace arm_compute +{ +namespace cpu +{ +/** Basic function to run @ref kernels::CpuDequantizeKernel that dequantizes an input tensor */ +class CpuDequantize : public ICpuOperator +{ +public: + /** Configure the kernel. + * + * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16. + * @param[out] dst Destination tensor info with the same dimensions of input. Data type supported: F16/F32. + */ + void configure(const ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuDequantize::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_DEQUANTIZE_H */ diff --git a/src/cpu/operators/CpuDirectConv2d.cpp b/src/cpu/operators/CpuDirectConv2d.cpp new file mode 100644 index 0000000000..135a3bb2b9 --- /dev/null +++ b/src/cpu/operators/CpuDirectConv2d.cpp @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/operators/CpuDirectConv2d.h" + +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" + +namespace arm_compute +{ +namespace cpu +{ +CpuDirectConv2d::~CpuDirectConv2d() = default; + +CpuDirectConv2d::CpuDirectConv2d(std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), + _output_stage_kernel(), + _conv_kernel(), + _input_border_handler(), + _activationlayer_function(), + _accumulator(), + _has_bias(false), + _is_activationlayer_enabled(false), + _dim_split(Window::DimZ), + _is_padding_required() +{ +} + +void CpuDirectConv2d::configure(ITensorInfo *src, + ITensorInfo *weights, + const ITensorInfo *bias, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); + ARM_COMPUTE_LOG_PARAMS(src, weights, bias, dst, conv_info, act_info); + + _output_stage_kernel = std::make_unique<kernels::CpuDirectConv2dOutputStageKernel>(); + _conv_kernel = std::make_unique<kernels::CpuDirectConv2dKernel>(); + _input_border_handler = std::make_unique<NEFillBorderKernel>(); + + // Free accumulator + if (_accumulator.buffer() != nullptr) + { + _accumulator.allocator()->free(); + } + + _dim_split = src->data_layout() == DataLayout::NCHW ? Window::DimZ : Window::DimY; + + // Check if bias should be added in the convolution result + _has_bias = (bias != nullptr); + + _conv_kernel->configure(src, weights, dst, conv_info); + if (_has_bias) + { + _output_stage_kernel->configure(dst, bias); + } + _is_padding_required = !_conv_kernel->border_size().empty(); + + if (_is_padding_required) + { + // Add zero padding XY + _input_border_handler->configure(src, _conv_kernel->border_size(), BorderMode::CONSTANT, + PixelValue(static_cast<float>(0.f))); + } + + //Configure Activation Layer + _is_activationlayer_enabled = act_info.enabled(); + if (_is_activationlayer_enabled) + { + _activationlayer_function = std::make_unique<CpuActivation>(); + _activationlayer_function->configure(dst, dst, act_info); + } +} + +Status CpuDirectConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); + + // output might not be initialized since it can be an intermediate tensor of another layer + DataType data_type = src->data_type(); + TensorInfo accumulator(dst->clone()->set_is_resizable(true).reset_padding().set_data_type(data_type)); + + // Validate Convolution kernel + ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConv2dKernel::validate(src, weights, &accumulator, conv_info)); + + if (bias != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, bias); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != weights->dimension(3), + "Biases size and number of input feature maps should match"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->num_dimensions() > 1, "Biases should be one dimensional"); + } + + // Validate bias kernel + ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConv2dOutputStageKernel::validate(&accumulator, bias, dst)); + + if (act_info.enabled()) + { + ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, act_info)); + } + + return Status{}; +} + +void CpuDirectConv2d::run(ITensorPack 
&tensors) +{ + MemoryGroupResourceScope scope_mg(_memory_group); + + auto src = tensors.get_tensor(TensorType::ACL_SRC_0); + auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + if (_is_padding_required) + { + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_DST, src); + NEScheduler::get().schedule_op(_input_border_handler.get(), Window::DimZ, _input_border_handler->window(), + pack); + } + NEScheduler::get().schedule_op(_conv_kernel.get(), _dim_split, _conv_kernel->window(), tensors); + if (_has_bias) + { + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, dst); + pack.add_tensor(TensorType::ACL_SRC_1, bias); + pack.add_tensor(TensorType::ACL_DST, dst); + NEScheduler::get().schedule_op(_output_stage_kernel.get(), Window::DimY, _output_stage_kernel->window(), pack); + } + + if (_is_activationlayer_enabled) + { + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, dst); + pack.add_tensor(TensorType::ACL_DST, dst); + _activationlayer_function->run(pack); + } +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuDirectConv2d.h b/src/cpu/operators/CpuDirectConv2d.h new file mode 100644 index 0000000000..73c85f2dcd --- /dev/null +++ b/src/cpu/operators/CpuDirectConv2d.h @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_DIRECTCONV2D_H +#define ARM_COMPUTE_CPU_DIRECTCONV2D_H + +#include "arm_compute/core/experimental/Types.h" +#include "arm_compute/core/ITensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IMemoryManager.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" +#include "arm_compute/runtime/Tensor.h" + +#include "src/core/NEON/kernels/NEFillBorderKernel.h" +#include "src/cpu/ICpuKernel.h" +#include "src/cpu/ICpuOperator.h" +#include "src/cpu/kernels/CpuDirectConv2dKernel.h" +#include "src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h" +#include "src/cpu/operators/CpuActivation.h" + +#include <memory> + +namespace arm_compute +{ +namespace cpu +{ +/** Function to run the direct convolution. 
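+ * + * A minimal usage sketch (illustrative only: the tensor-info objects, ITensor objects and the stride-1, zero-padding conv_info are assumptions, not part of this patch): + * @code + * cpu::CpuDirectConv2d conv; + * conv.configure(&src_info, &weights_info, &bias_info, &dst_info, PadStrideInfo(1, 1, 0, 0)); + * ITensorPack pack{{TensorType::ACL_SRC_0, &src}, {TensorType::ACL_SRC_1, &weights}, {TensorType::ACL_SRC_2, &bias}, {TensorType::ACL_DST, &dst}}; + * conv.run(pack); + * @endcode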
+ * + * This function calls the following kernels: + * + * -# @ref NEFillBorderKernel for the input + * -# @ref kernels::CpuDirectConv2dKernel + * -# @ref kernels::CpuDirectConv2dOutputStageKernel + */ +class CpuDirectConv2d : public ICpuOperator +{ +public: + CpuDirectConv2d(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + ~CpuDirectConv2d(); + /** Set the input, weights, biases and output tensors. + * + * @note: DirectConvolution only works in the following configurations: + * 1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F16/F32 + * 3x3 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F16/F32 + * 5x5 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F32 + * + * @param[in, out] src Input tensor info. Data types supported: F16/F32. + * @param[in] weights Set of kernels to convolve the input volume. + * Supported sizes: 1x1, 3x3 and 5x5. + * The 3rd dimension must be the same as the input's volume 3rd dimension. + * Data type supported: Same as @p src. + * @param[in] bias Set of biases. Can be nullptr. Data type supported: Same as @p src. + * @param[out] dst Output tensor info. + * The 3rd dimension must be equal to the 4th dimension of the @p weights tensor. Data types supported: Same as @p src. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + */ + void configure(ITensorInfo *src, + ITensorInfo *weights, + const ITensorInfo *bias, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuDirectConv2d::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *bias, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + +private: + MemoryGroup _memory_group; + std::unique_ptr<kernels::CpuDirectConv2dOutputStageKernel> _output_stage_kernel; + std::unique_ptr<kernels::CpuDirectConv2dKernel> _conv_kernel; + std::unique_ptr<NEFillBorderKernel> _input_border_handler; + std::unique_ptr<CpuActivation> _activationlayer_function; + Tensor _accumulator; + bool _has_bias{false}; + bool _is_activationlayer_enabled{false}; + unsigned int _dim_split{0}; + bool _is_padding_required{false}; +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_DIRECTCONV3D_H */ diff --git a/src/cpu/operators/CpuDirectConv3d.cpp b/src/cpu/operators/CpuDirectConv3d.cpp new file mode 100644 index 0000000000..626f1c6775 --- /dev/null +++ b/src/cpu/operators/CpuDirectConv3d.cpp @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2021 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/operators/CpuDirectConv3d.h" + +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" + +namespace arm_compute +{ +namespace cpu +{ +CpuDirectConv3d::~CpuDirectConv3d() = default; + +CpuDirectConv3d::CpuDirectConv3d(std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), + _conv_kernel(), + _activationlayer_function(), + _accumulator(), + _is_activationlayer_enabled(false), + _dim_split(Window::DimZ) +{ +} + +void CpuDirectConv3d::configure( + ITensorInfo *src0, ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo conv_info) +{ + ARM_COMPUTE_LOG_PARAMS(src0, src1, src2, dst, conv_info); + ARM_COMPUTE_ERROR_ON(src0->data_layout() != DataLayout::NDHWC); + + _conv_kernel = std::make_unique<kernels::CpuDirectConv3dKernel>(); + + // Free accumulator + if (_accumulator.buffer() != nullptr) + { + _accumulator.allocator()->free(); + } + + _dim_split = Window::DimY; + + _conv_kernel->configure(src0, src1, src2, dst, conv_info); + + //Configure Activation Layer + _is_activationlayer_enabled = conv_info.act_info.enabled(); + if (_is_activationlayer_enabled) + { + _activationlayer_function = std::make_unique<CpuActivation>(); + _activationlayer_function->configure(dst, dst, conv_info.act_info); + } +} + +Status CpuDirectConv3d::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const Conv3dInfo conv_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); + + // Validate Convolution kernel + ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConv3dKernel::validate(src0, src1, src2, dst, conv_info)); + + if (conv_info.act_info.enabled()) + { + ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, conv_info.act_info)); + } + + return Status{}; +} + +void CpuDirectConv3d::run(ITensorPack &tensors) +{ + MemoryGroupResourceScope scope_mg(_memory_group); + + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + NEScheduler::get().schedule_op(_conv_kernel.get(), _dim_split, _conv_kernel->window(), tensors); + + if (_is_activationlayer_enabled) + { + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, dst); + pack.add_tensor(TensorType::ACL_DST, dst); + 
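// The pack maps both ACL_SRC and ACL_DST to dst, so the fused activation is applied in-place on the convolution output. +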
_activationlayer_function->run(pack); + } +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuDirectConv3d.h b/src/cpu/operators/CpuDirectConv3d.h new file mode 100644 index 0000000000..3ad1e09a14 --- /dev/null +++ b/src/cpu/operators/CpuDirectConv3d.h @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_DIRECTCONV3D_H +#define ARM_COMPUTE_CPU_DIRECTCONV3D_H + +#include "arm_compute/core/experimental/Types.h" +#include "arm_compute/core/ITensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/FunctionDescriptors.h" +#include "arm_compute/runtime/IMemoryManager.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h" +#include "arm_compute/runtime/Tensor.h" + +#include "src/core/NEON/kernels/NEFillBorderKernel.h" +#include "src/cpu/ICpuKernel.h" +#include "src/cpu/ICpuOperator.h" +#include "src/cpu/kernels/CpuDirectConv3dKernel.h" +#include "src/cpu/operators/CpuActivation.h" + +#include <memory> + +namespace arm_compute +{ +namespace cpu +{ +/** Function to run the direct convolution. + * + * This function calls the following kernels: + * + * -# @ref kernels::CpuDirectConv3dKernel + */ +class CpuDirectConv3d : public ICpuOperator +{ +public: + CpuDirectConv3d(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + ~CpuDirectConv3d(); + /** Set the input, weights, biases and output tensor info. + * + * Valid data layouts: + * - NDHWC + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:------------------|:------|:--------------| + * |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 | + * |QASYMM8 |QASYMM8 |S32 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | + * + * @param[in, out] src0 Input tensor info. + * @param[in] src1 Set of kernels to convolve the input volume. + * The 2nd dimension must be the same as the 1st dimension of @p src0's volume. + * @param[in] src2 Set of biases. Can be nullptr. + * @param[out] dst Output tensor info. + * The 1st dimension must be equal to the 1st dimension of the @p src1 tensor. + * @param[in] conv_info Contains padding, stride and activation information.
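+ * + * A minimal configuration and dispatch sketch (illustrative only: the tensor-info objects, ITensor objects and the pre-built Conv3dInfo are assumptions, not part of this patch): + * @code + * cpu::CpuDirectConv3d conv3d; + * conv3d.configure(&src0_info, &src1_info, &src2_info, &dst_info, conv3d_info); + * ITensorPack pack{{TensorType::ACL_SRC_0, &src0}, {TensorType::ACL_SRC_1, &src1}, {TensorType::ACL_SRC_2, &src2}, {TensorType::ACL_DST, &dst}}; + * conv3d.run(pack); + * @endcode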
+ */ + void configure( + ITensorInfo *src0, ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo conv_info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuDirectConv3d::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const Conv3dInfo conv_info); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + +private: + MemoryGroup _memory_group; + std::unique_ptr<kernels::CpuDirectConv3dKernel> _conv_kernel; + std::unique_ptr<CpuActivation> _activationlayer_function; + Tensor _accumulator; + bool _is_activationlayer_enabled{false}; + unsigned int _dim_split{0}; +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_DIRECTCONV3D_H */ diff --git a/src/cpu/operators/CpuElementwise.cpp b/src/cpu/operators/CpuElementwise.cpp new file mode 100644 index 0000000000..c2ae8773c6 --- /dev/null +++ b/src/cpu/operators/CpuElementwise.cpp @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/operators/CpuElementwise.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/cpu/kernels/CpuElementwiseKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +void CpuElementwiseBase::run(ITensorPack &tensors) +{ + // If the kernel has been configured, use the window from the kernel. 
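+    // Otherwise, derive the execution window on the fly from the broadcast output shape of the two inputs (the fallback path below).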
+ if (_kernel->is_window_configured()) + { + ICpuOperator::run(tensors); + return; + } + + auto src0_info = tensors.get_const_tensor(TensorType::ACL_SRC_0)->info(); + auto src1_info = tensors.get_const_tensor(TensorType::ACL_SRC_1)->info(); + auto shape_and_window = compute_output_shape_and_window(src0_info->tensor_shape(), src1_info->tensor_shape()); + ICpuOperator::run(tensors, shape_and_window.second); +} + +template <ArithmeticOperation op> +void CpuElementwiseArithmetic<op>::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) +{ + ARM_COMPUTE_LOG_PARAMS(src0, src1, dst); + auto k = std::make_unique<kernels::CpuArithmeticKernel>(); + k->configure(op, src0, src1, dst); + _kernel = std::move(k); +} + +template <ArithmeticOperation op> +Status CpuElementwiseArithmetic<op>::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) +{ + return kernels::CpuArithmeticKernel::validate(op, src0, src1, dst); +} + +template class CpuElementwiseArithmetic<ArithmeticOperation::MAX>; +template class CpuElementwiseArithmetic<ArithmeticOperation::MIN>; +template class CpuElementwiseArithmetic<ArithmeticOperation::SQUARED_DIFF>; +template class CpuElementwiseArithmetic<ArithmeticOperation::PRELU>; + +void CpuElementwiseDivision::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) +{ + ARM_COMPUTE_LOG_PARAMS(src0, src1, dst); + auto k = std::make_unique<kernels::CpuDivisionKernel>(); + k->configure(src0, src1, dst); + _kernel = std::move(k); +} + +Status CpuElementwiseDivision::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) +{ + return kernels::CpuDivisionKernel::validate(src0, src1, dst); +} + +void CpuElementwisePower::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) +{ + ARM_COMPUTE_LOG_PARAMS(src0, src1, dst); + auto k = std::make_unique<kernels::CpuPowerKernel>(); + k->configure(src0, src1, dst); + _kernel = std::move(k); +} + +Status CpuElementwisePower::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) +{ + return kernels::CpuPowerKernel::validate(src0, src1, dst); +} + +template <ComparisonOperation COP> +void CpuElementwiseComparisonStatic<COP>::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst) +{ + ARM_COMPUTE_LOG_PARAMS(src0, src1, dst); + auto k = std::make_unique<kernels::CpuComparisonKernel>(); + k->configure(COP, src0, src1, dst); + _kernel = std::move(k); +} + +template <ComparisonOperation COP> +Status +CpuElementwiseComparisonStatic<COP>::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) +{ + return kernels::CpuComparisonKernel::validate(COP, src0, src1, dst); +} + +void CpuElementwiseComparison::configure(const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + ComparisonOperation op) +{ + ARM_COMPUTE_LOG_PARAMS(src0, src1, dst); + auto k = std::make_unique<kernels::CpuComparisonKernel>(); + k->configure(op, src0, src1, dst); + _kernel = std::move(k); +} + +Status CpuElementwiseComparison::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + ComparisonOperation op) +{ + return kernels::CpuComparisonKernel::validate(op, src0, src1, dst); +} + +// Supported Specializations +template class CpuElementwiseComparisonStatic<ComparisonOperation::Equal>; +template class CpuElementwiseComparisonStatic<ComparisonOperation::NotEqual>; +template class CpuElementwiseComparisonStatic<ComparisonOperation::Greater>; 
+template class CpuElementwiseComparisonStatic<ComparisonOperation::GreaterEqual>; +template class CpuElementwiseComparisonStatic<ComparisonOperation::Less>; +template class CpuElementwiseComparisonStatic<ComparisonOperation::LessEqual>; +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuElementwise.h b/src/cpu/operators/CpuElementwise.h new file mode 100644 index 0000000000..5db53c8026 --- /dev/null +++ b/src/cpu/operators/CpuElementwise.h @@ -0,0 +1,186 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_ELEMENTWISE_H +#define ARM_COMPUTE_CPU_ELEMENTWISE_H + +#include "src/cpu/ICpuOperator.h" + +namespace arm_compute +{ +namespace cpu +{ +class CpuElementwiseBase : public ICpuOperator +{ +public: + // Inherited methods overridden: + void run(ITensorPack &tensors) override; +}; +/** Class to run @ref cpu::kernels::CpuArithmeticKernel except for division and power + * + * @note Max/Min/Squared difference support input data types of QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32 + * @note PRelu supports input data types of QASYMM8/QASYMM8_SIGNED/F16/F32. + */ +template <ArithmeticOperation op> +class CpuElementwiseArithmetic : public CpuElementwiseBase +{ +public: + /** Configure the operator + * + * @param[in] src0 The first source tensor information. + * @param[in] src1 The second source tensor information. With PRelu, this is used as the alpha tensor. + * @param[out] dst The output tensor information.
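+ * + * A minimal sketch using the @ref CpuElementwiseMax alias defined below (illustrative only: the tensor-info and ITensor objects are assumptions, not part of this patch): + * @code + * cpu::CpuElementwiseMax max_op; + * max_op.configure(&src0_info, &src1_info, &dst_info); + * ITensorPack pack{{TensorType::ACL_SRC_0, &src0}, {TensorType::ACL_SRC_1, &src1}, {TensorType::ACL_DST, &dst}}; + * max_op.run(pack); + * @endcode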
+ */ + void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuElementwiseArithmetic::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); +}; + +/** Class to run @ref cpu::kernels::CpuArithmeticKernel for the maximum operation */ +using CpuElementwiseMax = CpuElementwiseArithmetic<ArithmeticOperation::MAX>; +/** Class to run @ref cpu::kernels::CpuArithmeticKernel for the minimum operation */ +using CpuElementwiseMin = CpuElementwiseArithmetic<ArithmeticOperation::MIN>; +/** Class to run @ref cpu::kernels::CpuArithmeticKernel for the squared difference operation */ +using CpuElementwiseSquaredDiff = CpuElementwiseArithmetic<ArithmeticOperation::SQUARED_DIFF>; + +/** Basic function to run @ref cpu::kernels::CpuArithmeticKernel for division + * + * @note The tensor data type for the inputs must be S32/F16/F32. + * @note The function performs a division operation between two tensors (i.e., out[i] = in1[i] / in2[i]) + */ +class CpuElementwiseDivision : public CpuElementwiseBase +{ +public: + /** Initialise the kernel's inputs and dst. + * + * @param[in, out] src0 First tensor input info. Data types supported: S32/F16/F32. + * @param[in, out] src1 Second tensor input info. Data types supported: Same as @p src0. + * @param[out] dst Output tensor info. Data types supported: Same as @p src0. + */ + void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuElementwiseDivision::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); +}; + +/** Basic function to run @ref cpu::kernels::CpuArithmeticKernel for power + * + * @note The tensor data type for the inputs must be F16/F32. + * @note The function performs an elementwise power of in1 to in2 (i.e., out[i] = in1[i] ^ in2[i]) + * @note For an exponent that is a float, this function will only work with a positive base. + */ +class CpuElementwisePower : public CpuElementwiseBase +{ +public: + /** Initialise the kernel's inputs and dst. + * + * @param[in, out] src0 First tensor input info. Data types supported: F16/F32. + * @param[in, out] src1 Second tensor input info. Data types supported: Same as @p src0. + * @param[out] dst Output tensor info. Data types supported: Same as @p src0. + */ + void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuElementwisePower::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); +}; + +/** Basic function to run @ref cpu::kernels::CpuComparisonKernel. + * + * @note The tensor data type for the inputs must be QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. + * @note The function performs a comparison operation between two tensors. + */ +class CpuElementwiseComparison : public CpuElementwiseBase +{ +public: + /** Initialise the kernel's inputs and dst. + * + * @param[in, out] src0 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
+ * @param[in, out] src1 Second tensor input info. Data types supported: Same as @p src0. + * @param[out] dst Output tensor info. Data types supported: U8. + * @param[in] op Comparison operation to be performed. + */ + void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ComparisonOperation op); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuElementwiseComparison::configure() + * + * @return a status + */ + static Status + validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ComparisonOperation op); +}; + +/** Basic function to run @ref cpu::kernels::CpuComparisonKernel + * + * @note The tensor data type for the inputs must be QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. + * @note The function performs a comparison operation between two tensors. + */ +template <ComparisonOperation op> +class CpuElementwiseComparisonStatic : public CpuElementwiseBase +{ +public: + /** Initialise the kernel's inputs and dst. + * + * @param[in, out] src0 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. + * @param[in, out] src1 Second tensor input info. Data types supported: Same as @p src0. + * @param[out] dst Output tensor info. Data types supported: U8. + */ + void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuElementwiseComparisonStatic::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); +}; + +/** Basic function to run equal comparison. */ +using NEEqual = CpuElementwiseComparisonStatic<ComparisonOperation::Equal>; +/** Basic function to run not equal comparison. */ +using NENotEqual = CpuElementwiseComparisonStatic<ComparisonOperation::NotEqual>; +/** Basic function to run greater comparison. */ +using NEGreater = CpuElementwiseComparisonStatic<ComparisonOperation::Greater>; +/** Basic function to run greater-equal comparison. */ +using NEGreaterEqual = CpuElementwiseComparisonStatic<ComparisonOperation::GreaterEqual>; +/** Basic function to run less comparison. */ +using NELess = CpuElementwiseComparisonStatic<ComparisonOperation::Less>; +/** Basic function to run less-equal comparison. */ +using NELessEqual = CpuElementwiseComparisonStatic<ComparisonOperation::LessEqual>; +} // namespace cpu +} // namespace arm_compute + +#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_H */ diff --git a/src/cpu/operators/CpuElementwiseUnary.cpp b/src/cpu/operators/CpuElementwiseUnary.cpp new file mode 100644 index 0000000000..04ab7bf8f5 --- /dev/null +++ b/src/cpu/operators/CpuElementwiseUnary.cpp @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/operators/CpuElementwiseUnary.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/cpu/kernels/CpuElementwiseUnaryKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +using KernelType = kernels::CpuElementwiseUnaryKernel; + +void CpuElementwiseUnary::configure(ElementWiseUnary op, const ITensorInfo &src, ITensorInfo &dst) +{ + ARM_COMPUTE_LOG_PARAMS(op, src, dst); + auto k = std::make_unique<KernelType>(); + k->configure(op, src, dst); + _kernel = std::move(k); +} + +Status CpuElementwiseUnary::validate(ElementWiseUnary op, const ITensorInfo &src, const ITensorInfo &dst) +{ + return KernelType::validate(op, src, dst); +} + +void CpuElementwiseUnary::run(ITensorPack &tensors) +{ + if (_kernel->is_window_configured()) + { + ICpuOperator::run(tensors); + return; + } + + auto src_info = tensors.get_const_tensor(TensorType::ACL_SRC)->info(); + ICpuOperator::run(tensors, compute_output_shape_and_window(src_info->tensor_shape()).second); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuElementwiseUnary.h b/src/cpu/operators/CpuElementwiseUnary.h new file mode 100644 index 0000000000..1e51bfaa1c --- /dev/null +++ b/src/cpu/operators/CpuElementwiseUnary.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_H +#define ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_H + +#include "arm_compute/core/Types.h" + +#include "src/cpu/ICpuOperator.h" + +namespace arm_compute +{ +namespace cpu +{ +class CpuElementwiseUnary : public ICpuOperator +{ +public: + /** Initialize the function + * + * @param[in] op Unary operation to execute + * @param[in] src Input tensor information. Data types supported: F16/F32 (F16/F32/S32 for NEG/ABS operations). + * @param[out] dst Output tensor information. Data types supported: Same as @p src.
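+ * + * A minimal sketch for a NEG operation (illustrative only: the tensor-info and ITensor objects are assumptions, not part of this patch): + * @code + * cpu::CpuElementwiseUnary neg_op; + * neg_op.configure(ElementWiseUnary::NEG, src_info, dst_info); + * ITensorPack pack{{TensorType::ACL_SRC, &src}, {TensorType::ACL_DST, &dst}}; + * neg_op.run(pack); + * @endcode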
+ */ + void configure(ElementWiseUnary op, const ITensorInfo &src, ITensorInfo &dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuElementwiseUnary::configure() + * + * @return a status + */ + static Status validate(ElementWiseUnary op, const ITensorInfo &src, const ITensorInfo &dst); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; +}; + +} // namespace cpu +} // namespace arm_compute + +#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_H */ diff --git a/src/cpu/operators/CpuFill.cpp b/src/cpu/operators/CpuFill.cpp new file mode 100644 index 0000000000..1890d0b916 --- /dev/null +++ b/src/cpu/operators/CpuFill.cpp @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/operators/CpuFill.h" + +#include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuFillKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +void CpuFill::configure(const ITensorInfo *tensor, PixelValue constant_value) +{ + ARM_COMPUTE_LOG_PARAMS(tensor, constant_value); + auto k = std::make_unique<kernels::CpuFillKernel>(); + k->configure(tensor, constant_value); + _kernel = std::move(k); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuFill.h b/src/cpu/operators/CpuFill.h new file mode 100644 index 0000000000..cb83745d29 --- /dev/null +++ b/src/cpu/operators/CpuFill.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2021,2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_FILL_H +#define ARM_COMPUTE_CPU_FILL_H + +#include "arm_compute/core/PixelValue.h" + +#include "src/cpu/ICpuOperator.h" + +namespace arm_compute +{ +namespace cpu +{ +/** Basic function to run @ref kernels::CpuFillKernel */ +class CpuFill : public ICpuOperator +{ +public: + /** Configure operator for a given list of arguments + * + * @param[in,out] tensor Tensor to fill. Supported data types: All + * @param[in] constant_value The value used to fill the planes of the tensor + */ + void configure(const ITensorInfo *tensor, PixelValue constant_value); +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_FILL_H */ diff --git a/src/cpu/operators/CpuFlatten.cpp b/src/cpu/operators/CpuFlatten.cpp new file mode 100644 index 0000000000..2609d44590 --- /dev/null +++ b/src/cpu/operators/CpuFlatten.cpp @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2021, 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/operators/CpuFlatten.h" + +#include "src/common/utils/Log.h" +#include "src/cpu/operators/CpuReshape.h" + +namespace arm_compute +{ +namespace cpu +{ +CpuFlatten::CpuFlatten() : _reshape(nullptr) +{ +} + +CpuFlatten::~CpuFlatten() = default; + +void CpuFlatten::configure(const ITensorInfo *src, ITensorInfo *dst) +{ + ARM_COMPUTE_LOG_PARAMS(src, dst); + _reshape = std::make_unique<CpuReshape>(); + _reshape->configure(src, dst); +} + +Status CpuFlatten::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + return CpuReshape::validate(src, dst); +} + +void CpuFlatten::run(ITensorPack &tensors) +{ + _reshape->run(tensors); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuFlatten.h b/src/cpu/operators/CpuFlatten.h new file mode 100644 index 0000000000..911760dd95 --- /dev/null +++ b/src/cpu/operators/CpuFlatten.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2021, 2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_OPERATORS_CPUFLATTEN_H +#define ACL_SRC_CPU_OPERATORS_CPUFLATTEN_H + +#include "src/cpu/ICpuOperator.h" + +namespace arm_compute +{ +namespace cpu +{ +class CpuReshape; +/** Basic function to flatten a given input */ +class CpuFlatten : public ICpuOperator +{ +public: + /** Constructor */ + CpuFlatten(); + /** Destructor */ + ~CpuFlatten(); + /** Configure operator for a given list of arguments + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |All |All | + * + * @param[in] src Source tensor to flatten with at least 3 dimensions. + * The dimensions above the third will be interpreted as batches. Data types supported: All + * @param[out] dst Destination tensor with shape [w*h*d, input_batches] where: + * w = input tensor width, h = input tensor height and d = input tensor depth. + * Data type supported: same as @p src + */ + void configure(const ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuFlatten::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + +private: + std::unique_ptr<CpuReshape> _reshape; +}; +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_OPERATORS_CPUFLATTEN_H diff --git a/src/cpu/operators/CpuFloor.cpp b/src/cpu/operators/CpuFloor.cpp new file mode 100644 index 0000000000..a107393b01 --- /dev/null +++ b/src/cpu/operators/CpuFloor.cpp @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/operators/CpuFloor.h" + +#include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuFloorKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +void CpuFloor::configure(const ITensorInfo *src, ITensorInfo *dst) +{ + ARM_COMPUTE_LOG_PARAMS(src, dst); + auto k = std::make_unique<kernels::CpuFloorKernel>(); + k->configure(src, dst); + _kernel = std::move(k); +} + +Status CpuFloor::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + return kernels::CpuFloorKernel::validate(src, dst); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuFloor.h b/src/cpu/operators/CpuFloor.h new file mode 100644 index 0000000000..6082f98867 --- /dev/null +++ b/src/cpu/operators/CpuFloor.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_FLOOR_H +#define ARM_COMPUTE_CPU_FLOOR_H + +#include "src/cpu/ICpuOperator.h" + +namespace arm_compute +{ +namespace cpu +{ +/** Basic function to run @ref kernels::CpuFloorKernel */ +class CpuFloor : public ICpuOperator +{ +public: + /** Configure operator for a given list of arguments + * + * @param[in] src Source tensor info. Data types supported: F16/F32. + * @param[out] dst Destination tensor info. Data type supported: same as @p src + */ + void configure(const ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuFloor::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_FLOOR_H */ diff --git a/src/cpu/operators/CpuFullyConnected.cpp b/src/cpu/operators/CpuFullyConnected.cpp new file mode 100644 index 0000000000..85a0b0311b --- /dev/null +++ b/src/cpu/operators/CpuFullyConnected.cpp @@ -0,0 +1,590 @@ +/* + * Copyright (c) 2021-2023 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/operators/CpuFullyConnected.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensorPack.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/core/utils/quantization/AsymmHelpers.h" +#include "src/cpu/kernels/CpuTransposeKernel.h" +#include "src/cpu/operators/CpuConvertFullyConnectedWeights.h" +#include "src/cpu/operators/CpuFlatten.h" +#include "src/cpu/operators/CpuGemm.h" +#include "src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h" +#include "src/cpu/utils/CpuAuxTensorHandler.h" + +namespace arm_compute +{ +namespace cpu +{ +using namespace arm_compute::experimental; +using namespace arm_compute::misc::shape_calculator; + +namespace +{ +Status get_gemmlowp_output_stage_info(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const ActivationLayerInfo &act, + GEMMLowpOutputStageInfo &gemmlowp_output_stage_info) +{ + const auto data_type = src->data_type(); + const QuantizationInfo oq_info = dst->quantization_info(); + const UniformQuantizationInfo iq_unif = src->quantization_info().uniform(); + const UniformQuantizationInfo wq_unif = weights->quantization_info().uniform(); + const UniformQuantizationInfo oq_unif = oq_info.uniform(); + + float multiplier = (iq_unif.scale * wq_unif.scale) / oq_unif.scale; + int32_t output_multiplier; + int32_t output_shift; + + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + + int32_t type_min = 0; + int32_t type_max = 0; + std::tie(type_min, type_max) = quantization::get_quantized_asymmetric_output_min_max(oq_info, act, data_type); + + gemmlowp_output_stage_info.gemmlowp_multiplier = output_multiplier; + gemmlowp_output_stage_info.gemmlowp_shift = output_shift; + gemmlowp_output_stage_info.gemmlowp_offset = oq_unif.offset; + gemmlowp_output_stage_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; + gemmlowp_output_stage_info.gemmlowp_min_bound = type_min; + gemmlowp_output_stage_info.gemmlowp_max_bound = type_max; + + return Status{}; +} + +Status validate_mm(const ITensorInfo 
*src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ActivationLayerInfo &act, + bool enable_fast_math, + WeightFormat weight_format) +{ + if (is_data_type_quantized_asymmetric(src->data_type())) + { + // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() + // Extract and negate src and weights offset + const QuantizationInfo src_quantization_info(src->quantization_info().uniform().scale, + -src->quantization_info().uniform().offset); + const QuantizationInfo weights_quantization_info(weights->quantization_info().uniform().scale, + -weights->quantization_info().uniform().offset); + + GEMMLowpOutputStageInfo gemmlowp_output_stage_info; + ARM_COMPUTE_RETURN_ON_ERROR(get_gemmlowp_output_stage_info(src, weights, dst, act, gemmlowp_output_stage_info)); + + GEMMInfo gemm_info; + gemm_info.set_gemmlowp_output_stage(gemmlowp_output_stage_info); + gemm_info.set_fast_math(enable_fast_math); + + // Validate gemmlowp function + TensorInfo src_info = src->clone()->set_quantization_info(src_quantization_info); + TensorInfo weights_info = weights->clone()->set_quantization_info(weights_quantization_info); + ARM_COMPUTE_RETURN_ON_ERROR( + CpuGemmLowpMatrixMultiplyCore::validate(&src_info, &weights_info, biases, dst, gemm_info)); + } + else + { + GEMMInfo gemm_info; + gemm_info.set_weight_format(weight_format); + gemm_info.set_fixed_format(weight_format != WeightFormat::UNSPECIFIED); + gemm_info.set_fast_math(enable_fast_math); + ARM_COMPUTE_RETURN_ON_ERROR(CpuGemm::validate(src, weights, biases, dst, 1.f, 1.0f, gemm_info)); + } + + return Status{}; +} +} // namespace + +CpuFullyConnected::CpuFullyConnected() + : _flatten(nullptr), + _convert_weights(nullptr), + _transpose_weights(nullptr), + _mm_gemm(nullptr), + _mm_gemmlowp(nullptr), + _flattened_src(), + _converted_weights(), + _reshaped_weights(), + _trans_weights(), + _trans_weights_idx(AuxTensorIdx::Count), + _aux_mem(Count), + _needs_weights_conversion(false), + _needs_weights_reshape(false), + _is_fc_after_conv(false), + _is_quantized_asymmetric(false), + _is_prepared(false), + _enable_fast_math(false), + _fixed_format(false), + _weight_format(arm_compute::WeightFormat::UNSPECIFIED), + _dynamic_weights(false) +{ +} + +CpuFullyConnected::~CpuFullyConnected() = default; + +void CpuFullyConnected::configure_mm(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ActivationLayerInfo &act) +{ + if (_is_quantized_asymmetric) + { + // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() + // Extract and negate src and weights offset + const QuantizationInfo src_quantization_info(src->quantization_info().uniform().scale, + -src->quantization_info().uniform().offset); + const QuantizationInfo weights_quantization_info(weights->quantization_info().uniform().scale, + -weights->quantization_info().uniform().offset); + + TensorInfo src_info = src->clone()->set_quantization_info(src_quantization_info); + TensorInfo weights_info = weights->clone()->set_quantization_info(weights_quantization_info); + + // Configure gemmlowp function and output stage for asymmetric quantized types + GEMMLowpOutputStageInfo gemmlowp_output_stage_info; + const Status status = + get_gemmlowp_output_stage_info(&src_info, &weights_info, dst, act, gemmlowp_output_stage_info); + ARM_COMPUTE_ERROR_ON(status.error_code() != ErrorCode::OK); + + GEMMInfo gemm_info; + 
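// Propagate the requantization stage, fused activation and fast-math preference to the gemmlowp operator configured below. +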
gemm_info.set_gemmlowp_output_stage(gemmlowp_output_stage_info); + gemm_info.set_activation_info(act); + gemm_info.set_fast_math(_enable_fast_math); + _mm_gemmlowp = std::make_unique<CpuGemmLowpMatrixMultiplyCore>(); + _mm_gemmlowp->configure(&src_info, &weights_info, biases, dst, gemm_info); + } + else + { + // Configure matrix multiply kernel + GEMMInfo gemm_info; + gemm_info.set_activation_info(act); + gemm_info.set_fast_math(_enable_fast_math); + gemm_info.set_fixed_format(_fixed_format); + gemm_info.set_weight_format(_weight_format); + _mm_gemm = std::make_unique<CpuGemm>(); + _mm_gemm->configure(src, weights, biases, dst, 1.f, 1.0f, gemm_info); + } +} + +void CpuFullyConnected::configure_conv_fc(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ActivationLayerInfo &act) +{ + ARM_COMPUTE_ERROR_ON((weights->dimension(1) != (src->dimension(0) * src->dimension(1) * src->dimension(2)))); + + // If the fully connected layer is called after a convolution layer, the src tensor must be linearized + + // Initialize output tensor for flatten + auto_init_if_empty(_flattened_src, src->clone()->set_tensor_shape(compute_flatten_shape(src))); + + _flatten = std::make_unique<CpuFlatten>(); + _flatten->configure(src, &_flattened_src); + + // Configure matrix multiply kernel + configure_mm(&_flattened_src, weights, biases, dst, act); +} + +void CpuFullyConnected::configure_fc_fc(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ActivationLayerInfo &act) +{ + ARM_COMPUTE_ERROR_ON(src->dimension(0) != weights->dimension(1)); + + // Configure matrix multiply kernel + configure_mm(src, weights, biases, dst, act); +} + +void CpuFullyConnected::configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + FullyConnectedLayerInfo fc_info, + const WeightsInfo &weights_info) +{ + // Perform validate step + ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); + ARM_COMPUTE_ERROR_THROW_ON( + CpuFullyConnected::validate(src, weights, biases != nullptr ? biases : nullptr, dst, fc_info, weights_info)); + ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, fc_info); + + _needs_weights_conversion = false; + _needs_weights_reshape = fc_info.transpose_weights ? 
!fc_info.are_weights_reshaped : false; + _needs_weights_reshape = _needs_weights_reshape && !fc_info.retain_internal_weights; + _is_fc_after_conv = true; + _is_quantized_asymmetric = is_data_type_quantized_asymmetric(src->data_type()); + _is_prepared = false; + _trans_weights_idx = AuxTensorIdx::Count; + _enable_fast_math = fc_info.enable_fast_math; + _fixed_format = weights_info.weight_format() != WeightFormat::UNSPECIFIED; + _weight_format = weights_info.weight_format(); + _dynamic_weights = !weights->are_values_constant() && _needs_weights_reshape; + + // With the Fully Connected layer we can have 4 different cases: + // 1) Convolution layer -> Fully Connected layer without batches + // 2) Fully Connected layer -> Fully Connected layer without batches + // 3) Convolution layer -> Fully Connected layer with batches + // 4) Fully Connected layer -> Fully Connected layer with batches + + const ITensorInfo *weights_to_use = weights; + + // Check if we have a fully connected layer with batches + const bool is_batched_fc_layer = dst->dimension(1) > 1; + if (is_batched_fc_layer) + { + _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && + (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(), + dst->tensor_shape().cbegin() + 1)); + } + else + { + _is_fc_after_conv = src->num_dimensions() > 1; + } + + // Reshape weights if needed + if (_needs_weights_reshape) + { + // Reshape the weights + _transpose_weights = std::make_unique<kernels::CpuTransposeKernel>(); + _transpose_weights->configure(weights, &_reshaped_weights); + _reshaped_weights.set_are_values_constant(weights->are_values_constant()); + + weights_to_use = &_reshaped_weights; + _trans_weights_idx = AuxTensorIdx::TransposedWeights; + } + + // Convert weights if needed + if (_is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout)) + { + // Convert weights + _convert_weights = std::make_unique<CpuConvertFullyConnectedWeights>(); + _convert_weights->configure(weights_to_use, &_converted_weights, src->tensor_shape(), + fc_info.weights_trained_layout); + _converted_weights.set_are_values_constant(weights_to_use->are_values_constant()); + + weights_to_use = &_converted_weights; + _needs_weights_conversion = true; + _trans_weights_idx = AuxTensorIdx::ConvertedWeights; + } + + if (_is_fc_after_conv) + { + // Fully Connected layer after a Convolution Layer without batches + configure_conv_fc(src, weights_to_use, biases, dst, fc_info.activation_info); + } + else + { + // Fully Connected layer after a Fully Connected Layer without batches + configure_fc_fc(src, weights_to_use, biases, dst, fc_info.activation_info); + } + + // Retain the tensorinfo with the weights to use + if (_needs_weights_reshape || _needs_weights_conversion) + { + _trans_weights = *weights_to_use; + } + + // Set auxiliary memory requirements + auto gemm_mem_req = (_is_quantized_asymmetric) ? _mm_gemmlowp->workspace() : _mm_gemm->workspace(); + for (unsigned int i = 0; i < gemm_mem_req.size(); ++i) + { + _aux_mem[i] = gemm_mem_req[i]; + } + + if (_aux_mem[Pretranspose].size > 0) + { + // Release permuted weights at the end of prepare as they are further transposed by the assembly dispatch + // Do not release them if biases are dynamic and data type is quantized, since the weights tensor will be used for biases offset calculation + // Keep all the auxiliary tensors in case of dynamic weights as they are recalculated every time. + _aux_mem[TransposedWeights] = MemoryInfo( + offset_int_vec(TransposedWeights), + _dynamic_weights ? 
MemoryLifetime::Temporary + : (_is_quantized_asymmetric && biases && !(biases->are_values_constant())) ? MemoryLifetime::Persistent + : MemoryLifetime::Prepare, + _reshaped_weights.total_size()); + + _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights), + _dynamic_weights ? MemoryLifetime::Temporary : MemoryLifetime::Prepare, + _converted_weights.total_size()); + } + else + { + _aux_mem[TransposedWeights] = MemoryInfo(offset_int_vec(TransposedWeights), + _dynamic_weights ? MemoryLifetime::Temporary + : _needs_weights_conversion ? MemoryLifetime::Prepare + : MemoryLifetime::Persistent, + _reshaped_weights.total_size()); + + _aux_mem[ConvertedWeights] = MemoryInfo( + offset_int_vec(ConvertedWeights), _dynamic_weights ? MemoryLifetime::Temporary : MemoryLifetime::Persistent, + _converted_weights.total_size()); + } + _aux_mem[FlattenedSrc] = + MemoryInfo(offset_int_vec(FlattenedSrc), MemoryLifetime::Temporary, _flattened_src.total_size()); +} + +Status CpuFullyConnected::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + FullyConnectedLayerInfo fc_info, + WeightsInfo weights_info) +{ + GEMMInfo gemm_info; + gemm_info.set_activation_info(fc_info.activation_info); + gemm_info.set_fast_math(fc_info.enable_fast_math); + gemm_info.set_fixed_format(weights_info.weight_format() != WeightFormat::UNSPECIFIED); + gemm_info.set_weight_format(weights_info.weight_format()); + + return CpuGemm::has_opt_impl(expected_weight_format, src, weights, biases, dst, gemm_info); +} + +Status CpuFullyConnected::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + FullyConnectedLayerInfo fc_info, + const WeightsInfo &weights_info) +{ + ARM_COMPUTE_UNUSED(fc_info.retain_internal_weights); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); + + if (is_fixed_format_fast_math(weights_info.weight_format())) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(src, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(weights, DataType::BFLOAT16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(dst, DataType::F32); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights, dst); + } + + ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2); + ARM_COMPUTE_RETURN_ERROR_ON( + fc_info.activation_info.enabled() && is_data_type_quantized(src->data_type()) && + fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU && + fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && + fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU); + + bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true; + bool is_fc_after_conv = true; + + const ITensorInfo &flatten_src = + TensorInfo(src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(src))); + const ITensorInfo &reshaped_weights = TensorInfo( + weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights))); + const ITensorInfo &converted_weights = weights_reshaped + ? 
TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) + : TensorInfo(*reshaped_weights.clone()); + + // With the Fully Connected layer we can have 4 different cases: + // 1) Convolution layer -> Fully Connected layer without batches + // 2) Fully Connected layer -> Fully Connected layer without batches + // 3) Convolution layer -> Fully Connected layer with batches + // 4) Fully Connected layer -> Fully Connected layer with batches + + const ITensorInfo *src_to_use = src; + const ITensorInfo *weights_to_use = weights; + + // Check if we have a fully connected layer with batches + const bool is_batched_fc_layer = dst->dimension(1) > 1; + + if (biases != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); + if (is_data_type_quantized(src->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases); + } + } + + if (is_batched_fc_layer) + { + is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && + (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(), + dst->tensor_shape().cbegin() + 1)); + } + else + { + is_fc_after_conv = src->num_dimensions() > 1; + } + + if (!weights_reshaped) + { + // Validate reshape weights kernel + ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuTransposeKernel::validate(weights, &reshaped_weights)); + weights_to_use = &reshaped_weights; + } + + if (is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout)) + { + // Validate convert weights kernel + ARM_COMPUTE_RETURN_ON_ERROR(CpuConvertFullyConnectedWeights::validate( + weights_to_use, &converted_weights, src->tensor_shape(), fc_info.weights_trained_layout)); + weights_to_use = &converted_weights; + } + + if (is_fc_after_conv) + { + // Fully Connected layer after a Convolution Layer without batches + ARM_COMPUTE_RETURN_ERROR_ON( + (weights_to_use->dimension(1) != (src->dimension(0) * src->dimension(1) * src->dimension(2)))); + + // Validate flatten kernel + ARM_COMPUTE_RETURN_ON_ERROR(CpuFlatten::validate(src, &flatten_src)); + src_to_use = &flatten_src; + } + else + { + // Fully Connected layer after a Fully Connected Layer without batches + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != weights_to_use->dimension(1)); + } + // Validate matrix multiply kernel + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(src_to_use, weights_to_use, biases, dst, fc_info.activation_info, + fc_info.enable_fast_math, weights_info.weight_format())); + + return Status{}; +} + +void CpuFullyConnected::run(ITensorPack &tensors) +{ + prepare(tensors); + +#ifdef ARM_COMPUTE_ASSERTS_ENABLED + ++_asrt_run_count; + ARM_COMPUTE_ERROR_ON(_dynamic_weights && _asrt_prepare_count != _asrt_run_count); +#endif // ARM_COMPUTE_ASSERTS_ENABLED + + auto src = tensors.get_const_tensor(ACL_SRC_0); + + CpuAuxTensorHandler flattened_src(offset_int_vec(FlattenedSrc), _flattened_src, tensors, false); + CpuAuxTensorHandler transformed_wei(offset_int_vec(_trans_weights_idx), _trans_weights, tensors, false); + + // Linearize src if it comes from a convolutional layer + if (_is_fc_after_conv) + { + ITensorPack flatten_pack{{ACL_SRC, src}, {ACL_DST, flattened_src.get()}}; + _flatten->run(flatten_pack); + } + + ITensorPack gemm_pack = tensors; + gemm_pack.add_const_tensor(ACL_SRC_0, (_is_fc_after_conv) ? 
flattened_src.get() : src); + if (_needs_weights_reshape || _needs_weights_conversion) + { + gemm_pack.add_const_tensor(ACL_SRC_1, transformed_wei.get()); + } + + // Run matrix multiply + if (_is_quantized_asymmetric) + { + _mm_gemmlowp->run(gemm_pack); + } + else + { + _mm_gemm->run(gemm_pack); + } +} + +void CpuFullyConnected::prepare(ITensorPack &tensors) +{ + if (!_is_prepared || _dynamic_weights) + { +#ifdef ARM_COMPUTE_ASSERTS_ENABLED + ++_asrt_prepare_count; + ARM_COMPUTE_ERROR_ON(!_dynamic_weights && _asrt_prepare_count > 1); +#endif // ARM_COMPUTE_ASSERTS_ENABLED + + auto weights = tensors.get_const_tensor(ACL_SRC_1); + + CpuAuxTensorHandler reshaped_weights(offset_int_vec(TransposedWeights), _reshaped_weights, tensors, false); + CpuAuxTensorHandler converted_weights(offset_int_vec(ConvertedWeights), _converted_weights, tensors, false); + + // Pointer to current weights + const ITensor *cur_weights = weights; + + // Reshape of the weights (happens only once) + if (_needs_weights_reshape) + { + // Run reshape weights kernel and mark weights as unused + ITensorPack transpose_pack{{ACL_SRC, weights}, {ACL_DST, reshaped_weights.get()}}; + NEScheduler::get().schedule_op(_transpose_weights.get(), Window::DimY, _transpose_weights->window(), + transpose_pack); + + cur_weights->mark_as_unused(); + cur_weights = reshaped_weights.get(); + } + + // Convert weights if needed (happens only once) + if (_needs_weights_conversion) + { + ITensorPack convert_pack{{ACL_SRC, cur_weights}, {ACL_DST, converted_weights.get()}}; + _convert_weights->run(convert_pack); + + cur_weights->mark_as_unused(); + cur_weights = converted_weights.get(); + } + + ITensorPack gemm_pack = tensors; + gemm_pack.add_const_tensor(ACL_SRC_1, cur_weights); + + // Prepare GEMM prepare and release unused weights + if (!_is_quantized_asymmetric) + { + _mm_gemm->prepare(gemm_pack); + } + else + { + _mm_gemmlowp->prepare(gemm_pack); + } + + _is_prepared = true; + } +} + +experimental::MemoryRequirements CpuFullyConnected::workspace() const +{ + return _aux_mem; +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuFullyConnected.h b/src/cpu/operators/CpuFullyConnected.h new file mode 100644 index 0000000000..b72f77e5c4 --- /dev/null +++ b/src/cpu/operators/CpuFullyConnected.h @@ -0,0 +1,194 @@ +/* + * Copyright (c) 2021-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ACL_SRC_CPU_OPERATORS_CPUFULLYCONNECTED_H +#define ACL_SRC_CPU_OPERATORS_CPUFULLYCONNECTED_H + +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/function_info/FullyConnectedLayerInfo.h" + +#include "src/cpu/ICpuOperator.h" + +#include <memory> + +namespace arm_compute +{ +namespace cpu +{ +// Forward declarations +class CpuConvertFullyConnectedWeights; +class CpuFlatten; +class CpuGemm; +class CpuGemmLowpMatrixMultiplyCore; +namespace kernels +{ +class CpuTransposeKernel; +} // namespace kernels +/** Basic function to compute a Fully Connected layer. This function calls the following kernels: + * -# @ref kernels::CpuIm2ColKernel (called when the input comes from a convolutional layer) + * -# @ref kernels::CpuTransposeKernel (if @p are_weights_reshaped is set to false and transpose_weights is set to true ) (called once) + * -# @ref CpuGemm or @ref CpuGemmLowpMatrixMultiplyCore (if quantized asymmetric) + * -# @ref kernels::CpuGemmMatrixAdditionKernel or @ref CpuGemmLowpOutputStage (if quantized asymmetric) (if @p biases is not equal to nullptr) + * + * @note The fully connected layer accepts "weights" tensors only with 2 dimensions. + */ +class CpuFullyConnected : public ICpuOperator +{ +public: + /** Constructor */ + CpuFullyConnected(); + /** Destructor */ + ~CpuFullyConnected(); + /** Set the input and output tensors. + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:------------------|:------|:--------------| + * |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 | + * |QASYMM8 |QASYMM8 |S32 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | + * + * @param[in] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] weights Weights tensor info. The weights must be 2 dimensional. + * If this function is called after a Convolution Layer, the (transposed) weights will have as many rows as the product of the first 3 input's dimensions. + * If it is called after another FullyConnected Layer, the (transposed) weights will have as many rows as the input's first dimension. + * Data type supported: Same as @p src. + * @param[in] biases Bias tensor info. Can be nullptr. Data type supported: Same as @p weights, S32 if @p weights is QASYMM8/QASYMM8_SIGNED. + * @param[out] dst Destination tensor info. Its shape should be equal to the output of a matrix multiplication between: + * - The output of im2col on the input and the (transposed) 2D weights, if the function is called after a Convolution Layer + * - The input tensor and the (transposed) 2D weights, if the function is called after another FullyConnected Layer. + * Data type supported: Same as @p src. 
+     * @param[in]  fc_info      (Optional) Fully connected layer additional info
+     * @param[in]  weights_info (Optional) Stores the necessary compute information when weights are already reshaped
+     */
+    void configure(const ITensorInfo      *src,
+                   const ITensorInfo      *weights,
+                   const ITensorInfo      *biases,
+                   ITensorInfo            *dst,
+                   FullyConnectedLayerInfo fc_info      = FullyConnectedLayerInfo(),
+                   const WeightsInfo      &weights_info = WeightsInfo());
+    /** Static function to check if given info will lead to a valid configuration of @ref CpuFullyConnected
+     *
+     * Similar to @ref CpuFullyConnected::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo      *src,
+                           const ITensorInfo      *weights,
+                           const ITensorInfo      *biases,
+                           const ITensorInfo      *dst,
+                           FullyConnectedLayerInfo fc_info      = FullyConnectedLayerInfo(),
+                           const WeightsInfo      &weights_info = WeightsInfo());
+
+    /** Static function that queries whether there exists a fixed-format kernel, and if one exists, returns in the first
+     * argument the format in which the weights are expected to be reshaped, as defined by the WeightFormat class. Apart
+     * from the first argument, the rest of the arguments are the same as in @ref CpuFullyConnected::validate(), except
+     * that all arguments are required.
+     *
+     * @return a status
+     */
+    static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+                               const ITensorInfo         *src,
+                               const ITensorInfo         *weights,
+                               const ITensorInfo         *biases,
+                               const ITensorInfo         *dst,
+                               FullyConnectedLayerInfo    fc_info,
+                               WeightsInfo                weights_info);
+
+    // Inherited methods overridden:
+    void                             run(ITensorPack &tensors) override;
+    void                             prepare(ITensorPack &tensors) override;
+    experimental::MemoryRequirements workspace() const override;
+
+private:
+    void configure_fc_fc(const ITensorInfo         *src,
+                         const ITensorInfo         *weights,
+                         const ITensorInfo         *biases,
+                         ITensorInfo               *dst,
+                         const ActivationLayerInfo &act);
+    void configure_conv_fc(const ITensorInfo         *src,
+                           const ITensorInfo         *weights,
+                           const ITensorInfo         *biases,
+                           ITensorInfo               *dst,
+                           const ActivationLayerInfo &act);
+    void configure_mm(const ITensorInfo         *src,
+                      const ITensorInfo         *weights,
+                      const ITensorInfo         *biases,
+                      ITensorInfo               *dst,
+                      const ActivationLayerInfo &act);
+
+    enum AuxTensorIdx
+    {
+        AsmGemmWorkspace = 0,
+        Pretranspose,
+        GemmTemp1,
+        GemmTemp2,
+        GemmTemp3,
+        GemmTemp4,
+        GemmTemp5,
+        GemmTemp6,
+        GemmTemp7,
+        GemmTemp8,
+        // Slots above (0-9) reserved for either CpuGemm or CpuGemmLowpMatrixMultiplyCore
+        TransposedWeights,
+        ConvertedWeights,
+        FlattenedSrc,
+        Count
+    };
+
+    std::unique_ptr<CpuFlatten>                      _flatten;
+    std::unique_ptr<CpuConvertFullyConnectedWeights> _convert_weights;
+    std::unique_ptr<kernels::CpuTransposeKernel>     _transpose_weights;
+    std::unique_ptr<CpuGemm>                         _mm_gemm;
+    std::unique_ptr<CpuGemmLowpMatrixMultiplyCore>   _mm_gemmlowp;
+
+    TensorInfo   _flattened_src;
+    TensorInfo   _converted_weights;
+    TensorInfo   _reshaped_weights;
+    TensorInfo   _trans_weights;
+    AuxTensorIdx _trans_weights_idx;
+
+    experimental::MemoryRequirements _aux_mem;
+
+    bool                      _needs_weights_conversion;
+    bool                      _needs_weights_reshape;
+    bool                      _is_fc_after_conv;
+    bool                      _is_quantized_asymmetric;
+    bool                      _is_prepared;
+    bool                      _enable_fast_math;
+    bool                      _fixed_format;
+    arm_compute::WeightFormat _weight_format;
+    bool                      _dynamic_weights;
+
+#ifdef ARM_COMPUTE_ASSERTS_ENABLED
+    int _asrt_run_count{};
+    int _asrt_prepare_count{};
+#endif // ARM_COMPUTE_ASSERTS_ENABLED
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif // ACL_SRC_CPU_OPERATORS_CPUFULLYCONNECTED_H
diff --git
a/src/cpu/operators/CpuGemm.cpp b/src/cpu/operators/CpuGemm.cpp
new file mode 100644
index 0000000000..905e86c185
--- /dev/null
+++ b/src/cpu/operators/CpuGemm.cpp
@@ -0,0 +1,567 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuGemm.h"
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/cpu/utils/CpuAuxTensorHandler.h"
+
+using namespace arm_compute::experimental;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace
+{
+cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info)
+{
+    cpu::AsmGemmInfo asm_info;
+    asm_info.method                  = cpu::AsmConvMethod::Im2Col;
+    asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d();
+    asm_info.depth_output_gemm3d     = info.depth_output_gemm3d();
+    asm_info.activation_info         = info.activation_info();
+    asm_info.fast_mode               = info.fast_math();
+    asm_info.fixed_format            = info.fixed_format();
+    asm_info.weight_format           = info.weight_format();
+    asm_info.accumulate              = info.accumulate();
+    asm_info.transpose_b             = info.pretranspose_B(); // The "pretranspose_B" flag here is not the same as the
+                                                              // pretranspose_B_array() method: it signals to
+                                                              // pretranspose_B_array() whether an additional transpose
+                                                              // of B should be performed before that method runs.
+
+    return asm_info;
+}
+} // namespace
+
+void CpuGemm::configure(const ITensorInfo *a,
+                        const ITensorInfo *b,
+                        const ITensorInfo *c,
+                        ITensorInfo       *d,
+                        float              alpha,
+                        float              beta,
+                        const GEMMInfo    &gemm_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d);
+    ARM_COMPUTE_ERROR_THROW_ON(CpuGemm::validate(a, b, c, d, alpha, beta, gemm_info));
+    ARM_COMPUTE_LOG_PARAMS(a, b, c, d, alpha, beta, gemm_info);
+
+    const cpu::AsmGemmInfo asm_info  = init_assembly_metadata(gemm_info);
+    const bool             is_c_bias = beta == 1 && c != nullptr;
+    const bool             run_optimised =
+        bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, (is_c_bias) ? c : nullptr, d, asm_info)) &&
+        (c == nullptr || beta == 0.f || beta == 1.f) && // Optimized GeMM doesn't support beta coefficient.
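+        // beta == 1 is still accepted on the optimized path because C is then treated as a bias
+        // (is_c_bias) and added separately, rather than being scaled inside the assembly GEMM.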
+ !(!b->are_values_constant() && + b->tensor_shape().z() > 1); // Disable batch matmul as optimized GeMM handles batching differently. + + // Check if we need to reshape the matrix B only on the first run + _is_prepared = false; + _reshape_b_only_on_first_run = b->are_values_constant(); + _run_vector_matrix_multiplication = a->dimension(1) < 2; + _run_alpha_scale = alpha != 1.f; + _run_bias_addition = is_c_bias; + _run_addition = beta != 0 && beta != 1 && c != nullptr; + _run_activation = + gemm_info.activation_info().enabled() && + (!run_optimised || + (run_optimised && !cpu::CpuGemmAssemblyDispatch::is_activation_supported(gemm_info.activation_info()))); + + if (run_optimised) + { + _run_interleave_transpose = false; + const ITensorInfo *c_to_use = is_c_bias ? c : nullptr; + _asm_glue = std::make_unique<cpu::CpuGemmAssemblyDispatch>(); + _asm_glue->configure(a, b, c_to_use, d, asm_info); + ARM_COMPUTE_ERROR_ON(!_asm_glue->is_configured()); + + const auto asm_mem_req = _asm_glue->workspace(); + for (unsigned int slot = 0; slot < asm_mem_req.size(); ++slot) + { + _aux_mem[slot] = asm_mem_req[slot]; + } + + // Scale product by alpha + if (_run_alpha_scale) + { + _alpha_scale_func = std::make_unique<cpu::CpuActivation>(); + _alpha_scale_func->configure( + d, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, alpha, 0.f)); + } + } + else + { + _run_interleave_transpose = !_run_vector_matrix_multiplication; + // Pick output tensor in case bias addition should be performed + ITensorInfo *gemm_output_to_use = (_run_bias_addition) ? &_tmp_d : d; + // Pick b tensor in case pretranspose should be performed + const ITensorInfo *b_to_use = b; + + _mm_kernel = std::make_unique<cpu::kernels::CpuGemmMatrixMultiplyKernel>(); + + // Configure rhs pretranspose + if (gemm_info.pretranspose_B()) + { + _pretranspose_b_func = std::make_unique<CpuTranspose>(); + _pretranspose_b_func->configure(b_to_use, &_pretransposed_b); + MemoryLifetime lifetime; + if (_reshape_b_only_on_first_run) + { + if (_run_interleave_transpose) + { + // PreTransposedRHS tensor is only used in prepare(), but is then succeeded by Transposed1xWRHS + // So PreTransposedRHS can be freed inside prepare() + lifetime = MemoryLifetime::Prepare; + } + else + { + // PreTransposedRHS tensor is only used in prepare(), but is the final transformation of rhs + // So PreTransposedRHS needs to persist beyond prepare() + lifetime = MemoryLifetime::Persistent; + } + } + else + { + // PreTransposedRHS tensor is always used in run() and doesn't need to persist + lifetime = MemoryLifetime::Temporary; + } + _aux_mem[PreTransposedRHS] = + MemoryInfo(offset_int_vec(PreTransposedRHS), lifetime, _pretransposed_b.total_size()); + b_to_use = &_pretransposed_b; + } + + // Select between GEMV and GEMM + if (_run_vector_matrix_multiplication) + { + // Configure the matrix multiply kernel + _mm_kernel->configure(a, b_to_use, gemm_output_to_use, alpha, false); + } + else + { + ARM_COMPUTE_ERROR_ON(!_run_interleave_transpose); + // Configure interleave kernel + _interleave_kernel = std::make_unique<cpu::kernels::CpuGemmInterleave4x4Kernel>(); + _interleave_kernel->configure(a, &_tmp_a); + _aux_mem[InterleavedLHS] = + MemoryInfo(offset_int_vec(InterleavedLHS), MemoryLifetime::Temporary, _tmp_a.total_size()); + + // Configure rhs transpose1xw kernel + _transpose1xW_b_kernel = std::make_unique<cpu::kernels::CpuGemmTranspose1xWKernel>(); + _transpose1xW_b_kernel->configure(b_to_use, &_tmp_b); + _aux_mem[Transposed1xWRHS] = + 
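+            // The 1xW-transposed B is the final RHS transformation on this path, so its buffer is
+            // kept persistent: it is filled once in prepare() when B is constant and reused on
+            // every subsequent run.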
MemoryInfo(offset_int_vec(Transposed1xWRHS), MemoryLifetime::Persistent, _tmp_b.total_size()); + + // Use a and b here instead of _tmp_a and _tmp_b because CpuGemmMatrixMultiplyKernel requires the original m,n,k in case of interleaved a and transposed1xw b + const int m = a->dimension(1); + const int n = b_to_use->dimension(0); + const int k = a->dimension(0); + + // Configure matrix multiplication kernel + _mm_kernel->configure(&_tmp_a, &_tmp_b, gemm_output_to_use, alpha, _run_interleave_transpose, + GEMMReshapeInfo(m, n, k)); + } + + if (_run_bias_addition) + { + _add_bias = std::make_unique<cpu::CpuAdd>(); + _add_bias->configure(gemm_output_to_use, c, d, ConvertPolicy::SATURATE); + _aux_mem[TempResult] = + MemoryInfo(offset_int_vec(TempResult), MemoryLifetime::Temporary, _tmp_d.total_size()); + } + } + + // Configure matrix addition kernel + if (_run_addition) + { + _ma_kernel = std::make_unique<cpu::kernels::CpuGemmMatrixAdditionKernel>(); + _ma_kernel->configure(c, d, beta); + } + + // Configure activation + if (_run_activation) + { + _activation_func = std::make_unique<cpu::CpuActivation>(); + _activation_func->configure(d, nullptr, gemm_info.activation_info()); + } +} + +Status CpuGemm::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *d, + float alpha, + float beta, + const GEMMInfo &gemm_info) +{ + ARM_COMPUTE_UNUSED(alpha); + // When using accumulation(in place summation), for now, the only supported values for alpha and beta are 1 respectively 0. + // Do the appropriate checks before proceeding. + if (gemm_info.accumulate()) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(alpha != 1, "Accumulation is not supported when alpha is different from 1"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (beta != 0 && c != nullptr), + "Accumulation is not supported when beta is different from 0 with a non-null bias matrix c"); + } + + const bool is_c_bias = beta == 1 && c != nullptr; + const bool run_addition = c != nullptr && beta != 0 && beta != 1; + // Check if we should use the pretransposed_b or original b + // TODO: COMPMID-6597 + // Note that this check should only apply to the non-optimized path. The reason we brought this at the beginning + // instead of only for the fallback path is because of the checks performed below, between here and the run_optimised decision + // We should simplify this by + // 1. Moving the checks between "fix-start" and "fix-end" into their corresponding ops / kernels (e.g. the weights format checks can and should be moved into CpuGemmAssemblyDispatch) + // 2. Moving this b_to_use check back into the non-optimized path + TensorInfo pretransposed_b = b->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*b)); + const ITensorInfo *b_to_use = gemm_info.pretranspose_B() ? 
&pretransposed_b : b;
+    // TODO: COMPMID-6597 fix-start
+
+    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(a);
+    ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(a);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::BFLOAT16, DataType::F16, DataType::F32);
+
+    if (is_fixed_format_fast_math(gemm_info.weight_format()))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(a, DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(b_to_use, DataType::BFLOAT16);
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b_to_use);
+    }
+
+    const int block_by = arm_compute::block_by(gemm_info.weight_format());
+    // Test whether im2col has changed the dimensions that are needed for padding
+    if (a->dimension(0) != b_to_use->dimension(1) && block_by > 1)
+    {
+        // Verify that the mismatch is solely due to the right-hand padding added by im2col
+        const size_t dim0_sz = a->dimension(0);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+            (dim0_sz % block_by) != 0,
+            ("The number of columns of matrix A must be a multiple of block_by=" + std::to_string(block_by)).c_str());
+        // a->dimension(0) = kernel_area * input_channel + kernel_area * input_pad_right
+        // b_to_use->dimension(1) = kernel_area * input_channel
+        // a->dimension(0) = b_to_use->dimension(1) + kernel_area * input_pad_right
+        const size_t input_pad_right = (dim0_sz - b_to_use->dimension(1)) % block_by;
+        // Note: the division below assumes input_pad_right != 0, i.e. that the dimension
+        // difference is not an exact multiple of block_by.
+        const size_t kernel_area = (dim0_sz - b_to_use->dimension(1)) / input_pad_right;
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+            (dim0_sz - kernel_area * input_pad_right) != b_to_use->dimension(1),
+            "The product AB is defined only if the number of columns in A and the number of rows in B are consistent");
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+            a->dimension(0) != b_to_use->dimension(1),
+            "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
+    }
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
+    if (a->data_type() != DataType::BFLOAT16)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, d);
+    }
+
+    if (run_addition)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.depth_output_gemm3d() != 0);
+        ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.reinterpret_input_as_3d());
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(c, d);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->dimension(1),
+                                        "The C matrix must have the same number of rows as the matrix A");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(b_to_use->dimension(0) != c->dimension(0),
+                                        "The C matrix must have the same number of columns as the matrix B");
+    }
+
+    if (d->total_size() != 0)
+    {
+        // For fixed format we are expecting some kind of blocked format for B/RHS so the dimension won't necessarily match the result matrix any more.
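+        // Illustration (an assumed example of a typical blocked layout, not taken from this code):
+        // a fixed format such as WeightFormat::OHWIo8 pads the N dimension of B/RHS up to a
+        // multiple of 8, so with n = 20 the stored B could report dimension(0) == 24 while d keeps
+        // its logical dimension(0) == 20; the check below is therefore skipped for fixed formats.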
+ ARM_COMPUTE_RETURN_ERROR_ON(!gemm_info.fixed_format() && b_to_use->dimension(0) != d->dimension(0)); + if (gemm_info.depth_output_gemm3d() != 0) + { + if (gemm_info.reinterpret_input_as_3d()) + { + ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != d->dimension(1)); + ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != d->dimension(2)); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != d->dimension(1) * d->dimension(2)); + } + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != d->dimension(1)); + } + } + // TODO: COMPMID-6597 fix-end + + // Check if we need to run the optimized assembly kernel + cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info); + + // Note we use b instead of b_to_use here because asm_info also captures the pretranspose_b() flag + // so we pass the original b to CpuGemmAssemblyDispatch + const bool run_optimised = + bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, is_c_bias ? c : nullptr, d, asm_info)) && + (c == nullptr || beta == 0.f || beta == 1.f) && // Optimized GeMM doesn't support beta coefficient. + !(!b->are_values_constant() && + b->tensor_shape().z() > 1); // Disable batch matmul as optimized GeMM handles batching differently. + + if (!run_optimised) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.reinterpret_input_as_3d(), + "CpuGemm cannot reinterpret the input tensor as 3D"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 0, + "CpuGemm cannot reinterpret the output tensor as 3D"); + + // Check if the first input tensor is a vector. + const bool run_vector_matrix_multiplication = a->dimension(1) < 2; + // Check if we need to reshape the matrix A and matrix B + const bool run_interleave_transpose = !run_vector_matrix_multiplication; + + // Arguments used by GEMMReshapeInfo + // If we pass the matrix A and matrix B reshaped to CpuGemmMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to GEMMReshapeInfo + // in order to know how the matrices have been reshaped + const int m = a->dimension(1); + const int n = b_to_use->dimension(0); + const int k = a->dimension(0); + int mult_transpose1xW_width = 1; + int mult_interleave4x4_height = 1; + + const GEMMReshapeInfo reshape_info = GEMMReshapeInfo( + m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, gemm_info.depth_output_gemm3d()); + + const ITensorInfo *matrix_a_info = a; + const ITensorInfo *matrix_b_info = b_to_use; + + TensorInfo tmp_a_info{}; + TensorInfo tmp_b_info{}; + TensorInfo tmp_output_info = *d->clone(); + + if (run_interleave_transpose) + { + matrix_a_info = &tmp_a_info; + matrix_b_info = &tmp_b_info; + + // Validate interleave kernel + auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_interleaved_shape( + *a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d()))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmInterleave4x4Kernel::validate(a, &tmp_a_info)); + + // Validate transpose kernel + auto_init_if_empty(tmp_b_info, + b_to_use->clone()->set_tensor_shape( + compute_transpose1xW_with_element_size_shape(*b_to_use, mult_transpose1xW_width))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmTranspose1xWKernel::validate(b_to_use, &tmp_b_info)); + } + + // Validate matrix multiply + auto_init_if_empty(tmp_output_info, + matrix_a_info->clone()->set_tensor_shape(compute_mm_shape( + *matrix_a_info, *matrix_b_info, run_interleave_transpose, reshape_info))); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmMatrixMultiplyKernel::validate( + matrix_a_info, 
matrix_b_info, &tmp_output_info, alpha, run_interleave_transpose, reshape_info)); + + if (is_c_bias) + { + ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuAdd::validate(&tmp_output_info, c, d, ConvertPolicy::SATURATE)); + } + } + + // Validate matrix addition kernel + if (run_addition) + { + ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmMatrixAdditionKernel::validate(c, d, beta)); + } + + // Validate activation + const ActivationLayerInfo &activation = gemm_info.activation_info(); + if (activation.enabled()) + { + ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuActivation::validate(d, nullptr, activation)); + } + + return Status{}; +} + +void CpuGemm::run(ITensorPack &tensors) +{ + prepare(tensors); + + auto a = tensors.get_const_tensor(ACL_SRC_0); + auto b = tensors.get_const_tensor(ACL_SRC_1); + auto c = tensors.get_const_tensor(ACL_SRC_2); + auto d = tensors.get_tensor(ACL_DST); + + if (_asm_glue && _asm_glue->is_configured()) + { + // Pass c to asm dispatch only if it's the bias tensor + ITensorPack asm_pack = tensors; + asm_pack.add_const_tensor(ACL_SRC_2, _run_bias_addition ? c : nullptr); + _asm_glue->run(asm_pack); + if (_run_alpha_scale) + { + ITensorPack pack{{ACL_SRC, d}, {ACL_DST, d}}; + _alpha_scale_func->run(pack); + } + } + else + { + CpuAuxTensorHandler interleaved_a(offset_int_vec(InterleavedLHS), _tmp_a, tensors, true); + CpuAuxTensorHandler pretransposed_b(offset_int_vec(PreTransposedRHS), _pretransposed_b, tensors); + CpuAuxTensorHandler transposed1xw_b(offset_int_vec(Transposed1xWRHS), _tmp_b, tensors, true); + CpuAuxTensorHandler temp_d(offset_int_vec(TempResult), _tmp_d, tensors, true); + + ITensorPack mm_pack{{ACL_SRC_0, a}, {ACL_SRC_1, b}, {ACL_DST, (_run_bias_addition) ? temp_d.get() : d}}; + + if (_run_interleave_transpose) + { + // Run interleave kernel + ITensorPack interleave_pack{{ACL_SRC, a}, {ACL_DST, interleaved_a.get()}}; + NEScheduler::get().schedule_op(_interleave_kernel.get(), Window::DimY, _interleave_kernel->window(), + interleave_pack); + // Use reshaped matrices + mm_pack.add_const_tensor(ACL_SRC_0, interleaved_a.get()); + } + + const ITensor *b_to_use = b; + if (_pretranspose_b_func) + { + if (!_reshape_b_only_on_first_run) + { + // Run pretranspose kernel + ITensorPack pretranspose_pack{{ACL_SRC, b_to_use}, {ACL_DST, pretransposed_b.get()}}; + _pretranspose_b_func->run(pretranspose_pack); + } + b_to_use = pretransposed_b.get(); + } + if (_run_interleave_transpose) + { + if (!_reshape_b_only_on_first_run) + { + // Run transpose1xw kernel + ITensorPack transpose_pack{{ACL_SRC, b_to_use}, {ACL_DST, transposed1xw_b.get()}}; + NEScheduler::get().schedule_op(_transpose1xW_b_kernel.get(), Window::DimY, + _transpose1xW_b_kernel->window(), transpose_pack); + } + b_to_use = transposed1xw_b.get(); + } + // Use reshaped matrices + mm_pack.add_const_tensor(ACL_SRC_1, b_to_use); + + NEScheduler::get().schedule_op(_mm_kernel.get(), + _run_vector_matrix_multiplication ? 
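+                                           // GEMV case: a has a single row, so the only useful split
+                                           // is across output columns (DimX); the general GEMM case
+                                           // splits work across output rows (DimY).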
Window::DimX : Window::DimY, + _mm_kernel->window(), mm_pack); + + // Run bias addition kernel + if (_run_bias_addition) + { + ITensorPack pack{{ACL_SRC_0, temp_d.get()}, {ACL_SRC_1, c}, {ACL_DST, d}}; + _add_bias->run(pack); + } + } + + // Run matrix addition kernel + if (_run_addition) + { + ITensorPack c_add_pack{{ACL_SRC, c}, {ACL_DST, d}}; + NEScheduler::get().schedule_op(_ma_kernel.get(), Window::DimY, _ma_kernel->window(), c_add_pack); + } + + // Run activation function + if (_run_activation) + { + ITensorPack pack{{ACL_SRC, d}, {ACL_DST, d}}; + _activation_func->run(pack); + } +} + +void CpuGemm::prepare(ITensorPack &tensors) +{ + if (!_is_prepared) + { + if (_asm_glue && _asm_glue->is_configured()) + { + _asm_glue->prepare(tensors); + } + else if (_reshape_b_only_on_first_run) + { + const ITensor *b = tensors.get_const_tensor(ACL_SRC_1); + const ITensor *b_to_use = b; + CpuAuxTensorHandler pretransposed_b( + offset_int_vec(PreTransposedRHS), _pretransposed_b, tensors, + false /*pack_inject: no need to inject into tensors*/, + _pretranspose_b_func == + nullptr /*bypass_alloc: no need to allocate if _pretranspose_b_func is not run*/); + CpuAuxTensorHandler transposed1xw_b(offset_int_vec(Transposed1xWRHS), _tmp_b, tensors, + false /*pack_inject*/, !_run_interleave_transpose /*bypass_alloc*/); + + if (_pretranspose_b_func) + { + // Run pretranspose kernel + ITensorPack pretranspose_pack{{ACL_SRC, b_to_use}, {ACL_DST, pretransposed_b.get()}}; + _pretranspose_b_func->run(pretranspose_pack); + b_to_use = pretransposed_b.get(); + } + if (_run_interleave_transpose) + { + // Run transpose kernel + ITensorPack transpose_pack{{ACL_SRC, b_to_use}, {ACL_DST, transposed1xw_b.get()}}; + NEScheduler::get().schedule_op(_transpose1xW_b_kernel.get(), Window::DimY, + _transpose1xW_b_kernel->window(), transpose_pack); + } + } + _is_prepared = true; + } +} + +experimental::MemoryRequirements CpuGemm::workspace() const +{ + return _aux_mem; +} + +Status CpuGemm::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *d, + const GEMMInfo &gemm_info) +{ + const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info); + + return CpuGemmAssemblyDispatch::has_opt_impl(expected_weight_format, a, b, c, d, asm_info); +} + +bool CpuGemm::isVarWeightsKernel() const +{ + return _asm_glue && _asm_glue->isVarWeightsKernel(); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuGemm.h b/src/cpu/operators/CpuGemm.h new file mode 100644 index 0000000000..a05258d206 --- /dev/null +++ b/src/cpu/operators/CpuGemm.h @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2021-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_OPERATORS_CPUGEMM_H
+#define ACL_SRC_CPU_OPERATORS_CPUGEMM_H
+
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/GEMMInfo.h"
+
+#include "src/cpu/ICpuOperator.h"
+#include "src/cpu/kernels/CpuGemmInterleave4x4Kernel.h"
+#include "src/cpu/kernels/CpuGemmMatrixAdditionKernel.h"
+#include "src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h"
+#include "src/cpu/kernels/CpuGemmTranspose1xWKernel.h"
+#include "src/cpu/operators/CpuActivation.h"
+#include "src/cpu/operators/CpuAdd.h"
+#include "src/cpu/operators/CpuTranspose.h"
+#include "src/cpu/operators/internal/CpuGemmAssemblyDispatch.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to execute GEMM. This function calls the following kernels:
+ *
+ * If optimized assembly is available:
+ *  -# @ref cpu::CpuGemmAssemblyDispatch
+ *  -# @ref cpu::CpuActivation (if alpha != 1.0)
+ * Else:
+ *  -# @ref cpu::kernels::CpuGemmInterleave4x4Kernel (if the output tensor is a matrix)
+ *  -# @ref cpu::kernels::CpuGemmTranspose1xWKernel (if the output tensor is a matrix)
+ *  -# @ref cpu::kernels::CpuGemmMatrixMultiplyKernel
+ * In both cases:
+ *  -# @ref cpu::kernels::CpuGemmMatrixAdditionKernel (if c != nullptr and beta != 0.0 and beta != 1.0)
+ *  -# @ref cpu::CpuAdd (if c != nullptr and beta == 1.0, i.e. c acts as a bias, and the bias addition is not fused into the assembly kernel)
+ *  -# @ref cpu::CpuActivation (if an activation is specified in GEMMInfo and it is not already fused into the kernels above)
+ */
+class CpuGemm : public ICpuOperator
+{
+public:
+    /** Default constructor */
+    CpuGemm() = default;
+    /** Default destructor */
+    ~CpuGemm() = default;
+    /** Configure operator for a given list of arguments
+     *
+     * Valid data layouts:
+     * - All
+     *
+     * Valid data type configurations:
+     * |src0         |src1        |src2      |dst            |
+     * |:------------|:-----------|:---------|:--------------|
+     * |F32          |F32         |F32       |F32            |
+     * |F16          |F16         |F16       |F16            |
+     * |BFLOAT16     |BFLOAT16    |BFLOAT16  |F32            |
+     *
+     * @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C].
+     * @note GEMM: The tensors a, b, c, d must have the same data type; do not mix data types when calling this function
+     *             (the one exception is the BFLOAT16 configuration above, whose output d is F32).
+     *
+     * @note Batched GEMM only supports broadcasting cases where RHS rank < LHS rank but not the other way around
+     *
+     * @param[in]  a         First input tensor info (Matrix A or Vector A). Data type supported: BFLOAT16/F16/F32
+     * @param[in]  b         Second input tensor info (Matrix B). Data type supported: same as @p a
+     * @param[in]  c         Third input tensor info (Matrix C). It can be a nullptr if just the multiplication between @p a and @p b is needed. Data type supported: same as @p a
+     * @param[out] d         Output tensor info.
+     *                       Data type supported: same as @p a
+     * @param[in]  alpha     Weight of the matrix product
+     * @param[in]  beta      Weight of matrix C
+     * @param[in]  gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
+     *                       if the reshape of matrix B should happen only for the first run
+     */
+    void configure(const ITensorInfo *a,
+                   const ITensorInfo *b,
+                   const ITensorInfo *c,
+                   ITensorInfo       *d,
+                   float              alpha,
+                   float              beta,
+                   const GEMMInfo    &gemm_info = GEMMInfo());
+    /** Static function to check if given info will lead to a valid configuration of @ref CpuGemm.
+     *
+     * Similar to @ref CpuGemm::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *a,
+                           const ITensorInfo *b,
+                           const ITensorInfo *c,
+                           const ITensorInfo *d,
+                           float              alpha,
+                           float              beta,
+                           const GEMMInfo    &gemm_info = GEMMInfo());
+
+    /** Indicates whether or not there is an optimal assembly implementation that can be used to process the given parameters.
+     *
+     * This method has the same use as @ref
+     * NEGEMMConvolutionLayer::has_opt_impl, with the only caveat that
+     * the value of arm_compute::WeightFormat needs to be passed via the
+     * parameter gemm_info.
+     */
+    static Status has_opt_impl(arm_compute::WeightFormat &weight_format,
+                               const ITensorInfo         *a,
+                               const ITensorInfo         *b,
+                               const ITensorInfo         *c,
+                               const ITensorInfo         *d,
+                               const GEMMInfo            &gemm_info = GEMMInfo());
+
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
+    void prepare(ITensorPack &constants) override;
+    experimental::MemoryRequirements workspace() const override;
+
+    /** Indicates if the convolution executes in variable weights mode.
+     *
+     * When ACL executes convolution in variable weights mode, it does
+     * not perform any processing of the weights tensor. Instead, it
+     * utilizes the data as it is given by the user.
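+     *
+     * A minimal sketch of how a caller might opt in to this mode (illustrative only; it assumes
+     * the weight format is negotiated through @ref has_opt_impl and that the tensor infos
+     * a, b, c and d have been set up elsewhere):
+     * @code
+     * arm_compute::WeightFormat wf = arm_compute::WeightFormat::ANY;
+     * GEMMInfo gemm_info{};
+     * gemm_info.set_fixed_format(true);
+     * gemm_info.set_weight_format(wf);
+     * if (bool(CpuGemm::has_opt_impl(wf, a, b, c, d, gemm_info)))
+     * {
+     *     gemm_info.set_weight_format(wf); // adopt the format reported by the backend
+     *     CpuGemm gemm;
+     *     gemm.configure(a, b, c, d, 1.0f, 0.0f, gemm_info);
+     * }
+     * @endcode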
+ */ + bool isVarWeightsKernel() const; + +private: + enum AuxTensorIdx + { + /* Slots 0 - 2 reserved for CpuGemmAssemblyDispatch */ + InterleavedLHS = 3, + PreTransposedRHS, + Transposed1xWRHS, + TempResult, + Count + }; + + std::unique_ptr<kernels::CpuGemmInterleave4x4Kernel> _interleave_kernel{nullptr}; + std::unique_ptr<CpuTranspose> _pretranspose_b_func{nullptr}; + std::unique_ptr<kernels::CpuGemmTranspose1xWKernel> _transpose1xW_b_kernel{nullptr}; + std::unique_ptr<kernels::CpuGemmMatrixMultiplyKernel> _mm_kernel{nullptr}; + std::unique_ptr<CpuGemmAssemblyDispatch> _asm_glue{nullptr}; + std::unique_ptr<kernels::CpuGemmMatrixAdditionKernel> _ma_kernel{nullptr}; + std::unique_ptr<CpuActivation> _alpha_scale_func{nullptr}; + std::unique_ptr<CpuAdd> _add_bias{nullptr}; + std::unique_ptr<CpuActivation> _activation_func{nullptr}; + + TensorInfo _tmp_a{}; + TensorInfo _pretransposed_b{}; + TensorInfo _tmp_b{}; + TensorInfo _tmp_d{}; + + bool _run_vector_matrix_multiplication{false}; + bool _run_interleave_transpose{ + true}; /**< If we run CpuGemmInterleave4x4Kernel on lhs and CpuGemmTranspose1xWKernel on rhs */ + bool _run_alpha_scale{false}; + bool _run_addition{false}; + bool _run_bias_addition{false}; + bool _run_activation{false}; + bool _reshape_b_only_on_first_run{false}; + bool _is_prepared{false}; + + experimental::MemoryRequirements _aux_mem{Count}; +}; +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_OPERATORS_CPUGEMM_H diff --git a/src/cpu/operators/CpuGemmConv2d.cpp b/src/cpu/operators/CpuGemmConv2d.cpp new file mode 100644 index 0000000000..55d950ff4a --- /dev/null +++ b/src/cpu/operators/CpuGemmConv2d.cpp @@ -0,0 +1,992 @@ +/* + * Copyright (c) 2021-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/operators/CpuGemmConv2d.h" + +#include "arm_compute/core/Size2D.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/core/helpers/Utils.h" +#include "src/cpu/kernels/CpuCol2ImKernel.h" +#include "src/cpu/kernels/CpuIm2ColKernel.h" +#include "src/cpu/kernels/CpuWeightsReshapeKernel.h" +#include "src/cpu/operators/CpuGemm.h" +#include "src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h" +#include "src/cpu/operators/CpuGemmLowpOutputStage.h" +#include "src/cpu/operators/CpuReshape.h" +#include "src/cpu/utils/CpuAuxTensorHandler.h" + +#include <set> +#include <tuple> + +using namespace arm_compute::misc::shape_calculator; +using namespace arm_compute::experimental; + +namespace arm_compute +{ +namespace cpu +{ + +/** @section note_CpuGemmConv2d_weight_transformation Weight Transformations in CpuGemmConv2d + * + * A. Terminology + * Throughout CpuGemmConv2d, we use the following terms in ways that may differ from other operators / kernels: + * - "Transform" or "Reshape" of the weights: they both mean all the operations that we perform on the weight + * tensor up until they are consumed by gemm (CpuGemm or CpuGemmLowpMatrixMultiplyCore) + * Note that the specific gemm operator may perform further transformations on the weights, but the + * transformations here only mean those performed in CpuGemmConv2d + * - "Transpose" of weights: The @ref CpuTranspose operation. I.e. transpose of the weights' lowest two + * dimensions + * + * B. Gemm-based conv2d + * We want to convert the 2d convolution op (ignoring bias): + * dst = conv2d(src, weight) + * into a matrix multiplication op: + * gemm_dst = gemm(lhs, rhs) + * + * E.g.: For data layout NHWC + * 3 (hi) <----------> (lo) 0 + * src.shape = [batch, in_h , in_w, in_c] + * weight.shape = [out_c, k_h , k_w, in_c] + * dst.shape = [batch, out_h, out_w, out_c] + * + * This requires three transformations: + * * src -> lhs, transform conv input to gemm lhs; gemm_lhs is a 2d matrix where each row (or column, + * depending on the convention) is a linearized "patch" of the conv_input that corresponds to + * the receptive field of the corresponding output element. + * The convention is to use "column", but to disambiguate from the column vector of a matrix, + * in this documentation we shall use "patch". + * This transform is called im2col (for details see @ref CpuIm2ColKernel) + * * weight -> rhs, transform conv weight to gemm rhs, known as weight transform/reshape (wt) + * * gemm_dst -> dst, transform gemm output back to conv output, known as col2im (for details see + * @ref CpuCol2ImKernel) + * + * This section focuses on the weight transformation and assumes the im2col is already performed + * + * C. Weight Transformation + * After im2col, assume: lhs.shape = [num_patch, patch_size], + * where patch_size is the number of elements in a "patch": patch_size = k_h * k_w * in_c + * num_patch is the number of patches; we can ignore it here (for details see @ref CpuIm2ColKernel) + * + * After wt, rhs should have the shape: rhs = [patch_size, out_c] + * + * Therefore, the weight transformation consists of two steps: + * 1. 
Collapsing all 3 spatial dimensions: [out_c, k_h, k_w, in_c] -> [out_c, patch_size] + * 2. Transpose the collapsed shape: [out_c, patch_size] -> [patch_size, out_c] + * + * D. Implementation + * There are 4 paths for weight transformation + * + * 1. Path 1: Fixed weight format - no transformation + * The underlying gemm kernel may adopt fixed weight format (isVarWeightsKernel() == true), which requires + * that no weight transformation shall be performed + * Note that this no-transform requirement applies both to this op (CpuGemmConv2d) and the constituent ops, up + * until the fixed format kernels themselves + * + * 2. Path 2: Reinterpret then transpose later + * If the weight tensor has no "holes" (see @ref has_holes), there are two optimizations we can apply: + * - We can ignore the first step (collapsing of spatial dimensions) by simply re-interpreting the shape + * in TensorInfo + * - Instead of performing transpose here, we can pass the transpose flag to the underlying gemm. The gemm + * may then decide to fuse the transpose with any further transformations + * + * 3. Path 3: Reshape then transpose later + * If the weight tensor has holes, then we use a dedicated @ref CpuReshape, followed by transpose later + * + * 4. Path 4: Fused reshape and transpose + * This is only for quantized types for now (TODO: Remove (COMPMID-6596)). We fall back to a legacy + * non-optimized kernel @ref CpuWeightsReshapeKernel to perform a fused reshape + transpose + * + * Path 1 is the long term solution that we shall migrate to once (if) we adopt fixed weight format for all gemm + * kernels. + * In the short term, Path 2 is the favored, more performant path. + */ + +namespace +{ +/** Initialize reshaped / transformed weight info + * + * @param[in] weights Input weights + * @param[out] reshaped_weights Transformed weights + */ +void initialize_reshaped_weight_info(const ITensorInfo &weights, ITensorInfo &reshaped_weights) +{ + auto_init_if_empty(reshaped_weights, weights); + if (is_data_type_quantized(weights.data_type())) + { + // WT method: FusedReshapeAndTranspose + reshaped_weights.set_tensor_shape(compute_weights_reshaped_shape(weights, /* has_bias */ false)); + } + else + { + TensorShape collapsed_weights = weights.tensor_shape(); + collapsed_weights.collapse(3); + reshaped_weights.set_tensor_shape(collapsed_weights); + } +} +} // namespace + +CpuGemmConv2d::WeightTransformMethod CpuGemmConv2d::get_wt_method(const ITensorInfo &weights) +{ + // TODO: Extend ReinterpretThenTranspose support for quantized data types COMPMID-6596 + if (is_data_type_quantized(weights.data_type())) + { + return WeightTransformMethod::FusedReshapeAndTranspose; + } + return has_holes(weights) ? 
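+    // Weights with holes (padding between elements) cannot be collapsed by simply reinterpreting
+    // the TensorInfo shape, so they take the copy path via CpuReshape (Path 3 above); hole-free
+    // weights can be reinterpreted in place (Path 2).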
WeightTransformMethod::ReshapeThenTranspose + : WeightTransformMethod::ReinterpretThenTranspose; +} + +CpuGemmConv2d::SkipInfo CpuGemmConv2d::skip_im_col_info(const ITensorInfo *src, + const ITensorInfo *weights, + const PadStrideInfo &conv_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info) +{ + const DataLayout data_layout = src->data_layout(); + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const unsigned int kernel_width = weights->dimension(idx_width); + const unsigned int kernel_height = weights->dimension(idx_height); + unsigned int conv_w = 0; + unsigned int conv_h = 0; + std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width, + kernel_height, conv_info, dilation); + const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && + conv_info.stride().first == 1 && conv_info.stride().second == 1); + + if (skip_im2col) + { + const bool skip_col2im = + (data_layout == DataLayout::NHWC && + (bool(CpuGemmConv2d::validate_gemm3d(src, weights, act_info, conv_h, /*skip_im2col*/ true)))); + if (skip_col2im) + { + return {true, true}; + } + } + else + { + const bool skip_col2im = + (data_layout == DataLayout::NHWC && + (bool(CpuGemmConv2d::validate_gemm3d(src, weights, act_info, conv_h, /*skip_im2col*/ false)))); + if (skip_col2im) + { + return {false, true}; + } + } + + // Default case when we cannot reinterpret the input and output as 3D. + return {false, false}; +} + +CpuGemmConv2d::CpuGemmConv2d() + : _weights_reshape(nullptr), + _weights_reshape_and_transpose_kernel(nullptr), + _im2col_kernel(), + _mm_gemm(), + _mm_gemmlowp(), + _col2im_kernel(), + _reshape(), + _im2col_output(), + _weights_reshaped(), + _gemm_output(), + _gemm_output_3d(), + _data_layout(DataLayout::NCHW), + _skip_im2col(false), + _skip_col2im(false), + _is_quantized(false), + _is_prepared(false), + _wt_method(WeightTransformMethod::ReshapeThenTranspose), + _run_wt(true), + _aux_mem(AuxTensorIdx::Count) +{ +} +CpuGemmConv2d::~CpuGemmConv2d() = default; + +void CpuGemmConv2d::configure_mm(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + int gemm_3d_depth, + bool fixed_format, + arm_compute::WeightFormat weight_format) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights); + ARM_COMPUTE_ERROR_THROW_ON(validate_mm(src, weights, biases, dst, act_info, enable_fast_math, gemm_3d_depth, + _skip_im2col, fixed_format, weight_format)); + + // Supported activations in GEMM + const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { + ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, + ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU}; + + if (_is_quantized) + { + TensorInfo tmp_src{*src}; + TensorInfo tmp_weights{*weights}; + // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() + // Extract and negate input and weights offset + const QuantizationInfo iqinfo = src->quantization_info(); + const QuantizationInfo wqinfo = weights->quantization_info(); + const QuantizationInfo oqinfo = (dst->total_size() == 0) ? 
iqinfo : dst->quantization_info(); + const UniformQuantizationInfo uiqinfo = iqinfo.uniform(); + const UniformQuantizationInfo uoqinfo = oqinfo.uniform(); + const DataType data_type = src->data_type(); + + tmp_src.set_quantization_info(QuantizationInfo(uiqinfo.scale, -uiqinfo.offset)); + if (!is_data_type_quantized_per_channel(tmp_weights.data_type())) + { + const UniformQuantizationInfo uwqinfo = wqinfo.uniform(); + tmp_weights.set_quantization_info(QuantizationInfo(uwqinfo.scale, -uwqinfo.offset)); + } + + // Merge activation with output stage + PixelValue type_min{}; + PixelValue type_max{}; + std::tie(type_min, type_max) = get_min_max(data_type); + int32_t min_activation = type_min.get<int32_t>(); + int32_t max_activation = type_max.get<int32_t>(); + + if (supported_acts.count(act_info.activation()) != 0) + { + std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, uoqinfo); + } + + GEMMLowpOutputStageInfo output_info; + output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; + output_info.gemmlowp_offset = uoqinfo.offset; + output_info.gemmlowp_min_bound = min_activation; + output_info.gemmlowp_max_bound = max_activation; + output_info.is_quantized_per_channel = (tmp_weights.data_type() == DataType::QSYMM8_PER_CHANNEL); + quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, output_info); + + _mm_gemmlowp = std::make_unique<CpuGemmLowpMatrixMultiplyCore>(); + _mm_gemmlowp->configure(&tmp_src, &tmp_weights, biases, dst, + GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info, false, + enable_fast_math, false, act_info, fixed_format, weight_format, + false /* pretranspose_B. TODO: COMPMID-6596 */)); + + auto mm_mem_req = _mm_gemmlowp->workspace(); + for (unsigned int cont = 0; cont < mm_mem_req.size(); ++cont) + { + _aux_mem[cont] = mm_mem_req[cont]; + } + } + else + { + // Create GEMMInfo structure + const GEMMInfo &gemm_info = + GEMMInfo(false, false, true /* Reshape weights only for the first run */, gemm_3d_depth, + _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, false, + GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, fixed_format, weight_format, + true /*pretranspose_B. For fp gemm (wt path 1 - 3), We always pretranspose B (for wt path 1 this + flag is ignored)*/); + // Configure matrix multiply function + _mm_gemm = std::make_unique<CpuGemm>(); + _mm_gemm->configure(src, weights, biases, dst, 1.0f, 1.0f, gemm_info); + auto mm_mem_req = _mm_gemm->workspace(); + for (unsigned int cont = 0; cont < mm_mem_req.size(); ++cont) + { + _aux_mem[cont] = mm_mem_req[cont]; + } + } +} + +Status CpuGemmConv2d::validate_mm(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + int gemm_3d_depth, + bool skip_im2col, + bool fixed_format, + arm_compute::WeightFormat weight_format) +{ + const DataType data_type = src->data_type(); + const bool is_quantized = is_data_type_quantized_asymmetric(data_type); + const bool is_activation_enabled = act_info.enabled(); + + if (is_quantized) + { + // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() + // Extract and negate input and weights offset + const QuantizationInfo &iqinfo = src->quantization_info(); + const QuantizationInfo &wqinfo = weights->quantization_info(); + const QuantizationInfo &oqinfo = (dst->total_size() == 0) ? 
iqinfo : dst->quantization_info(); + const UniformQuantizationInfo uoqinfo = oqinfo.uniform(); + + // Merge activation with output stage + PixelValue type_min{}; + PixelValue type_max{}; + std::tie(type_min, type_max) = get_min_max(data_type); + int32_t min_activation = type_min.get<int32_t>(); + int32_t max_activation = type_max.get<int32_t>(); + + const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { + ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, + ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU}; + if (is_activation_enabled && supported_acts.count(act_info.activation()) != 0) + { + std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, uoqinfo); + } + + GEMMLowpOutputStageInfo output_info; + output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; + output_info.gemmlowp_offset = uoqinfo.offset; + output_info.gemmlowp_min_bound = min_activation; + output_info.gemmlowp_max_bound = max_activation; + output_info.is_quantized_per_channel = (weights->data_type() == DataType::QSYMM8_PER_CHANNEL); + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, output_info)); + + // Perform validation step on GEMMLowp + std::unique_ptr<ITensorInfo> input_qa = src->clone(); + std::unique_ptr<ITensorInfo> weights_qa = weights->clone(); + input_qa->set_quantization_info(QuantizationInfo(iqinfo.uniform().scale, -iqinfo.uniform().offset)); + weights_qa->set_quantization_info(QuantizationInfo(wqinfo.uniform().scale, -wqinfo.uniform().offset)); + + return CpuGemmLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, dst, + GEMMInfo(false, false, true, gemm_3d_depth, skip_im2col, false, + output_info, false, enable_fast_math, false, act_info, + false /* pretranspose_B. TODO: COMPMID-6596 */)); + } + else + { + // Create GEMMInfo structure + const GEMMInfo gemm_info = + GEMMInfo(false, false, true /* Reshape weights only for the first run */, gemm_3d_depth, + skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, false, + GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, fixed_format, weight_format, + true /*pretranspose_B. For fp gemm (wt path 1 - 3), We always pretranspose B (for wt path 1 this + flag is ignored)*/); + + // Perform validation step on Matrix multiply function + return CpuGemm::validate(src, weights, biases, dst, 1.0f, 1.0f, gemm_info); + } +} + +Status CpuGemmConv2d::validate_gemm3d(const ITensorInfo *input_info, + const ITensorInfo *weights_info, + const ActivationLayerInfo &act_info, + int gemm_3d_depth, + bool skip_im2col) +{ + const DataType data_type = input_info->data_type(); + const unsigned int mult_y = skip_im2col ? 1U : gemm_3d_depth; + const unsigned int mult_z = skip_im2col ? 
gemm_3d_depth : 1U; + + // Set dummy tensor shapes for the validation + const TensorInfo dummy_input_info(TensorShape(4U, 4U * mult_y, 1U * mult_z), 1, data_type, + input_info->quantization_info()); + const TensorInfo dummy_weights_info(TensorShape(4U, 4U), 1, data_type, weights_info->quantization_info()); + const TensorInfo dummy_output_info(TensorShape(4U, 4U, gemm_3d_depth), 1, data_type, + input_info->quantization_info()); + + return validate_mm(&dummy_input_info, &dummy_weights_info, nullptr, &dummy_output_info, act_info, false, + gemm_3d_depth, skip_im2col); +} + +void CpuGemmConv2d::configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); + ARM_COMPUTE_UNUSED(num_groups, weights_info); + ARM_COMPUTE_ERROR_THROW_ON(CpuGemmConv2d::validate(src, weights, biases, dst, conv_info, weights_info, dilation, + act_info, enable_fast_math, num_groups)); + ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, weights_info, dilation, act_info, enable_fast_math, + num_groups); + + const DataType data_type = src->data_type(); + const DataLayout data_layout = src->data_layout(); + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + const int idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES); + + const unsigned int kernel_width = weights->dimension(idx_width); + const unsigned int kernel_height = weights->dimension(idx_height); + + _is_prepared = weights_info.retain_internal_weights(); + _is_quantized = is_data_type_quantized_asymmetric(src->data_type()); + _data_layout = data_layout; + _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && + conv_info.stride().first == 1 && conv_info.stride().second == 1); + + const ITensorInfo *gemm_input_to_use = src; + ITensorInfo *gemm_output_to_use = dst; + + // Get convolved dimensions + unsigned int conv_w = 0; + unsigned int conv_h = 0; + std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width, + kernel_height, conv_info, dilation); + + ARM_COMPUTE_ERROR_ON_MSG((dst->dimension(idx_width) != conv_w) || (dst->dimension(idx_height) != conv_h), + "Output shape does not match the expected one"); + + // Check if GEMM3D is supported + const CpuGemmConv2d::SkipInfo skip_info = + CpuGemmConv2d::skip_im_col_info(src, weights, conv_info, dilation, act_info); + _skip_im2col = skip_info.skip_im2col; + _skip_col2im = skip_info.skip_col2im; + + // Get parameters from conv_info + unsigned int stride_x = 0; + unsigned int stride_y = 0; + std::tie(stride_x, stride_y) = conv_info.stride(); + + // Initialize reshaped weights + initialize_reshaped_weight_info(*weights, _weights_reshaped); + + // Create tensor to store im2col reshaped inputs + if (!_skip_im2col) + { + const int block_by = arm_compute::block_by(weights_info.weight_format()); + unsigned int input_pad_right = 0; + if (block_by > 1) + { + input_pad_right = + (src->dimension(idx_channel) % block_by) == 0 ? 
0 : block_by - (src->dimension(idx_channel) % block_by); + } + // Configure + _im2col_kernel = std::make_unique<kernels::CpuIm2ColKernel>(); + _im2col_kernel->configure(src, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, false, dilation, + num_groups, input_pad_right); + + // Update GEMM input + gemm_input_to_use = &_im2col_output; + } + + const unsigned int mat_weights_cols = weights->dimension(idx_kernels); + + // Create temporary GEMM output tensor in case we cannot skip col2im + const DataType output_data_type = data_type == DataType::BFLOAT16 ? DataType::F32 : data_type; + if (!_skip_col2im) + { + TensorShape shape_gemm; + + // Calculate GEMM output shape + shape_gemm = _im2col_output.tensor_shape(); + shape_gemm.set(0, mat_weights_cols); + shape_gemm.set(1, conv_w * conv_h); + + _gemm_output = TensorInfo(shape_gemm, 1, output_data_type); + _gemm_output.set_quantization_info(dst->quantization_info()).set_data_layout(src->data_layout()); + _gemm_output_3d = TensorInfo(_gemm_output); + + // Update GEMM output + gemm_output_to_use = &_gemm_output; + } + else + { + _gemm_output_3d = TensorInfo(*dst); + _gemm_output_3d.set_data_type(output_data_type).set_data_layout(src->data_layout()).set_is_resizable(true); + _gemm_output = TensorInfo(_gemm_output_3d); + + // Update GEMM output + gemm_output_to_use = &_gemm_output_3d; + } + + // Configure GEMM + // In case we need to skip col2im, GEMM3D (gemm_3d_depth != 0) must be called in order to avoid reshaping the output matrix + const unsigned int gemm_3d_depth = _skip_col2im ? conv_h : 0; + const bool fixed_format = weights_info.weight_format() != arm_compute::WeightFormat::UNSPECIFIED; + /** @section note_CpuGemmConv2d_weight_use_in_configure Which weights tensor should we use to configure gemm + * + * A. The problem: + * In principle, we should use the weights tensor corresponding to the weights transformation path. I.e.: + * - If no weight transformation (_run_wt == false): Use original weights + * - else: Use transformed weights + * However in practice we have a dilemma: + * - We need to know _run_wt before we can configure gemm with the corresponding weights, but + * - _run_wt depends on isVarWeightsKernel(), which is only known after gemm is configured + * + * B. The decision: + * To simplify the matter, we decide to always use the transformed weights, regardless of _run_wt + * + * This decision requires the following conditions: + * 1. The underlying gemm where isVarWeightsKernel() == true, must guarantee that: + * A. Ignore the flag to transpose weights (GEMMInfo::pretranspose_B) + * B. Use weights/B tensor passed to it at prepare() or run() instead of that passed at configure() + * 2. CpuGemmConv2d where isVarWeightsKernel() == true, must guarantee that: + * A. Pass original weights instead of reshaped or reinterpreted weights + * + * C. Future actions: + * Condition 2 is a given, based on our implementation. + * If condition 1 cannot hold, we must make changes to the underlying gemm to: + * 1. Either expose isVarWeightsKernel() before gemm is configured somehow, or + * 2. 
Take in an additional "original_weights" tensor info at configure + */ + configure_mm(gemm_input_to_use, &_weights_reshaped, biases, gemm_output_to_use, act_info, enable_fast_math, + gemm_3d_depth, fixed_format, weights_info.weight_format()); + + // Can only decide isVarWeightsKernel after gemm is configured + _run_wt = !isVarWeightsKernel(); + + if (!_skip_col2im && _data_layout == DataLayout::NCHW) + { + // Configure col2im + _col2im_kernel = std::make_unique<kernels::CpuCol2ImKernel>(); + _col2im_kernel->configure(gemm_output_to_use, dst, Size2D(conv_w, conv_h)); + } + else + { + // Configure reshape layer + _reshape = std::make_unique<CpuReshape>(); + _reshape->configure(gemm_output_to_use, dst); + } + + // Check lifetime + _aux_mem[Im2ColOutput] = + MemoryInfo(offset_int_vec(Im2ColOutput), MemoryLifetime::Temporary, _im2col_output.total_size()); + // Add WeightsReshaped memory requirement to workspace + // Note that in case of WeightTransformMethod::ReinterpretThenTranspose, we do not need to allocate this memory + // However since we cannot determine weight transformation method until prepare (see prepare()), we will have to + // settle with allocating more + if (_run_wt) + { + // Check if GEMM transforms weights + // If weight is further transformed by underlying gemm after ReshapeThenTranspose then we can free + // WeightsReshaped in prepare + // Otherwise WeightsReshaped is the final transformation of weights and needs to persist + bool gemm_trans_wei = _aux_mem[GemmAsmPretransposedRHS].size > 0; + gemm_trans_wei = _mm_gemm != nullptr ? _aux_mem[GemmTransposed1xWRHS].size > 0 : gemm_trans_wei; + gemm_trans_wei = _mm_gemmlowp != nullptr ? _aux_mem[GemmLowpTransposed1xWRHS].size > 0 : gemm_trans_wei; + + _aux_mem[WeightsReshaped] = MemoryInfo(offset_int_vec(WeightsReshaped), + gemm_trans_wei ? MemoryLifetime::Prepare : MemoryLifetime::Persistent, + _weights_reshaped.total_size()); + } + _aux_mem[GemmOutput] = MemoryInfo(offset_int_vec(GemmOutput), MemoryLifetime::Temporary, _gemm_output.total_size()); +} + +Status CpuGemmConv2d::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + const bool enable_fast_math) +{ + const DataLayout data_layout = src->data_layout(); + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const unsigned int kernel_width = weights->dimension(idx_width); + const unsigned int kernel_height = weights->dimension(idx_height); + unsigned int conv_w = 0; + unsigned int conv_h = 0; + std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width, + kernel_height, conv_info, dilation); + + const CpuGemmConv2d::SkipInfo skip_info = + CpuGemmConv2d::skip_im_col_info(src, weights, conv_info, dilation, act_info); + + const bool skip_im2col = skip_info.skip_im2col; + const bool skip_col2im = skip_info.skip_col2im; + const unsigned int gemm_3d_depth = skip_col2im ? 
conv_h : 0; + const bool fixed_format = weights_info.weight_format() != arm_compute::WeightFormat::UNSPECIFIED; + + /** @section note_CpuGemmConv2d_weight_use_in_has_opt_impl Which weights tensor should we use for has_opt_impl + * + * For the pretranspose_B flag, this shares a similar problem and thus the same decision as that of + * @ref note_CpuGemmConv2d_weight_use_in_configure + * + * But for the weights, we shall always use the original instead of reshaped weights here + */ + const GEMMInfo gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */, gemm_3d_depth, + skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, false, + GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, + fixed_format, weights_info.weight_format(), true /* pretranspose_B */); + + return CpuGemm::has_opt_impl(expected_weight_format, src, weights, biases, dst, gemm_info); +} + +Status CpuGemmConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const WeightsInfo &weights_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + unsigned int num_groups) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!"); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::BFLOAT16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16, + DataType::F16, DataType::F32); + + if (!is_fixed_format(weights_info.weight_format())) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights); + } + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups > 1, "Grouping (num_groups != 1) is not supported"); + + const DataLayout data_layout = src->data_layout(); + const DataType data_type = src->data_type(); + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + const int idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES); + + const unsigned int kernel_width = weights->dimension(idx_width); + const unsigned int kernel_height = weights->dimension(idx_height); + + TensorInfo im2col_reshaped_info{}; + TensorInfo info_gemm{}; + TensorInfo tmp_info{}; + TensorInfo weights_reshaped_info{}; + const ITensorInfo *gemm_input_to_use = src; + const ITensorInfo *gemm_output_to_use = dst; + const ITensorInfo *weights_to_use = weights; + + const bool append_bias = false; + const bool is_quantized = is_data_type_quantized_asymmetric(data_type); + const bool is_bf16 = data_type == DataType::BFLOAT16; + + // Get convolved dimensions + unsigned int conv_w = 0; + unsigned int conv_h = 0; + + std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width, + kernel_height, conv_info, dilation); + + // Check if GEMM3D is supported + const CpuGemmConv2d::SkipInfo skip_info = + CpuGemmConv2d::skip_im_col_info(src, weights, conv_info, dilation, act_info); + const bool skip_im2col = skip_info.skip_im2col, 
skip_col2im = skip_info.skip_col2im; + + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_channel) != src->dimension(idx_channel)); + ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); + + // Validate biases + if (biases != nullptr) + { + if (is_quantized) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); + } + else if (is_bf16) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases); + } + ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != dst->dimension(idx_channel)); + ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); + } + + unsigned int mat_weights_cols = weights->dimension(idx_kernels); + unsigned int mat_weights_rows = + weights->dimension(idx_width) * weights->dimension(idx_height) * weights->dimension(idx_channel); + + // Initialize reshaped weights + initialize_reshaped_weight_info(*weights, weights_reshaped_info); + // No need to call CpuReshape::validate() or CpuTranspose::validate() as the dst info is auto-configured from the + // src + weights_to_use = &weights_reshaped_info; + + if (!skip_im2col) + { + const int block_by = arm_compute::block_by(weights_info.weight_format()); + int input_pad_right = 0; + if (block_by > 1) + { + input_pad_right = + (src->dimension(idx_channel) % block_by) == 0 ? 0 : block_by - (src->dimension(idx_channel) % block_by); + mat_weights_rows = weights->dimension(idx_width) * weights->dimension(idx_height) * + (weights->dimension(idx_channel) + input_pad_right); + } + + // Create tensor info for im2col reshaped inputs + // For CPU, the batch size is on the fourth dimension + TensorShape shape_im2col = src->tensor_shape(); + shape_im2col.set(0, mat_weights_rows); + shape_im2col.set(1, conv_w * conv_h); + shape_im2col.set(2, 1); + + im2col_reshaped_info = TensorInfo(shape_im2col, 1, data_type); + im2col_reshaped_info.set_quantization_info(src->quantization_info()); + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::CpuIm2ColKernel::validate(src, &im2col_reshaped_info, Size2D(kernel_width, kernel_height), + conv_info, append_bias, dilation, num_groups, input_pad_right)); + gemm_input_to_use = &im2col_reshaped_info; + } + + // Create temporary GEMM output tensor in case we cannot skip col2im + const DataType output_data_type = data_type == DataType::BFLOAT16 ? DataType::F32 : data_type; + if (!skip_col2im) + { + TensorShape shape_gemm = gemm_input_to_use->tensor_shape(); + shape_gemm.set(0, mat_weights_cols); + shape_gemm.set(1, conv_w * conv_h); + info_gemm = TensorInfo(shape_gemm, 1, output_data_type); + } + else + { + info_gemm = TensorInfo(dst->tensor_shape(), 1, output_data_type); + } + info_gemm.set_quantization_info(dst->quantization_info()).set_data_layout(src->data_layout()); + gemm_output_to_use = &info_gemm; + const bool fixed_format = weights_info.weight_format() != arm_compute::WeightFormat::UNSPECIFIED; + + // See note_CpuGemmConv2d_weight_use_in_configure regarding the choice of the weights + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, act_info, + enable_fast_math, skip_col2im ? 
conv_h : 0, skip_im2col, fixed_format, + weights_info.weight_format())); + + // Validate Col2Im/ReshapeLayer + if (!skip_col2im && (data_layout == DataLayout::NCHW)) + { + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::CpuCol2ImKernel::validate(gemm_output_to_use, dst, Size2D(conv_w, conv_h))); + } + + return Status{}; +} + +void CpuGemmConv2d::run(ITensorPack &tensors) +{ + prepare(tensors); + + auto src = tensors.get_const_tensor(ACL_SRC_0); + auto dst = tensors.get_tensor(ACL_DST); + auto gemm_input_to_use = src; + + CpuAuxTensorHandler im2col_output(offset_int_vec(Im2ColOutput), _im2col_output, tensors, false); + CpuAuxTensorHandler gemm_output(offset_int_vec(GemmOutput), _gemm_output, tensors, false); + + bool out_has_padding = _skip_col2im && (dst->info()->padding().bottom != 0 || dst->info()->padding().top != 0); + if (!_skip_im2col) + { + // Run input reshaping + unsigned int hint_dim = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); + unsigned int x_dim = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); + unsigned int hint_dim_iterations = _im2col_kernel->window().num_iterations(hint_dim); + unsigned int x_dim_iterations = _im2col_kernel->window().num_iterations(x_dim); + if (hint_dim_iterations < NEScheduler::get().num_threads() && x_dim_iterations > hint_dim_iterations) + { + hint_dim = x_dim; + } + ITensorPack pack = {{TensorType::ACL_SRC, src}, {TensorType::ACL_DST, im2col_output.get()}}; + NEScheduler::get().schedule_op(_im2col_kernel.get(), hint_dim, _im2col_kernel->window(), pack); + gemm_input_to_use = im2col_output.get(); + } + + // Handle the case where output has top/bottom padding + const ITensor *out_to_use = out_has_padding ? gemm_output.get() : dst; + Tensor gemm3d; + _gemm_output_3d.extend_padding(out_to_use->info()->padding()); + gemm3d.allocator()->soft_init(_gemm_output_3d); + gemm3d.allocator()->import_memory(out_to_use->buffer()); + auto gemm_output_to_use = gemm_output.get(); + + if (_skip_im2col) + { + gemm_output_to_use = &gemm3d; + } + if (_skip_col2im && !out_has_padding) + { + gemm_output_to_use = dst; + } + + ITensorPack gemm_pack = tensors; + gemm_pack.add_const_tensor(TensorType::ACL_SRC_0, gemm_input_to_use); + gemm_pack.add_tensor(TensorType::ACL_DST, gemm_output_to_use); + // Allocate reshaped weights if required + auto weights = gemm_pack.get_const_tensor(TensorType::ACL_SRC_1); + ARM_COMPUTE_ERROR_ON_NULLPTR(weights); + // Re-interpreted weights. Only tensor shape is changed. 
Only memory import, no allocation + const bool use_reinterpreted_wei = (_run_wt && _wt_method == WeightTransformMethod::ReinterpretThenTranspose); + CpuAuxTensorHandler reinterpreted_wei( + _weights_reshaped, *weights, + /* import only if we chose the ReinterpretThenTranspose path, because otherwise the weight may have been freed */ + !use_reinterpreted_wei); + + const bool use_reshaped_wei = (_run_wt && (_wt_method == WeightTransformMethod::ReshapeThenTranspose || + _wt_method == WeightTransformMethod::FusedReshapeAndTranspose)); + CpuAuxTensorHandler reshaped_wei(offset_int_vec(WeightsReshaped), _weights_reshaped, tensors, + false /* pack_inject */, !use_reshaped_wei /* bypass_alloc */, + !use_reshaped_wei /* bypass_import */ + ); + // Update the weights to use if it has been reshaped + if (use_reinterpreted_wei) + { + gemm_pack.add_const_tensor(TensorType::ACL_SRC_1, reinterpreted_wei.get()); + } + else if (use_reshaped_wei) + { + gemm_pack.add_const_tensor(TensorType::ACL_SRC_1, reshaped_wei.get()); + } + + // Runs CpuGemm or CpuGemmLowpMatrixMultiplyCore functions + _is_quantized ? _mm_gemmlowp->run(gemm_pack) : _mm_gemm->run(gemm_pack); + + // Reshape output matrix + if (!_skip_col2im) + { + if (_data_layout == DataLayout::NCHW) + { + ITensorPack pack = {{TensorType::ACL_SRC, gemm_output.get()}, {TensorType::ACL_DST, dst}}; + NEScheduler::get().schedule_op(_col2im_kernel.get(), Window::DimY, _col2im_kernel->window(), pack); + } + else + { + ITensorPack pack = {{TensorType::ACL_SRC, gemm_output_to_use}, {TensorType::ACL_DST, dst}}; + _reshape->run(pack); + } + } + else if (out_has_padding) + { + ITensorPack pack = {{TensorType::ACL_SRC, gemm_output_to_use}, {TensorType::ACL_DST, dst}}; + _reshape->run(pack); + } +} + +void CpuGemmConv2d::prepare(ITensorPack &tensors) +{ + if (!_is_prepared) + { + auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); + // Determine which weights reshape path to take + // Note that this decision can only occur at prepare instead of configure because it relies on the presence of + // any holes in the weight tensor, which may change after configure (e.g. from extending padding) + if (_run_wt) + { + _wt_method = get_wt_method(*(weights->info())); + switch (_wt_method) + { + case (WeightTransformMethod::FusedReshapeAndTranspose): + { + ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Perform weight transformation: FusedReshapeAndTranspose"); + _weights_reshape_and_transpose_kernel = std::make_unique<kernels::CpuWeightsReshapeKernel>(); + _weights_reshape_and_transpose_kernel->configure(weights->info(), nullptr, &_weights_reshaped); + break; + } + case (WeightTransformMethod::ReshapeThenTranspose): + { + ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Perform weight transformation: ReshapeThenTranspose"); + _weights_reshape = std::make_unique<CpuReshape>(); + _weights_reshape->configure(weights->info(), &_weights_reshaped); + break; + } + case (WeightTransformMethod::ReinterpretThenTranspose): + { + ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Perform weight transformation: ReinterpretThenTranspose"); + // Nothing to configure + break; + } + default: + { + ARM_COMPUTE_ERROR("Unsupported weight transform method"); + } + } + } + else + { + ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("No weight transformation is performed"); + } + ITensorPack gemm_pack = tensors; + // Allocate reshaped weights if required + CpuAuxTensorHandler reinterpreted_wei( + _weights_reshaped, + *weights); // Re-interpreted weights. Only tensor shape is changed. 
No allocation + CpuAuxTensorHandler reshaped_wei(offset_int_vec(WeightsReshaped), _weights_reshaped, tensors); + // Run weights reshape if required + if (_run_wt) + { + switch (_wt_method) + { + case (WeightTransformMethod::FusedReshapeAndTranspose): + { + ITensorPack pack = {{TensorType::ACL_SRC, weights}, {TensorType::ACL_DST, reshaped_wei.get()}}; + NEScheduler::get().schedule_op(_weights_reshape_and_transpose_kernel.get(), Window::DimW, + _weights_reshape_and_transpose_kernel->window(), pack); + weights->mark_as_unused(); + gemm_pack.add_const_tensor(TensorType::ACL_SRC_1, reshaped_wei.get()); + break; + } + case (WeightTransformMethod::ReshapeThenTranspose): + { + ITensorPack pack = {{TensorType::ACL_SRC, weights}, {TensorType::ACL_DST, reshaped_wei.get()}}; + _weights_reshape->run(pack); + weights->mark_as_unused(); + gemm_pack.add_const_tensor(TensorType::ACL_SRC_1, reshaped_wei.get()); + break; + } + case (WeightTransformMethod::ReinterpretThenTranspose): + { + gemm_pack.add_const_tensor(TensorType::ACL_SRC_1, reinterpreted_wei.get()); + // Nothing to run + break; + } + default: + { + ARM_COMPUTE_ERROR("Unsupported weight transform method"); + } + } + } + _is_quantized ? _mm_gemmlowp->prepare(gemm_pack) : _mm_gemm->prepare(gemm_pack); + + _is_prepared = true; + } +} +experimental::MemoryRequirements CpuGemmConv2d::workspace() const +{ + return _aux_mem; +} +bool CpuGemmConv2d::isVarWeightsKernel() const +{ + return _mm_gemm && _mm_gemm->isVarWeightsKernel(); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuGemmConv2d.h b/src/cpu/operators/CpuGemmConv2d.h new file mode 100644 index 0000000000..48a0d11107 --- /dev/null +++ b/src/cpu/operators/CpuGemmConv2d.h @@ -0,0 +1,300 @@ +/* + * Copyright (c) 2021-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_OPERATORS_CPUGEMMCONV2D_H +#define ACL_SRC_CPU_OPERATORS_CPUGEMMCONV2D_H + +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/function_info/ActivationLayerInfo.h" + +#include "src/cpu/ICpuOperator.h" + +#include <memory> + +namespace arm_compute +{ +namespace cpu +{ +class CpuGemm; +class CpuGemmLowpMatrixMultiplyCore; +class CpuGemmLowpOutputStage; +class CpuReshape; +namespace kernels +{ +class CpuIm2ColKernel; +class CpuCol2ImKernel; +class CpuWeightsReshapeKernel; +} // namespace kernels + +/** Basic function to compute the convolution layer. 
@ref note_CpuGemmConv2d_weight_transformation */ +class CpuGemmConv2d : public ICpuOperator +{ +public: + /** Constructor */ + CpuGemmConv2d(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CpuGemmConv2d(const CpuGemmConv2d &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + CpuGemmConv2d(CpuGemmConv2d &&) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CpuGemmConv2d &operator=(const CpuGemmConv2d &) = delete; + /** Prevent instances of this class from being moved (As this class contains non movable objects) */ + CpuGemmConv2d &operator=(CpuGemmConv2d &&) = delete; + /** Destructor */ + ~CpuGemmConv2d(); + /** Set the input and output tensors. + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:------------------|:--------|:--------------| + * |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 | + * |BFLOAT16 |BFLOAT16 |BFLOAT16 |BFLOAT16 | + * |QASYMM8 |QASYMM8 |S32 |QASYMM8 | + * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | + * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED | + * + * @param[in] src Source tensor info. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. + * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32. + * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. + * Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32. + * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. + * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type. + * @param[out] dst Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. + * Data types supported: Same as @p input. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + * @param[in] weights_info Specifies if the weights tensor has been reshaped with CpuWeightsReshapeKernel. If this is not part of the fully connected layer the weights + * tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p input. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported. + * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation + * available which may introduce a drop of accuracy as well. Default is false + * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. 
num_groups != 1 is not supported
+     */
+    void configure(const ITensorInfo         *src,
+                   const ITensorInfo         *weights,
+                   const ITensorInfo         *biases,
+                   ITensorInfo               *dst,
+                   const PadStrideInfo       &conv_info,
+                   const WeightsInfo         &weights_info = WeightsInfo(),
+                   const Size2D              &dilation = Size2D(1U, 1U),
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+                   bool                       enable_fast_math = false,
+                   unsigned int               num_groups = 1);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to CpuGemmConv2d::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo         *src,
+                           const ITensorInfo         *weights,
+                           const ITensorInfo         *biases,
+                           const ITensorInfo         *output,
+                           const PadStrideInfo       &conv_info,
+                           const WeightsInfo         &weights_info = WeightsInfo(),
+                           const Size2D              &dilation = Size2D(1U, 1U),
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+                           bool                       enable_fast_math = false,
+                           unsigned int               num_groups = 1);
+
+    /** Indicates whether or not there is an optimal assembly implementation that can be used to process the given parameters.
+     *
+     * The parameter list is the same as @ref NEGEMMConvolutionLayer::has_opt_impl
+     *
+     * @return a status.
+     */
+    static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+                               const ITensorInfo         *src,
+                               const ITensorInfo         *weights,
+                               const ITensorInfo         *biases,
+                               const ITensorInfo         *output,
+                               const PadStrideInfo       &conv_info,
+                               const WeightsInfo         &weights_info = WeightsInfo(),
+                               const Size2D              &dilation = Size2D(1U, 1U),
+                               const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+                               const bool                 enable_fast_math = false);
+
+    // Inherited methods overridden:
+    void                             run(ITensorPack &tensors) override;
+    void                             prepare(ITensorPack &tensors) override;
+    experimental::MemoryRequirements workspace() const override;
+
+private:
+    /** Configures the appropriate matrix multiply routine
+     *
+     * @param[in]  src              Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
+     * @param[in]  weights          Weights tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
+     * @param[in]  biases           Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+     *                              Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
+     * @param[out] dst              Output tensor info. Data types supported: Same as @p input,
+     *                              except for input of QASYMM8/QASYMM8_SIGNED type where output should be of S32 type.
+     * @param[in]  act_info         (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
+     * @param[in]  enable_fast_math (Optional) Enable fast math computation. If this flag is set, the function may dispatch the fastest available
+     *                              implementation, which can also introduce a drop in accuracy. Defaults to false.
+     * @param[in]  gemm_3d_depth    (Optional) Depth of GEMM 3D (Defaults to 1)
+     * @param[in]  fixed_format     (Optional) Select GEMM execution with variable weights.
+     * @param[in]  weight_format    (Optional) The layout to be used for the weights tensor when running GEMM with variable weights.
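+     *
+     * @note Dispatch summary (informative, based on the implementation in CpuGemmConv2d.cpp):
+     *       quantized asymmetric inputs are routed to @ref CpuGemmLowpMatrixMultiplyCore with a
+     *       fused QUANTIZE_DOWN_FIXEDPOINT output stage, while every other supported data type is
+     *       routed to @ref CpuGemm; in both cases the workspace requirements of the chosen GEMM
+     *       are copied into _aux_mem.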
+     */
+    void configure_mm(const ITensorInfo         *src,
+                      const ITensorInfo         *weights,
+                      const ITensorInfo         *biases,
+                      ITensorInfo               *output,
+                      const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+                      bool                       enable_fast_math = false,
+                      int                        gemm_3d_depth = 1,
+                      bool                       fixed_format = false,
+                      arm_compute::WeightFormat  weight_format = arm_compute::WeightFormat::UNSPECIFIED);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConvolutionLayer matrix multiply routines
+     *
+     * @param[in] src              Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
+     * @param[in] weights          Weights tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
+     * @param[in] biases           Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+     *                             Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
+     * @param[in] dst              Output tensor info. Data types supported: Same as @p input,
+     *                             except for input of QASYMM8/QASYMM8_SIGNED type where output should be of S32 type.
+     * @param[in] act_info         (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
+     * @param[in] enable_fast_math (Optional) Enable fast math computation. If this flag is set, the function may dispatch the fastest available
+     *                             implementation, which can also introduce a drop in accuracy. Defaults to false.
+     * @param[in] gemm_3d_depth    (Optional) Depth of GEMM 3D (Defaults to 1)
+     * @param[in] skip_im2col      (Optional) Flag which specifies if im2col has to be skipped, i.e. for a 1x1 convolution with NHWC data layout. (Defaults to false)
+     * @param[in] fixed_format     (Optional) Select GEMM execution with variable weights.
+     * @param[in] weight_format    (Optional) The layout to be used for the weights tensor when running GEMM with variable weights.
+     *
+     * @return a status
+     */
+    static Status validate_mm(const ITensorInfo         *src,
+                              const ITensorInfo         *weights,
+                              const ITensorInfo         *biases,
+                              const ITensorInfo         *dst,
+                              const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+                              bool                       enable_fast_math = false,
+                              int                        gemm_3d_depth = 1,
+                              bool                       skip_im2col = false,
+                              bool                       fixed_format = false,
+                              arm_compute::WeightFormat  weight_format = arm_compute::WeightFormat::UNSPECIFIED);
+    /** Static function to check if GEMM3D is supported in @ref NEGEMM or in @ref CpuGemmLowpMatrixMultiplyCore
+     *
+     * @param[in] src           Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
+     * @param[in] weights       Weights tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
+     * @param[in] act_info      Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
+     * @param[in] gemm_3d_depth Depth of GEMM 3D
+     * @param[in] skip_im2col   Flag which specifies if im2col has to be skipped, i.e. for a 1x1 convolution with NHWC data layout.
+     *
+     * @return a status
+     */
+    static Status validate_gemm3d(const ITensorInfo         *src,
+                                  const ITensorInfo         *weights,
+                                  const ActivationLayerInfo &act_info,
+                                  int                        gemm_3d_depth,
+                                  bool                       skip_im2col);
+
+    struct SkipInfo
+    {
+        bool skip_im2col;
+        bool skip_col2im;
+    };
+
+    /** Static function to provide skip_im2col and skip_col2im information.
+     *
+     * @param[in] src       Input tensor info.
+     * @param[in] weights   Weights tensor info.
+     * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] dilation Dilation, in elements, across x and y. + * @param[in] act_info Activation layer information in case of a fused activation. + * + * @return a SkipInfo instance. + */ + static SkipInfo skip_im_col_info(const ITensorInfo *src, + const ITensorInfo *weights, + const PadStrideInfo &conv_info, + const Size2D &dilation, + const ActivationLayerInfo &act_info); + + /** Indicates if the convolution executes in variable weights mode. + * + * Similar to @ref CpuGemm::isVarWeightsKernel + */ + bool isVarWeightsKernel() const; + enum AuxTensorIdx + { + GemmAsmPretransposedRHS = 2, // CpuGemmAssemblyDispatch::Pretranspose + GemmTransposed1xWRHS = 5, // CpuGemm::Transposed1xWRHS + GemmLowpTransposed1xWRHS = 6, // CpuGemmLowpMatrixMultiplyCore::TmpB + /* Slots 0 - 9 reserved and shared by CpuGemmLowpMatrixMultiplyCore and CpuGemm */ + Im2ColOutput = 10, + WeightsReshaped, + GemmOutput, + Count + }; + + /** Weight transformation method. See @ref note_CpuGemmConv2d_weight_transformation */ + enum class WeightTransformMethod + { + ReinterpretThenTranspose, + ReshapeThenTranspose, + FusedReshapeAndTranspose, + }; + + /** Select weight transformation method + * + * @param[in] weights Input weights + * + * @return WeightTransformMethod + */ + static WeightTransformMethod get_wt_method(const ITensorInfo &weights); + + std::unique_ptr<CpuReshape> _weights_reshape; + std::unique_ptr<kernels::CpuWeightsReshapeKernel> _weights_reshape_and_transpose_kernel; + std::unique_ptr<kernels::CpuIm2ColKernel> _im2col_kernel; + std::unique_ptr<CpuGemm> _mm_gemm; + std::unique_ptr<CpuGemmLowpMatrixMultiplyCore> _mm_gemmlowp; + std::unique_ptr<kernels::CpuCol2ImKernel> _col2im_kernel; + std::unique_ptr<CpuReshape> _reshape; + + TensorInfo _im2col_output; + TensorInfo _weights_reshaped; + TensorInfo _gemm_output; + TensorInfo _gemm_output_3d; + + DataLayout _data_layout; + + bool _skip_im2col; + bool _skip_col2im; + bool _is_quantized; + bool _is_prepared; + WeightTransformMethod _wt_method; + bool _run_wt; + + experimental::MemoryRequirements _aux_mem{Count}; +}; +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_OPERATORS_CPUGEMMCONV2D_H diff --git a/src/cpu/operators/CpuGemmDirectConv2d.cpp b/src/cpu/operators/CpuGemmDirectConv2d.cpp new file mode 100644 index 0000000000..9187927541 --- /dev/null +++ b/src/cpu/operators/CpuGemmDirectConv2d.cpp @@ -0,0 +1,256 @@ +/* + * Copyright (c) 2021-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/operators/CpuGemmDirectConv2d.h" + +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/runtime/FunctionDescriptors.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/cpu/utils/CpuAuxTensorHandler.h" +#include "support/Cast.h" + +#include <set> + +namespace arm_compute +{ +namespace cpu +{ +using namespace arm_compute::experimental; +using namespace arm_compute::utils::cast; + +namespace +{ +GEMMLowpOutputStageInfo calculate_output_stage_metadata(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const ActivationLayerInfo &act) +{ + // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() + // Extract and negate input and weights offset + const QuantizationInfo iqinfo = src->quantization_info(); + const QuantizationInfo wqinfo = weights->quantization_info(); + const QuantizationInfo oqinfo = (dst->total_size() == 0) ? iqinfo : dst->quantization_info(); + const UniformQuantizationInfo uoqinfo = oqinfo.uniform(); + const DataType data_type = src->data_type(); + // Merge activation with output stage + const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { + ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, + ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU}; + PixelValue type_min{}; + PixelValue type_max{}; + std::tie(type_min, type_max) = get_min_max(data_type); + int32_t min_activation = type_min.get<int32_t>(); + int32_t max_activation = type_max.get<int32_t>(); + if (supported_acts.count(act.activation()) != 0) + { + std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act, data_type, uoqinfo); + } + GEMMLowpOutputStageInfo os_info; + os_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; + os_info.gemmlowp_offset = uoqinfo.offset; + os_info.gemmlowp_min_bound = min_activation; + os_info.gemmlowp_max_bound = max_activation; + os_info.is_quantized_per_channel = (weights->data_type() == DataType::QSYMM8_PER_CHANNEL); + quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, os_info); + return os_info; +} +cpu::AsmGemmInfo init_assembly_metadata(const Conv2dInfo &info, bool is_indirect) +{ + cpu::AsmGemmInfo asm_info; + asm_info.method = is_indirect ? 
cpu::AsmConvMethod::Indirect : cpu::AsmConvMethod::Conv;
+    asm_info.ps_info                 = info.conv_info;
+    asm_info.activation_info         = info.act_info;
+    asm_info.depth_output_gemm3d     = true;
+    asm_info.reinterpret_input_as_3d = true;
+    asm_info.padding_top             = info.conv_info.pad_top();
+    asm_info.padding_left            = info.conv_info.pad_left();
+    asm_info.padding_value           = 0.f;
+    asm_info.negated_offsets         = false;
+    asm_info.fast_mode               = info.enable_fast_math;
+    asm_info.fixed_format            = info.weights_info.weight_format() != WeightFormat::UNSPECIFIED;
+    asm_info.weight_format           = info.weights_info.weight_format();
+    return asm_info;
+}
+} // namespace
+
+CpuGemmDirectConv2d::CpuGemmDirectConv2d()
+    : _gemm_asm_func(std::make_unique<CpuGemmAssemblyDispatch>()),
+      _activation_func(std::make_unique<CpuActivation>()),
+      _weights_permute_func(std::make_unique<CpuPermute>()),
+      _aux_mem(AuxTensorIdx::Count),
+      _perm_weights(),
+      _run_activation(false),
+      _is_prepared(false)
+{
+}
+
+CpuGemmDirectConv2d::~CpuGemmDirectConv2d() = default;
+
+void CpuGemmDirectConv2d::configure(const ITensorInfo *src,
+                                    const ITensorInfo *weights,
+                                    const ITensorInfo *biases,
+                                    ITensorInfo       *dst,
+                                    const Conv2dInfo  &info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
+    ARM_COMPUTE_ERROR_THROW_ON(CpuGemmDirectConv2d::validate(src, weights, biases, dst, info));
+    ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, info);
+
+    _run_activation = info.act_info.enabled() && !_gemm_asm_func->is_activation_supported(info.act_info);
+    _is_prepared    = false;
+
+    _weights_permute_func->configure(weights, &_perm_weights, PermutationVector{3, 0, 1, 2});
+
+    // Configure assembly dispatch
+    cpu::AsmGemmInfo asm_info = init_assembly_metadata(info, false);
+    if (is_data_type_quantized(src->data_type()))
+    {
+        asm_info.output_stage = calculate_output_stage_metadata(src, weights, dst, info.act_info);
+    }
+    _gemm_asm_func->configure(src, &_perm_weights, biases, dst, asm_info);
+
+    // Configure activation
+    if (_run_activation)
+    {
+        _activation_func->configure(dst, nullptr, info.act_info);
+    }
+
+    // Add auxiliary memory requirements of the assembly dispatch
+    const auto asm_mem_req = _gemm_asm_func->workspace();
+    for (unsigned int slot = 0; slot < asm_mem_req.size(); ++slot)
+    {
+        _aux_mem[slot] = asm_mem_req[slot];
+    }
+
+    if (_aux_mem[Pretranspose].size > 0)
+    {
+        // Release the permuted weights at the end of prepare as they are further transposed by the assembly dispatch
+        _aux_mem[PermutedWeights] =
+            MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, weights->total_size());
+    }
+    else
+    {
+        // We must permute the weights if their format is WeightFormat::UNSPECIFIED
+        if (info.weights_info.weight_format() == WeightFormat::UNSPECIFIED)
+        {
+            _aux_mem[PermutedWeights] =
+                MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Persistent, weights->total_size());
+        }
+    }
+}
+Status CpuGemmDirectConv2d::validate(const ITensorInfo *src,
+                                     const ITensorInfo *weights,
+                                     const ITensorInfo *biases,
+                                     const ITensorInfo *dst,
+                                     const Conv2dInfo  &info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::BFLOAT16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16,
+                                                         DataType::F16, DataType::F32);
+    if (!is_fixed_format(info.weights_info.weight_format()))
+    {
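+        // The src/weights data-layout match is only enforced for non-fixed-format weights;
+        // fixed-format kernels prescribe their own weight memory layout.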
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights); + } + ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.num_groups > 1, "Grouping (num_groups != 1) is not supported on Neon"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NHWC, "Data layout supported is NHWC"); + const DataType data_type = src->data_type(); + const TensorShape i_shape = src->tensor_shape(); + const TensorShape w_shape = weights->tensor_shape(); + ARM_COMPUTE_RETURN_ERROR_ON(w_shape[0] != i_shape[0]); + ARM_COMPUTE_RETURN_ERROR_ON(info.dilation != Size2D(1U, 1U)); + ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); + // Validate biases + if (biases != nullptr) + { + if (is_data_type_quantized_asymmetric(data_type)) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); + } + else if (data_type == DataType::BFLOAT16) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases); + } + ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(3)); + ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); + } + + cpu::AsmGemmInfo asm_info = init_assembly_metadata(info, false); + ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuGemmAssemblyDispatch::validate(src, weights, biases, dst, asm_info)); + return Status{}; +} +void CpuGemmDirectConv2d::run(ITensorPack &tensors) +{ + prepare(tensors); + + _gemm_asm_func->run(tensors); + if (_run_activation) + { + ITensor *io = tensors.get_tensor(ACL_DST); + ITensorPack pack{{ACL_SRC, io}, {ACL_DST, io}}; + _activation_func->run(pack); + } +} + +void CpuGemmDirectConv2d::prepare(ITensorPack &tensors) +{ + if (!_is_prepared) + { + // If we are using fixed-format kernel the weights are already reshaped + if (_gemm_asm_func && _gemm_asm_func->isVarWeightsKernel()) + { + _gemm_asm_func->prepare(tensors); + _is_prepared = true; + return; + } + const ITensor *weights = tensors.get_const_tensor(ACL_SRC_1); + ITensor *weights_aux = + utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(PermutedWeights))); + ARM_COMPUTE_ERROR_ON_NULLPTR(weights, weights_aux); + + CpuAuxTensorHandler permuted_weights(_perm_weights, *weights_aux); + ITensorPack permute_tensors{{ACL_SRC, weights}, {ACL_DST, permuted_weights.get()}}; + _weights_permute_func->run(permute_tensors); + + tensors.add_const_tensor(ACL_SRC_1, permuted_weights.get()); + // Call prepare of assembly dispatch + _gemm_asm_func->prepare(tensors); + + _is_prepared = true; + } +} + +experimental::MemoryRequirements CpuGemmDirectConv2d::workspace() const +{ + return _aux_mem; +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuGemmDirectConv2d.h b/src/cpu/operators/CpuGemmDirectConv2d.h new file mode 100644 index 0000000000..a7365615b9 --- /dev/null +++ b/src/cpu/operators/CpuGemmDirectConv2d.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2021, 2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_OPERATORS_CPUGEMMDIRECTCONV2D_H +#define ACL_SRC_CPU_OPERATORS_CPUGEMMDIRECTCONV2D_H + +#include "arm_compute/core/TensorInfo.h" + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuOperator.h" +#include "src/cpu/operators/CpuActivation.h" +#include "src/cpu/operators/CpuPermute.h" +#include "src/cpu/operators/internal/CpuGemmAssemblyDispatch.h" + +namespace arm_compute +{ +// Forward declarations +class ITensor; +struct Conv2dInfo; +namespace cpu +{ +class CpuGemmDirectConv2d : public ICpuOperator +{ +public: + CpuGemmDirectConv2d(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmDirectConv2d); + ~CpuGemmDirectConv2d(); + /** Set the input and output tensors. + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:--------------|:--------------|:--------------| + * |QASYMM8 |QASYMM8 |S32 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | + * |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 | + * |BFLOAT16 |BFLOAT16 |BFLOAT16 |BFLOAT16 | + * + * @param[in] src Source tensor info. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. + * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32. + * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. + * Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32. + * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. + * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type. + * @param[in] dst Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. + * Data types supported: Same as @p input. + * @param[in] info Contains padding and stride information described in @ref PadStrideInfo. 
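+     *
+     * A minimal usage sketch (illustrative only; the tensor infos and the Conv2dInfo
+     * descriptor are assumed to be fully initialised by the caller):
+     * @code
+     * CpuGemmDirectConv2d conv{};
+     * if (bool(CpuGemmDirectConv2d::validate(&src, &weights, &biases, &dst, conv2d_info)))
+     * {
+     *     conv.configure(&src, &weights, &biases, &dst, conv2d_info);
+     * }
+     * @endcode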
+ */ + void configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const Conv2dInfo &info); + /** Static function to check if given info will lead to a valid configuration of @ref CpuGemmDirectConv2d + * + * Similar to CpuGemmDirectConv2d::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const Conv2dInfo &info); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &constants) override; + experimental::MemoryRequirements workspace() const override; + +private: + enum AuxTensorIdx + { + GemmTemp0 = 0, + GemmTemp1, + Pretranspose, + /* Slots above (0-2) are reserved for CpuGemmAssemblyDispatch */ + PermutedWeights, + Count + }; + + std::unique_ptr<CpuGemmAssemblyDispatch> _gemm_asm_func; + std::unique_ptr<CpuActivation> _activation_func; + std::unique_ptr<CpuPermute> _weights_permute_func; + experimental::MemoryRequirements _aux_mem; + TensorInfo _perm_weights; + bool _run_activation; + bool _is_prepared; +}; +} // namespace cpu +} // namespace arm_compute + +#endif // ACL_SRC_CPU_OPERATORS_CPUGEMMDIRECTCONV2D_H diff --git a/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp new file mode 100644 index 0000000000..f3396fbb5c --- /dev/null +++ b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp @@ -0,0 +1,779 @@ +/* + * Copyright (c) 2021-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/KernelDescriptors.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" +#include "arm_compute/runtime/TensorAllocator.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h" +#include "src/cpu/kernels/CpuGemmInterleave4x4Kernel.h" +#include "src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h" +#include "src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h" +#include "src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h" +#include "src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h" +#include "src/cpu/kernels/CpuGemmTranspose1xWKernel.h" +#include "src/cpu/operators/CpuActivation.h" +#include "src/cpu/operators/internal/CpuGemmAssemblyDispatch.h" +#include "src/cpu/utils/CpuAuxTensorHandler.h" + +using namespace arm_compute::misc::shape_calculator; +using namespace arm_compute::experimental; + +namespace arm_compute +{ +namespace cpu +{ +namespace +{ +cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info) +{ + cpu::AsmGemmInfo asm_info; + asm_info.method = cpu::AsmConvMethod::Im2Col; + asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d(); + asm_info.depth_output_gemm3d = info.depth_output_gemm3d(); + asm_info.activation_info = info.activation_info(); + asm_info.output_stage = info.gemmlowp_output_stage(); + asm_info.fast_mode = info.fast_math(); + asm_info.accumulate = info.accumulate(); + + return asm_info; +} +} // namespace + +CpuGemmLowpMatrixMultiplyCore::CpuGemmLowpMatrixMultiplyCore() + : _asm_glue(std::make_unique<CpuGemmAssemblyDispatch>()), + _mm_kernel(), + _mtx_a_reshape_kernel(), + _mtx_b_reshape_kernel(), + _mtx_a_reduction_kernel(), + _mtx_b_reduction_kernel(), + _offset_contribution_kernel(), + _offset_contribution_output_stage_kernel(), + _activation_func(), + _convert_to_signed_asymm(), + _convert_from_signed_asymm(), + _vector_sum_col(), + _vector_sum_row(), + _tmp_a(), + _tmp_b(), + _mm_result_s32(), + _signed_a(), + _signed_output(), + _a_offset(0), + _b_offset(0), + _run_vector_matrix_multiplication(false), + _assembly_path(false), + _fused_assembly_path(false), + _reshape_b_only_on_first_run(false), + _is_prepared(false), + _fuse_output_stage(false), + _run_activation(false), + _flip_signedness(false), + _gemm_info(), + _aux_mem(Count) +{ +} +CpuGemmLowpMatrixMultiplyCore::~CpuGemmLowpMatrixMultiplyCore() = default; + +void CpuGemmLowpMatrixMultiplyCore::configure( + const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *dst, const GEMMInfo &gemm_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, dst); + ARM_COMPUTE_ERROR_THROW_ON(CpuGemmLowpMatrixMultiplyCore::validate(a, b, c, dst, gemm_info)); + ARM_COMPUTE_LOG_PARAMS(a, b, c, dst, gemm_info); + + const ITensorInfo *matrix_a = a; + const ITensorInfo *matrix_b = b; + GEMMInfo info = gemm_info; + + // Set internal variables + _a_offset = a->quantization_info().uniform().offset; + _b_offset = b->quantization_info().uniform().offset; + _run_vector_matrix_multiplication = a->dimension(1) < 2; + _reshape_b_only_on_first_run = b->are_values_constant(); + _is_prepared = false; + 
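    // Note: a->dimension(1) is the number of rows of the LHS matrix, so a single-row LHS
+    // degenerates into a vector-matrix product that skips the interleave path below.
+    // Reshaping B only on the first run is sound solely because its values are constant
+    // across runs, hence the b->are_values_constant() check above.
+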
_fused_assembly_path = false;
+    _flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) &&
+                       _reshape_b_only_on_first_run;
+    _gemm_info = gemm_info;
+
+    // Offset kernel is needed if the offset is non-zero or if it may change (i.e. dynamic).
+    // It is not needed if the datatype is symmetric, because there is no offset
+    bool a_offset_kernel_needed = _a_offset != 0 || a->quantization_info().is_dynamic();
+    bool b_offset_kernel_needed = _b_offset != 0 || b->quantization_info().is_dynamic();
+
+    _asm_glue = std::make_unique<cpu::CpuGemmAssemblyDispatch>();
+
+    const ITensorInfo *a_to_use = a;
+
+    // Convert QASYMM8 -> QASYMM8_SIGNED and back
+    if (_flip_signedness)
+    {
+        const int32_t                 offset_correction = 128;
+        const DataType                dt                = DataType::QASYMM8_SIGNED;
+        const UniformQuantizationInfo iqinfo            = a_to_use->quantization_info().uniform();
+
+        _signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(
+            QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction));
+        _convert_to_signed_asymm = std::make_unique<kernels::CpuConvertQuantizedSignednessKernel>();
+        _convert_to_signed_asymm->configure(a_to_use, &_signed_a);
+        a_to_use  = &_signed_a;
+        _a_offset = _signed_a.quantization_info().uniform().offset;
+
+        const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform();
+        _signed_output = dst->clone()->set_data_type(dt).set_quantization_info(
+            QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction));
+
+        // Output stage correction
+        GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();
+        output_stage_corr.gemmlowp_offset         = _signed_output.quantization_info().uniform().offset;
+        output_stage_corr.gemmlowp_min_bound -= offset_correction;
+        output_stage_corr.gemmlowp_max_bound -= offset_correction;
+        info.set_gemmlowp_output_stage(output_stage_corr);
+
+        // Update matrix a
+        matrix_a = &_signed_a;
+    }
+
+    // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
+    if (info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
+    {
+        _fuse_output_stage = true;
+        _mm_result_s32     = TensorInfo(dst->tensor_shape(), 1, DataType::S32);
+    }
+
+    // Initialize assembly kernel meta-data
+    const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
+#ifdef __aarch64__
+    if (!(!b->are_values_constant() &&
+          b->tensor_shape().z() > 1)) // Disable batch matmul as optimized GeMM handles batching differently.
+    {
+        switch (a->data_type())
+        {
+            case DataType::QASYMM8:
+            case DataType::QASYMM8_SIGNED:
+            case DataType::U8:
+            case DataType::S8:
+            {
+                if (is_data_type_quantized_asymmetric(a_to_use->data_type()) &&
+                    info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+                {
+                    _asm_glue->configure(a_to_use, b, c, dst, asm_info);
+                    _fused_assembly_path = _asm_glue->is_configured();
+                }
+                else
+                {
+                    auto output_to_use = (_fuse_output_stage ?
&_mm_result_s32 : dst); + _asm_glue->configure(a_to_use, b, nullptr, output_to_use, asm_info); + } + _assembly_path = _asm_glue->is_configured(); + break; + } + default: + { + ARM_COMPUTE_ERROR("Datatype not supported"); + break; + } + } + } +#endif /* __aarch64__ */ + if (!(_assembly_path || _run_vector_matrix_multiplication)) + { + matrix_a = &_tmp_a; + matrix_b = &_tmp_b; + + // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ] + _tmp_a = + TensorInfo(compute_interleaved_shape(*a_to_use), 1, a_to_use->data_type(), a_to_use->quantization_info()); + // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ] + _tmp_b = TensorInfo(compute_transpose1xW_shape(*b), 1, b->data_type(), b->quantization_info()); + + // Configure interleave kernel + _mtx_a_reshape_kernel = std::make_unique<kernels::CpuGemmInterleave4x4Kernel>(); + _mtx_a_reshape_kernel->configure(a_to_use, &_tmp_a); + + // Configure transpose kernel + _mtx_b_reshape_kernel = std::make_unique<kernels::CpuGemmTranspose1xWKernel>(); + _mtx_b_reshape_kernel->configure(b, &_tmp_b); + } + + if (!_fused_assembly_path) + { + // Build reduction info + const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false); + + if (a_offset_kernel_needed) + { + _vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32); + + // Configure Matrix B reduction kernel + _mtx_b_reduction_kernel = std::make_unique<kernels::CpuGemmLowpMatrixBReductionKernel>(); + _mtx_b_reduction_kernel->configure(b, &_vector_sum_col, reduction_info); + } + + if (b_offset_kernel_needed) + { + _vector_sum_row = TensorInfo(compute_reductionB_shape(*a_to_use), 1, DataType::S32); + + // Configure matrix A reduction kernel + _mtx_a_reduction_kernel = std::make_unique<kernels::CpuGemmLowpMatrixAReductionKernel>(); + _mtx_a_reduction_kernel->configure(a_to_use, &_vector_sum_row, reduction_info); + } + + if (_fuse_output_stage) + { + // Configure matrix multiply kernel + if (!_assembly_path) + { + _mm_kernel = std::make_unique<kernels::CpuGemmLowpMatrixMultiplyKernel>(); + _mm_kernel->configure(matrix_a, matrix_b, &_mm_result_s32); + } + + _offset_contribution_output_stage_kernel = + std::make_unique<kernels::CpuGemmLowpOffsetContributionOutputStageKernel>(); + _offset_contribution_output_stage_kernel->configure( + &_mm_result_s32, a_offset_kernel_needed ? &_vector_sum_col : nullptr, + b_offset_kernel_needed ? &_vector_sum_row : nullptr, c, _flip_signedness ? &_signed_output : dst, + a->dimension(0), _a_offset, _b_offset, info.gemmlowp_output_stage()); + + if (_flip_signedness) + { + _convert_from_signed_asymm = std::make_unique<kernels::CpuConvertQuantizedSignednessKernel>(); + _convert_from_signed_asymm->configure(&_signed_output, dst); + } + } + else + { + // This scale is needed for the s8_f32 kernel where the multiplication output is dequantized to F32. + const float dequantize_scale = + (dst->data_type() == DataType::F32) + ? a->quantization_info().uniform().scale * b->quantization_info().uniform().scale + : 1.0f; + // Configure matrix multiply kernel + if (!_assembly_path) + { + _mm_kernel = std::make_unique<kernels::CpuGemmLowpMatrixMultiplyKernel>(); + _mm_kernel->configure(matrix_a, matrix_b, dst); + } + // Configure offset contribution kernel + _offset_contribution_kernel = std::make_unique<kernels::CpuGemmLowpOffsetContributionKernel>(); + _offset_contribution_kernel->configure(dst, a_offset_kernel_needed ? 
&_vector_sum_col : nullptr,
+                                               b_offset_kernel_needed ? &_vector_sum_row : nullptr,
+                                               a_to_use->dimension(0), _a_offset, _b_offset, dequantize_scale);
+        }
+    }
+    // Configure activation
+    const ActivationLayerInfo &activation = gemm_info.activation_info();
+    _run_activation =
+        activation.enabled() && (!_assembly_path || !cpu::CpuGemmAssemblyDispatch::is_activation_supported(activation));
+    if (_run_activation)
+    {
+        _activation_func = std::make_unique<CpuActivation>();
+        _activation_func->configure(dst, nullptr, activation);
+    }
+
+    if (_assembly_path)
+    {
+        const auto asm_mem_req = _asm_glue->workspace();
+        for (unsigned int slot = 0; slot < asm_mem_req.size(); ++slot)
+        {
+            _aux_mem[slot] = asm_mem_req[slot];
+        }
+    }
+
+    // Request memory for LHS and RHS reshape matrix
+    _aux_mem[VectorSumCol] = MemoryInfo(offset_int_vec(VectorSumCol),
+                                        !_fused_assembly_path && a_offset_kernel_needed && _reshape_b_only_on_first_run
+                                            ? MemoryLifetime::Persistent
+                                            : MemoryLifetime::Temporary,
+                                        _vector_sum_col.total_size());
+    _aux_mem[VectorSumRow] =
+        MemoryInfo(offset_int_vec(VectorSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size());
+    _aux_mem[TmpA] = MemoryInfo(offset_int_vec(TmpA), MemoryLifetime::Temporary, _tmp_a.total_size());
+    _aux_mem[TmpB] = MemoryInfo(offset_int_vec(TmpB),
+                                _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary,
+                                _tmp_b.total_size());
+    _aux_mem[MMResultS32] =
+        MemoryInfo(offset_int_vec(MMResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size());
+    _aux_mem[SignedA]      = MemoryInfo(offset_int_vec(SignedA), MemoryLifetime::Temporary, _signed_a.total_size());
+    _aux_mem[SignedOutput] =
+        MemoryInfo(offset_int_vec(SignedOutput), MemoryLifetime::Temporary, _signed_output.total_size());
+}
+
+Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a,
+                                               const ITensorInfo *b,
+                                               const ITensorInfo *c,
+                                               const ITensorInfo *output,
+                                               const GEMMInfo    &gemm_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8,
+                                                         DataType::QASYMM8_SIGNED, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr && output->data_type() != DataType::F32 &&
+                                        gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE,
+                                    "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        (a)->dimension(0) != (b)->dimension(1),
+        "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
+
+    // When using accumulation (in-place summation), for now, the only supported DataType for the output is S32.
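+    // In other words, the output stage must be GEMMLowpOutputStageType::NONE so that the raw
+    // int32 accumulators are written back and summed in place, which the check below enforces.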
+    if (gemm_info.accumulate())
+    {
+#ifdef __arm__
+        ARM_COMPUTE_RETURN_ERROR_MSG("Accumulation is not supported for armv7");
+#endif /* __arm__ */
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE,
+                                        "Accumulation is not supported for output QASYMM8/QASYMM8_SIGNED");
+    }
+
+    GEMMInfo           info          = gemm_info;
+    const ITensorInfo *matrix_a_info = a;
+    const ITensorInfo *matrix_b_info = b;
+
+    const ITensorInfo *a_to_use = a;
+
+    TensorInfo tmp_a_info{};
+    TensorInfo tmp_b_info{};
+    TensorInfo mm_result_s32_info{};
+
+    int32_t a_offset = a->quantization_info().uniform().offset;
+    int32_t b_offset = b->quantization_info().uniform().offset;
+
+    // Offset kernel is needed if the offset is non-zero or if it may change (i.e. dynamic).
+    bool a_offset_kernel_needed = a_offset != 0 || a->quantization_info().is_dynamic();
+    bool b_offset_kernel_needed = b_offset != 0 || b->quantization_info().is_dynamic();
+
+    bool fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE;
+    if (fuse_output_stage)
+    {
+        auto_init_if_empty(mm_result_s32_info,
+                           a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32));
+    }
+
+    // Convert QASYMM8->QASYMM8_SIGNED
+    TensorInfo signed_a{};
+    TensorInfo signed_output{};
+    bool       flip_signedness = is_data_type_quantized_per_channel(b->data_type()) &&
+                           (a->data_type() == DataType::QASYMM8) && info.reshape_b_only_on_first_run();
+    if (flip_signedness)
+    {
+        const int32_t                 offset_correction = 128;
+        const DataType                dt                = DataType::QASYMM8_SIGNED;
+        const UniformQuantizationInfo iqinfo            = a_to_use->quantization_info().uniform();
+
+        signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(
+            QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction));
+        ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConvertQuantizedSignednessKernel::validate(a_to_use, &signed_a));
+        a_to_use = &signed_a;
+        a_offset = signed_a.quantization_info().uniform().offset;
+
+        const UniformQuantizationInfo oqinfo = output->quantization_info().uniform();
+        signed_output = output->clone()->set_data_type(dt).set_quantization_info(
+            QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction));
+
+        // Output stage correction
+        GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();
+        output_stage_corr.gemmlowp_offset         = signed_output.quantization_info().uniform().offset;
+        output_stage_corr.gemmlowp_min_bound -= offset_correction;
+        output_stage_corr.gemmlowp_max_bound -= offset_correction;
+        info.set_gemmlowp_output_stage(output_stage_corr);
+
+        // Update matrix a
+        matrix_a_info = &signed_a;
+    }
+
+    // Initialize assembly kernel meta-data
+    const AsmGemmInfo asm_info = init_assembly_metadata(info);
+
+    // Check if we need to run the optimized assembly kernel
+    bool run_optimised             = false;
+    bool run_optimised_requantized = false;
+
+    if (!(!b->are_values_constant() &&
+          b->tensor_shape().z() > 1)) // Disable batch matmul as optimized GeMM handles batching differently.
+    {
+        if (is_data_type_quantized_asymmetric(a_to_use->data_type()) &&
+            info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+        {
+            run_optimised             = bool(CpuGemmAssemblyDispatch::validate(a_to_use, b, c, output, asm_info));
+            run_optimised_requantized = run_optimised;
+        }
+        else
+        {
+            run_optimised = bool(CpuGemmAssemblyDispatch::validate(
+                a_to_use, b, nullptr, fuse_output_stage ?
&mm_result_s32_info : output, asm_info));
+        }
+    }
+
+    if (run_optimised)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
+        if (info.depth_output_gemm3d() != 0)
+        {
+            if (info.reinterpret_input_as_3d())
+            {
+                ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
+                ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));
+            }
+            else
+            {
+                ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2));
+            }
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
+        }
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(),
+                                        "NEGEMM cannot reinterpret the input tensor as 3D");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0,
+                                        "NEGEMM cannot reinterpret the output tensor as 3D");
+
+        const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
+        if (!run_vector_matrix_multiplication)
+        {
+            matrix_a_info = &tmp_a_info;
+            matrix_b_info = &tmp_b_info;
+
+            // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
+            TensorShape shape_tmp_a = a->tensor_shape();
+            shape_tmp_a.set(0, a->dimension(0) * 4);
+            shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));
+
+            // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
+            TensorShape shape_tmp_b = b->tensor_shape();
+            shape_tmp_b.set(0, b->dimension(1) * 16);
+            shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
+
+            // Validate interleave kernel
+            auto_init_if_empty(tmp_a_info, a_to_use->clone()->set_tensor_shape(shape_tmp_a));
+            auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(shape_tmp_b));
+
+            ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmInterleave4x4Kernel::validate(a_to_use, &tmp_a_info));
+            ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmTranspose1xWKernel::validate(b, &tmp_b_info));
+        }
+    }
+
+    if (!run_optimised_requantized)
+    {
+        TensorInfo info_vector_sum_col{};
+        TensorInfo info_vector_sum_row{};
+
+        const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false);
+
+        // Validate matrix B reduction kernel only if the A offset contribution is needed (non-zero or dynamic offset)
+        if (a_offset_kernel_needed)
+        {
+            info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);
+
+            // Configure Matrix B reduction kernel
+            ARM_COMPUTE_RETURN_ON_ERROR(
+                kernels::CpuGemmLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, reduction_info));
+        }
+
+        // Validate matrix A reduction kernel only if the B offset contribution is needed (non-zero or dynamic offset)
+        if (b_offset_kernel_needed)
+        {
+            info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);
+
+            // Configure matrix A reduction kernel
+            ARM_COMPUTE_RETURN_ON_ERROR(
+                kernels::CpuGemmLowpMatrixAReductionKernel::validate(a_to_use, &info_vector_sum_row, reduction_info));
+        }
+
+        if (fuse_output_stage)
+        {
+            if (!run_optimised)
+            {
+                ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+                    info.reinterpret_input_as_3d(),
+                    "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D");
+                ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+                    info.depth_output_gemm3d() != 0,
+                    "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D");
+
+                ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixMultiplyKernel::validate(
+                    matrix_a_info, matrix_b_info, &mm_result_s32_info));
+            }
+
+            // Validate offset contribution kernel
+            ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionOutputStageKernel::validate(
+                &mm_result_s32_info,
a_offset_kernel_needed ? &info_vector_sum_col : nullptr, + b_offset_kernel_needed ? &info_vector_sum_row : nullptr, c, flip_signedness ? &signed_output : output, + a_offset, b_offset, info.gemmlowp_output_stage())); + } + else + { + if (!run_optimised) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + info.reinterpret_input_as_3d(), + "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + info.depth_output_gemm3d() != 0, + "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D"); + + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::CpuGemmLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output)); + } + // Validate offset contribution kernel + ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionKernel::validate( + output, a_offset_kernel_needed ? &info_vector_sum_col : nullptr, + b_offset_kernel_needed ? &info_vector_sum_row : nullptr, a_offset, b_offset)); + } + } + + // Validate activation + const ActivationLayerInfo &activation = gemm_info.activation_info(); + if (activation.enabled()) + { + ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(output, nullptr, activation)); + } + + return Status{}; +} + +void CpuGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors) +{ + prepare(tensors); + + auto a = tensors.get_const_tensor(TensorType::ACL_SRC_0); + auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto c = tensors.get_const_tensor(TensorType::ACL_SRC_2); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + auto a_to_use = a; + auto matrix_a = a; + auto matrix_b = b; + + CpuAuxTensorHandler vector_sum_col(offset_int_vec(VectorSumCol), _vector_sum_col, tensors, false); + CpuAuxTensorHandler vector_sum_row(offset_int_vec(VectorSumRow), _vector_sum_row, tensors, false); + CpuAuxTensorHandler tmp_a(offset_int_vec(TmpA), _tmp_a, tensors, false); + CpuAuxTensorHandler tmp_b(offset_int_vec(TmpB), _tmp_b, tensors, true); + CpuAuxTensorHandler mm_result_s32(offset_int_vec(MMResultS32), _mm_result_s32, tensors, false); + CpuAuxTensorHandler signed_a(offset_int_vec(SignedA), _signed_a, tensors, false); + CpuAuxTensorHandler signed_output(offset_int_vec(SignedOutput), _signed_output, tensors, false); + + const QuantizationInfo a_qinfo = a->info()->quantization_info(); + const QuantizationInfo b_qinfo = b->info()->quantization_info(); + + if (a_qinfo.is_dynamic()) + _a_offset = a_qinfo.uniform().offset; + if (b_qinfo.is_dynamic()) + _b_offset = b_qinfo.uniform().offset; + + // Convert QASYMM8->QASYMM8_SIGNED + if (_flip_signedness) + { + ITensorPack pack = {{TensorType::ACL_SRC, a}, {TensorType::ACL_DST, signed_a.get()}}; + NEScheduler::get().schedule_op(_convert_to_signed_asymm.get(), Window::DimY, _convert_to_signed_asymm->window(), + pack); + a_to_use = signed_a.get(); + matrix_a = signed_a.get(); + } + + // Run GEMM + if (_asm_glue->is_configured()) + { + ITensorPack asm_glue_tensors = tensors; + auto output_to_use = (_fuse_output_stage ? 
mm_result_s32.get() : dst); + if (is_data_type_quantized_asymmetric(a_to_use->info()->data_type()) && + _gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + { + asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_0, a_to_use); + asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_1, b); + asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_2, c); + asm_glue_tensors.add_tensor(TensorType::ACL_DST, dst); + } + else + { + asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_0, a_to_use); + asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_1, b); + asm_glue_tensors.add_tensor(TensorType::ACL_DST, output_to_use); + } + _asm_glue->run(asm_glue_tensors); + } + else + { + if (!_run_vector_matrix_multiplication) + { + matrix_a = tmp_a.get(); + matrix_b = tmp_b.get(); + // Run interleave kernel + ITensorPack pack_a = {{TensorType::ACL_SRC, a_to_use}, {TensorType::ACL_DST, tmp_a.get()}}; + NEScheduler::get().schedule_op(_mtx_a_reshape_kernel.get(), Window::DimY, _mtx_a_reshape_kernel->window(), + pack_a); + + if (!_reshape_b_only_on_first_run) + { + ITensorPack pack_b = {{TensorType::ACL_SRC, b}, {TensorType::ACL_DST, tmp_b.get()}}; + // Run transpose kernel + NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY, + _mtx_b_reshape_kernel->window(), pack_b); + } + } + ITensorPack pack_mm = {{TensorType::ACL_SRC_0, matrix_a}, {TensorType::ACL_SRC_1, matrix_b}}; + if (_fuse_output_stage) + { + pack_mm.add_tensor(TensorType::ACL_DST, mm_result_s32.get()); + } + else + { + pack_mm.add_tensor(TensorType::ACL_DST, dst); + } + NEScheduler::get().schedule_op(_mm_kernel.get(), Window::DimY, _mm_kernel->window(), pack_mm); + } + + if (!_fused_assembly_path) + { + // Run matrix A reduction kernel only if _b_offset is not equal to 0 + if (_b_offset != 0) + { + ITensorPack pack = {{TensorType::ACL_SRC, a_to_use}, {TensorType::ACL_DST, vector_sum_row.get()}}; + NEScheduler::get().schedule_op(_mtx_a_reduction_kernel.get(), Window::DimX, + _mtx_a_reduction_kernel->window(), pack); + } + + // Run matrix B reduction kernel only if _a_offset is not equal to 0 + if (_a_offset != 0 && !_reshape_b_only_on_first_run) + { + ITensorPack pack = {{TensorType::ACL_SRC, b}, {TensorType::ACL_DST, vector_sum_col.get()}}; + NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX, + _mtx_b_reduction_kernel->window(), pack); + } + + if (_fuse_output_stage) + { + if (a_qinfo.is_dynamic()) + _offset_contribution_output_stage_kernel->set_a_offset(_a_offset); + if (b_qinfo.is_dynamic()) + _offset_contribution_output_stage_kernel->set_b_offset(_b_offset); + + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, mm_result_s32.get()); + pack.add_tensor(TensorType::ACL_SRC_1, _a_offset == 0 ? nullptr : vector_sum_col.get()); + pack.add_tensor(TensorType::ACL_SRC_2, _b_offset == 0 ? nullptr : vector_sum_row.get()); + pack.add_tensor(TensorType::ACL_SRC_3, c); + pack.add_tensor(TensorType::ACL_DST, _flip_signedness ? 
signed_output.get() : dst); + + // Run offset contribution kernel + NEScheduler::get().schedule_op(_offset_contribution_output_stage_kernel.get(), Window::DimY, + _offset_contribution_output_stage_kernel->window(), pack); + } + else + { + if (a_qinfo.is_dynamic()) + _offset_contribution_kernel->set_a_offset(_a_offset); + if (b_qinfo.is_dynamic()) + _offset_contribution_kernel->set_b_offset(_b_offset); + if (a_qinfo.is_dynamic() || b_qinfo.is_dynamic()) + { + const float dequantize_scale = a_qinfo.uniform().scale * b_qinfo.uniform().scale; + _offset_contribution_kernel->set_scale(dequantize_scale); + } + + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC_0, _a_offset == 0 ? nullptr : vector_sum_col.get()); + pack.add_tensor(TensorType::ACL_SRC_1, _b_offset == 0 ? nullptr : vector_sum_row.get()); + pack.add_tensor(TensorType::ACL_DST, dst); + + // Run offset contribution kernel + NEScheduler::get().schedule_op(_offset_contribution_kernel.get(), Window::DimY, + _offset_contribution_kernel->window(), pack); + } + } + + // Convert QASYMM8_SIGNED->QASYMM8 + if (!_fused_assembly_path && _fuse_output_stage && _flip_signedness) + { + ITensorPack pack = {{TensorType::ACL_SRC, signed_output.get()}, {TensorType::ACL_DST, dst}}; + NEScheduler::get().schedule_op(_convert_from_signed_asymm.get(), Window::DimY, + _convert_from_signed_asymm->window(), pack); + } + + // Run fused activation unless already run in the fused assembly + if (_run_activation) + { + ITensorPack pack = {{TensorType::ACL_SRC, dst}, {TensorType::ACL_DST, dst}}; + _activation_func->run(pack); + } +} + +void CpuGemmLowpMatrixMultiplyCore::prepare(ITensorPack &tensors) +{ + if (!_is_prepared) + { + auto original_b = tensors.get_const_tensor(TensorType::ACL_SRC_1); + // Run assembly reshape + if (_asm_glue->is_configured()) + { + _asm_glue->prepare(tensors); + } + // Run non-assembly reshape + else if (_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue->is_configured()) + { + // Run reshape kernel and mark original weights tensor as unused + ITensor *tmp_b_p = utils::cast::polymorphic_downcast<ITensor *>(tensors.get_tensor(offset_int_vec(TmpB))); + CpuAuxTensorHandler tmp_b(_tmp_b, *tmp_b_p); + ITensorPack pack = {{TensorType::ACL_SRC, original_b}, {TensorType::ACL_DST, tmp_b.get()}}; + NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY, _mtx_b_reshape_kernel->window(), + pack); + } + + // Run matrix B reduction kernel only if _a_offset is not equal to 0 + if (!_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run) + { + ITensor *vector_sum_col_p = + utils::cast::polymorphic_downcast<ITensor *>(tensors.get_tensor(offset_int_vec(VectorSumCol))); + CpuAuxTensorHandler vector_sum_col(_vector_sum_col, *vector_sum_col_p); + ITensorPack pack = {{TensorType::ACL_SRC, original_b}, {TensorType::ACL_DST, vector_sum_col.get()}}; + NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX, + _mtx_b_reduction_kernel->window(), pack); + } + _is_prepared = true; + } +} +experimental::MemoryRequirements CpuGemmLowpMatrixMultiplyCore::workspace() const +{ + return _aux_mem; +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h new file mode 100644 index 0000000000..38121c9bb4 --- /dev/null +++ b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h @@ -0,0 +1,184 @@ +/* + * Copyright (c) 2021, 2023-2024 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_OPERATORS_CPUGEMMLOWPMATRIXMULTIPLYCORE_H +#define ACL_SRC_CPU_OPERATORS_CPUGEMMLOWPMATRIXMULTIPLYCORE_H + +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/function_info/GEMMInfo.h" + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuOperator.h" + +#include <memory> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +class CpuGemmInterleave4x4Kernel; +class CpuGemmLowpMatrixMultiplyKernel; +class CpuGemmLowpOffsetContributionKernel; +class CpuGemmLowpOffsetContributionOutputStageKernel; +class CpuGemmLowpMatrixAReductionKernel; +class CpuGemmLowpMatrixBReductionKernel; +class CpuGemmTranspose1xWKernel; +class CpuConvertQuantizedSignednessKernel; +} // namespace kernels +class CpuGemmAssemblyDispatch; +class CpuActivation; + +/** Basic function to execute GEMMLowpMatrixMultiplyCore. 
This function calls the following kernels if the DOT product instruction is not available:
+ *
+ *  -# @ref kernels::CpuGemmInterleave4x4Kernel
+ *  -# @ref kernels::CpuGemmTranspose1xWKernel
+ *  -# @ref kernels::CpuGemmLowpMatrixMultiplyKernel
+ *  -# @ref kernels::CpuGemmLowpOffsetContributionKernel
+ *  -# @ref CpuActivation
+ *
+ * otherwise if the DOT product instruction is available:
+ *
+ *  -# @ref kernels::CpuGemmLowpOffsetContributionKernel
+ *
+*/
+class CpuGemmLowpMatrixMultiplyCore : public ICpuOperator
+{
+public:
+    /** Constructor */
+    CpuGemmLowpMatrixMultiplyCore();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmLowpMatrixMultiplyCore);
+    /** Destructor */
+    ~CpuGemmLowpMatrixMultiplyCore();
+    /** Initialise the kernel's inputs and output
+     *
+     * Valid data layouts:
+     * - NHWC
+     * - NCHW
+     *
+     * Valid data type configurations:
+     * |src0           |src1               |src2     |dst            |
+     * |:--------------|:------------------|:--------|:--------------|
+     * |QASYMM8        |QASYMM8            |S32      |QASYMM8        |
+     * |QASYMM8        |QSYMM8_PER_CHANNEL |S32      |QASYMM8        |
+     * |QASYMM8        |QSYMM8             |S32      |QASYMM8        |
+     * |QASYMM8        |QASYMM8            |S32      |S32            |
+     * |QASYMM8        |QSYMM8_PER_CHANNEL |S32      |S32            |
+     * |QASYMM8        |QSYMM8             |S32      |S32            |
+     * |QASYMM8_SIGNED |QASYMM8_SIGNED     |S32      |QASYMM8_SIGNED |
+     * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32      |QASYMM8_SIGNED |
+     * |QASYMM8_SIGNED |QSYMM8             |S32      |QASYMM8_SIGNED |
+     * |QASYMM8_SIGNED |QASYMM8_SIGNED     |S32      |S32            |
+     * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32      |S32            |
+     * |QASYMM8_SIGNED |QSYMM8             |S32      |S32            |
+     * |QASYMM8_SIGNED |QASYMM8_SIGNED     |F32      |F32            |
+     *
+     * @note GEMM_LOWP: low precision GEMM kernel
+     *  This kernel performs the following computations:
+     *
+     *  -# Convert a values from QASYMM8 to int32 and add a_offset to each of them.
+     *  -# Convert b values from QASYMM8 to int32 and add b_offset to each of them.
+     *  -# Compute the matrix product of the resulting a * b in int32.
+     *
+     * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is QASYMM8/QASYMM8_SIGNED/F32 otherwise
+     *
+     * @param[in]  a         First input tensor info (Matrix A). Data type supported: QASYMM8/QASYMM8_SIGNED.
+     * @param[in]  b         Second input tensor info (Matrix B). Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL.
+     * @param[in]  c         Third input tensor info (Matrix C). It can be a nullptr. Data type supported: S32/F32
+     * @param[out] dst       Output tensor info. Data type supported: S32/QASYMM8/QASYMM8_SIGNED/F32
+     * @param[in]  gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
+     *                       if the reshape of matrix B should be executed only for the first run
+     */
+    void configure(const ITensorInfo *a,
+                   const ITensorInfo *b,
+                   const ITensorInfo *c,
+                   ITensorInfo       *dst,
+                   const GEMMInfo    &gemm_info = GEMMInfo());
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to CpuGemmLowpMatrixMultiplyCore::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *a,
+                           const ITensorInfo *b,
+                           const ITensorInfo *c,
+                           const ITensorInfo *dst,
+                           const GEMMInfo    &gemm_info = GEMMInfo());
+
+    // Inherited methods overridden:
+    void                             run(ITensorPack &tensors) override;
+    void                             prepare(ITensorPack &tensors) override;
+    experimental::MemoryRequirements workspace() const override;
+
+private:
+    enum AuxTensorIdx
+    {
+        /* Slots 0 - 2 reserved for CpuGemmAssemblyDispatch */
+        VectorSumCol = 3,
+        VectorSumRow,
+        TmpA,
+        TmpB,
+        MMResultS32,
+        SignedA,
+        SignedOutput,
+        Count
+    };
+
+    std::unique_ptr<CpuGemmAssemblyDispatch>                                 _asm_glue;
+    std::unique_ptr<kernels::CpuGemmLowpMatrixMultiplyKernel>                _mm_kernel;
+    std::unique_ptr<kernels::CpuGemmInterleave4x4Kernel>                     _mtx_a_reshape_kernel;
+    std::unique_ptr<kernels::CpuGemmTranspose1xWKernel>                      _mtx_b_reshape_kernel;
+    std::unique_ptr<kernels::CpuGemmLowpMatrixAReductionKernel>              _mtx_a_reduction_kernel;
+    std::unique_ptr<kernels::CpuGemmLowpMatrixBReductionKernel>              _mtx_b_reduction_kernel;
+    std::unique_ptr<kernels::CpuGemmLowpOffsetContributionKernel>            _offset_contribution_kernel;
+    std::unique_ptr<kernels::CpuGemmLowpOffsetContributionOutputStageKernel> _offset_contribution_output_stage_kernel;
+    std::unique_ptr<CpuActivation>                                           _activation_func;
+    std::unique_ptr<kernels::CpuConvertQuantizedSignednessKernel>            _convert_to_signed_asymm;
+    std::unique_ptr<kernels::CpuConvertQuantizedSignednessKernel>            _convert_from_signed_asymm;
+
+    TensorInfo _vector_sum_col;
+    TensorInfo _vector_sum_row;
+    TensorInfo _tmp_a;
+    TensorInfo _tmp_b;
+    TensorInfo _mm_result_s32;
+    TensorInfo _signed_a;
+    TensorInfo _signed_output;
+    int32_t    _a_offset;
+    int32_t    _b_offset;
+
+    bool                             _run_vector_matrix_multiplication;
+    bool                             _assembly_path;
+    bool                             _fused_assembly_path;
+    bool                             _reshape_b_only_on_first_run;
+    bool                             _is_prepared;
+    bool                             _fuse_output_stage;
+    bool                             _run_activation;
+    bool                             _flip_signedness;
+    GEMMInfo                         _gemm_info;
+    experimental::MemoryRequirements _aux_mem{};
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif // ACL_SRC_CPU_OPERATORS_CPUGEMMLOWPMATRIXMULTIPLYCORE_H
diff --git a/src/cpu/operators/CpuGemmLowpOutputStage.cpp b/src/cpu/operators/CpuGemmLowpOutputStage.cpp
new file mode 100644
index 0000000000..4215eed199
--- /dev/null
+++ b/src/cpu/operators/CpuGemmLowpOutputStage.cpp
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/operators/CpuGemmLowpOutputStage.h" + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h" +#include "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h" +#include "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h" +#include "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +void CpuGemmLowpOutputStage::configure(ITensorInfo *src, + ITensorInfo *bias, + ITensorInfo *dst, + const GEMMLowpOutputStageInfo &info) +{ + // Perform validate step + ARM_COMPUTE_ERROR_THROW_ON(CpuGemmLowpOutputStage::validate(src, bias, dst, info)); + ARM_COMPUTE_LOG_PARAMS(src, bias, dst, info); + + switch (info.type) + { + case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT: + { + switch (info.output_data_type) + { + case DataType::QASYMM8: + { + auto k = std::make_unique<kernels::CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel>(); + k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, + info.gemmlowp_min_bound, info.gemmlowp_max_bound); + _kernel = std::move(k); + break; + } + case DataType::QASYMM8_SIGNED: + { + auto k = std::make_unique<kernels::CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel>(); + k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, + info.gemmlowp_min_bound, info.gemmlowp_max_bound); + _kernel = std::move(k); + break; + } + case DataType::QSYMM16: + { + auto k = std::make_unique<kernels::CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel>(); + k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_min_bound, + info.gemmlowp_max_bound); + _kernel = std::move(k); + break; + } + default: + { + ARM_COMPUTE_ERROR("Unsupported output data type."); + break; + } + } + break; + } + case GEMMLowpOutputStageType::QUANTIZE_DOWN: + { + switch (info.output_data_type) + { + case DataType::QASYMM8: + case DataType::QASYMM8_SIGNED: + { + auto k = std::make_unique<kernels::CpuGemmLowpQuantizeDownInt32ScaleKernel>(); + k->configure(src, bias, dst, &info); + _kernel = std::move(k); + break; + } + default: + { + 
ARM_COMPUTE_ERROR("Unsupported output data type."); + break; + } + } + break; + } + default: + ARM_COMPUTE_ERROR("Unsupported GEMMLowpOutputStage type."); + } +} + +Status CpuGemmLowpOutputStage::validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo &info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::UNKNOWN, + "CpuGemmLowpOutputStage cannot be used with UNKNOWN output data type."); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM16); + ARM_COMPUTE_RETURN_ERROR_ON((info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN) && + (info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)); + + switch (info.type) + { + case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT: + { + switch (dst->data_type()) + { + case DataType::QASYMM8: + return kernels::CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate( + src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound); + case DataType::QASYMM8_SIGNED: + return kernels::CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate( + src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound); + case DataType::QSYMM16: + return kernels::CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate( + src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound); + default: + return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported output data type."); + } + } + case GEMMLowpOutputStageType::QUANTIZE_DOWN: + { + switch (dst->data_type()) + { + case DataType::QASYMM8: + case DataType::QASYMM8_SIGNED: + return kernels::CpuGemmLowpQuantizeDownInt32ScaleKernel::validate(src, bias, dst, &info); + default: + return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported output data type."); + } + } + default: + return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported GEMMLowpOutputStage type."); + } +} + +void CpuGemmLowpOutputStage::run(ITensorPack &tensors) +{ + NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuGemmLowpOutputStage.h b/src/cpu/operators/CpuGemmLowpOutputStage.h new file mode 100644 index 0000000000..e5e2f41fa9 --- /dev/null +++ b/src/cpu/operators/CpuGemmLowpOutputStage.h @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_GEMMLOWP_OUTPUT_STAGE_H
+#define ARM_COMPUTE_CPU_GEMMLOWP_OUTPUT_STAGE_H
+
+#include "arm_compute/core/Types.h"
+
+#include "src/cpu/ICpuOperator.h"
+
+/** This file contains all available output stages for GEMMLowp.
+ *
+ * In gemmlowp, the "output stage" is the process that takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyCore),
+ * and processes it to obtain the final ASYMM8 value.
+ *
+ * More information about the GEMMLowp output stage can be found at https://github.com/google/gemmlowp/blob/master/doc/output.md
+ */
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to execute GEMMLowpQuantizeDown kernels.
+ *
+ * This function calls the following kernels:
+ *
+ *  -# @ref kernels::CpuGemmLowpQuantizeDownInt32ScaleKernel
+ *  -# @ref kernels::CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel
+ *  -# @ref kernels::CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel
+ *  -# @ref kernels::CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel
+*/
+class CpuGemmLowpOutputStage : public ICpuOperator
+{
+public:
+    /** Initialise the kernel's inputs and output
+     *
+     * Valid data layouts:
+     * - All
+     *
+     * Valid data type configurations:
+     * |src0           |src1          |dst           |
+     * |:--------------|:-------------|:-------------|
+     * |S32            |S32           |QASYMM8       |
+     * |S32            |S32           |QASYMM8_SIGNED|
+     * |S32            |S32           |QSYMM16       |
+     *
+     * @param[in]  src  Input tensor info. Data type supported: S32
+     * @param[in]  bias Biases tensor info. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+     *                  Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+     * @param[out] dst  Output tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM16
+     * @param[in]  info GEMMLowp output stage metadata.
+     */
+    void configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo &info);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to CpuGemmLowpOutputStage::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo             *src,
+                           const ITensorInfo             *bias,
+                           const ITensorInfo             *dst,
+                           const GEMMLowpOutputStageInfo &info);
+
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_GEMMLOWP_OUTPUT_STAGE_H */
diff --git a/src/cpu/operators/CpuMatMul.cpp b/src/cpu/operators/CpuMatMul.cpp
new file mode 100644
index 0000000000..f68ae9883f
--- /dev/null
+++ b/src/cpu/operators/CpuMatMul.cpp
@@ -0,0 +1,331 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "src/cpu/operators/CpuMatMul.h" + +#include "arm_compute/core/experimental/Types.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/function_info/MatMulInfo.h" +#include "arm_compute/runtime/NEON/functions/NEMatMul.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/core/utils/quantization/AsymmHelpers.h" +#include "src/cpu/utils/CpuAuxTensorHandler.h" + +using namespace arm_compute::experimental; + +namespace arm_compute +{ +namespace cpu +{ +namespace +{ +Status get_gemmlowp_output_stage_info(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const ActivationLayerInfo &act, + GEMMLowpOutputStageInfo &gemmlowp_output_stage_info) +{ + const auto data_type = src->data_type(); + const QuantizationInfo oq_info = dst->quantization_info(); + const UniformQuantizationInfo iq_unif = src->quantization_info().uniform(); + const UniformQuantizationInfo wq_unif = weights->quantization_info().uniform(); + const UniformQuantizationInfo oq_unif = oq_info.uniform(); + + float multiplier = (iq_unif.scale * wq_unif.scale) / oq_unif.scale; + int32_t output_multiplier; + int32_t output_shift; + + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + + int32_t type_min = 0; + int32_t type_max = 0; + std::tie(type_min, type_max) = quantization::get_quantized_asymmetric_output_min_max(oq_info, act, data_type); + + gemmlowp_output_stage_info.gemmlowp_multiplier = output_multiplier; + gemmlowp_output_stage_info.gemmlowp_shift = output_shift; + gemmlowp_output_stage_info.gemmlowp_offset = oq_unif.offset; + gemmlowp_output_stage_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; + gemmlowp_output_stage_info.gemmlowp_min_bound = type_min; + gemmlowp_output_stage_info.gemmlowp_max_bound = type_max; + + return Status{}; +} +} // namespace + +CpuMatMul::CpuMatMul() + : _transpose_kernel_lhs(), + _transpose_kernel_rhs(), + _asm_glue(), + _lhs_transposed(), + _rhs_transposed(), + _original_lhs_shape(), + _original_rhs_shape(), + 
_original_dst_shape()
+{
+}
+
+Status CpuMatMul::validate(const ITensorInfo         *lhs,
+                           const ITensorInfo         *rhs,
+                           const ITensorInfo         *dst,
+                           const MatMulInfo          &info,
+                           const CpuMatMulSettings   &settings,
+                           const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16, DataType::BFLOAT16,
+                                                         DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->are_values_constant(), "LHS Tensor must be dynamic.");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs->are_values_constant(), "RHS Tensor must be dynamic.");
+    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(lhs);
+    ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(lhs);
+
+    const auto adj_lhs = info.adj_lhs();
+    const auto adj_rhs = info.adj_rhs();
+
+    const ITensorInfo *lhs_to_use = lhs;
+    const ITensorInfo *rhs_to_use = rhs;
+    TensorInfo         lhs_transposed{};
+    TensorInfo         rhs_transposed{};
+
+    auto gemm_info            = AsmGemmInfo();
+    gemm_info.activation_info = act_info;
+    gemm_info.fast_mode       = settings.fast_math();
+    gemm_info.fixed_format    = settings.fixed_format();
+
+    // Validate and, if requested, transpose lhs/rhs
+    if (adj_lhs)
+    {
+        auto_init_if_empty(lhs_transposed,
+                           lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*lhs)));
+        ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuTransposeKernel::validate(lhs_to_use, &lhs_transposed));
+        // Assign lhs_to_use pointer to use transposed TensorInfo
+        lhs_to_use = &lhs_transposed;
+    }
+    if (adj_rhs)
+    {
+        auto_init_if_empty(rhs_transposed,
+                           rhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*rhs)));
+        ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuTransposeKernel::validate(rhs_to_use, &rhs_transposed));
+        // Assign rhs_to_use pointer to use transposed TensorInfo
+        rhs_to_use = &rhs_transposed;
+    }
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_to_use->dimension(0) != rhs_to_use->dimension(1),
+                                    "The product AB is defined only if the number of columns in A is equal to the "
+                                    "number of rows in B (after transpose)");
+
+    // Iterate over dimensions to be collapsed in operator - check dimensions are equivalent between tensors
+    for (unsigned int i = 2; i < Coordinates::num_max_dimensions; i++)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_to_use->dimension(i) != rhs_to_use->dimension(i),
+                                        "Broadcasting in Batch dimension is unsupported by this operator.");
+    }
+
+    // Quantized-specific configuration
+    if (is_data_type_quantized(lhs->data_type()))
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(get_gemmlowp_output_stage_info(lhs_to_use, rhs_to_use, dst,
+                                                                   gemm_info.activation_info, gemm_info.output_stage));
+    }
+
+    if (gemm_info.fixed_format)
+    {
+        gemm_info.weight_format                          = WeightFormat::ANY;
+        arm_compute::WeightFormat expected_weight_format = WeightFormat::ANY;
+        ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuGemmAssemblyDispatch::has_opt_impl(expected_weight_format, lhs_to_use,
+                                                                               rhs_to_use, nullptr, dst, gemm_info));
+    }
+
+    ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuGemmAssemblyDispatch::validate(lhs_to_use, rhs_to_use, nullptr, dst, gemm_info));
+
+    return Status{};
+}
+
+void CpuMatMul::configure(ITensorInfo               *lhs,
+                          ITensorInfo               *rhs,
+                          ITensorInfo               *dst,
+                          const MatMulInfo          &info,
+                          const CpuMatMulSettings   &settings,
+                          const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
+    ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst, info, settings);
+    ARM_COMPUTE_ERROR_THROW_ON(CpuMatMul::validate(lhs, rhs, dst, info, settings));
+
+    _adj_lhs   = info.adj_lhs();
+    _adj_rhs   =
info.adj_rhs(); + _fast_math = settings.fast_math(); + + // 1. Create and reshape tensors + // ------------------------------------------------------ + // a. Clone TensorInfo to prevent changing original tensor values during setup + // b. Change shape of lhs/dst to [x, y, 1, collapsed(z)] to match assembly kernel configuration + // c. For rhs collapse all dimensions larger than 3 to z dimension + TensorInfo lhs_to_use = *lhs->clone(); + TensorInfo dst_to_use = *dst->clone(); + TensorInfo rhs_to_use = *rhs->clone(); + + // Save starting shape of tensors + _original_lhs_shape = lhs_to_use.tensor_shape(); + _original_dst_shape = dst_to_use.tensor_shape(); + _original_rhs_shape = rhs_to_use.tensor_shape(); + + // Reshape lhs for use with assembly kernels. + lhs_to_use.set_tensor_shape( + TensorShape(_original_lhs_shape.x(), _original_lhs_shape.y(), 1, _original_lhs_shape.collapsed_from(2).z())); + dst_to_use.set_tensor_shape( + TensorShape(_original_dst_shape.x(), _original_dst_shape.y(), 1, _original_dst_shape.collapsed_from(2).z())); + rhs_to_use.set_tensor_shape(_original_rhs_shape.collapsed_from(2)); + + // 2. Configuration for transpose of lhs/rhs + // ------------------------------------------------------ + // Initialise transposed TensorInfo class for aux tensors (intermediary tensors) + if (_adj_lhs) + { + // Setup transpose LHS + _transpose_kernel_lhs = std::make_unique<cpu::kernels::CpuTransposeKernel>(); + _transpose_kernel_lhs->configure(&lhs_to_use, &_lhs_transposed); + } + + if (_adj_rhs) + { + // Setup transpose RHS + _transpose_kernel_rhs = std::make_unique<cpu::kernels::CpuTransposeKernel>(); + _transpose_kernel_rhs->configure(&rhs_to_use, &_rhs_transposed); + } + + // 3. Configure assembly kernel using transposed tensors. + // ----------------------------------------------------- + // Use transposed tensors if the corresponding transpose flags are set + // Fill AsmGemmInfo class object before configuration + _gemm_info.activation_info = act_info; + _gemm_info.fast_mode = settings.fast_math(); + _gemm_info.fixed_format = settings.fixed_format(); + _gemm_info.negated_offsets = false; + + lhs_to_use = (_adj_lhs) ? _lhs_transposed : lhs_to_use; + rhs_to_use = (_adj_rhs) ? 
_rhs_transposed : rhs_to_use;
+
+    // Quantized-specific configuration
+    if (is_data_type_quantized(lhs->data_type()))
+    {
+        get_gemmlowp_output_stage_info(&lhs_to_use, &rhs_to_use, &dst_to_use, _gemm_info.activation_info,
+                                       _gemm_info.output_stage);
+    }
+
+    if (_gemm_info.fixed_format)
+    {
+        _gemm_info.weight_format                         = WeightFormat::ANY;
+        arm_compute::WeightFormat expected_weight_format = WeightFormat::ANY;
+        ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuGemmAssemblyDispatch::has_opt_impl(expected_weight_format, &lhs_to_use,
+                                                                              &rhs_to_use, nullptr, dst, _gemm_info));
+        // Set gemm weights info to the one returned by has_opt_impl
+        _gemm_info.weight_format = expected_weight_format;
+        // has_opt_impl may return a non-fast-math kernel, even if we requested one
+        _gemm_info.fast_mode = arm_compute::is_fixed_format_fast_math(expected_weight_format);
+    }
+
+    // Configure Asm Kernel
+    _asm_glue = std::make_unique<cpu::CpuGemmAssemblyDispatch>();
+    _asm_glue->configure(&lhs_to_use, &rhs_to_use, nullptr, &dst_to_use,
+                         _gemm_info); // c is nullptr as bias not supported in MatMul
+
+    // Specify memory requirements for intermediate tensors
+    auto asm_mem_req = _asm_glue->workspace();
+    // Specify memory required by gemm kernel
+    int idx = 0;
+    for (const auto &aux : asm_mem_req)
+    {
+        _aux_mem[idx] = aux;
+        idx++;
+    }
+    // Memory requirements for transposed tensors
+    _aux_mem[TransposeLHS] = MemoryInfo(offset_int_vec(TransposeLHS), MemoryLifetime::Temporary, lhs->total_size());
+    _aux_mem[TransposeRHS] = MemoryInfo(offset_int_vec(TransposeRHS), MemoryLifetime::Temporary, rhs->total_size());
+}
+
+void CpuMatMul::run(ITensorPack &tensors)
+{
+    // Retrieve tensors from tensor pack
+    auto lhs = tensors.get_tensor(ACL_SRC_0);
+    auto rhs = tensors.get_const_tensor(ACL_SRC_1);
+    auto dst = tensors.get_tensor(ACL_DST);
+
+    // Reshape LHS and DST to ensure compatibility with GEMM asm kernel (Batch dimension is 4th for lhs and dst within asm)
+    // Collapse RHS (necessary to support dimensions larger than 3 in gemm assembly)
+    lhs->info()->set_tensor_shape(
+        TensorShape(_original_lhs_shape.x(), _original_lhs_shape.y(), 1,
+                    _original_lhs_shape.collapsed_from(2).z())); // Collapsed 3+ dimensions into z
+    dst->info()->set_tensor_shape(
+        TensorShape(_original_dst_shape.x(), _original_dst_shape.y(), 1,
+                    _original_dst_shape.collapsed_from(2).z())); // Collapsed 3+ dimensions into z
+    rhs->info()->set_tensor_shape(_original_rhs_shape.collapsed_from(2));
+
+    // Initialise object to handle stored transposed tensors in auxiliary memory
+    CpuAuxTensorHandler lhs_transposed(offset_int_vec(TransposeLHS), _lhs_transposed, tensors, true);
+    CpuAuxTensorHandler rhs_transposed(offset_int_vec(TransposeRHS), _rhs_transposed, tensors, true);
+
+    // Create tensor pack for asm kernel
+    ITensorPack asm_tensors(tensors);
+
+    // Run transpose lhs if necessary
+    if (_adj_lhs)
+    {
+        ITensorPack lhs_transpose_pack = {{TensorType::ACL_SRC, lhs}, {TensorType::ACL_DST, lhs_transposed.get()}};
+        NEScheduler::get().schedule_op(_transpose_kernel_lhs.get(), Window::DimY, _transpose_kernel_lhs->window(),
+                                       lhs_transpose_pack);
+        asm_tensors.add_const_tensor(TensorType::ACL_SRC_0, lhs_transposed.get());
+    }
+    // Run transpose rhs if necessary
+    if (_adj_rhs)
+    {
+        ITensorPack rhs_transpose_pack = {{TensorType::ACL_SRC, rhs}, {TensorType::ACL_DST, rhs_transposed.get()}};
+        NEScheduler::get().schedule_op(_transpose_kernel_rhs.get(), Window::DimY, _transpose_kernel_rhs->window(),
+                                       rhs_transpose_pack);
+        asm_tensors.add_const_tensor(TensorType::ACL_SRC_1,
rhs_transposed.get());
+ }
+ // Run asm kernel
+ _asm_glue->run(asm_tensors);
+
+ // Undo reshape of tensors
+ dst->info()->set_tensor_shape(_original_dst_shape);
+ lhs->info()->set_tensor_shape(_original_lhs_shape);
+ rhs->info()->set_tensor_shape(_original_rhs_shape);
+}
+
+experimental::MemoryRequirements CpuMatMul::workspace() const
+{
+ return _aux_mem;
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuMatMul.h b/src/cpu/operators/CpuMatMul.h
new file mode 100644
index 0000000000..2b1b4cf0ff
--- /dev/null
+++ b/src/cpu/operators/CpuMatMul.h
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_OPERATORS_CPUMATMUL_H
+#define ACL_SRC_CPU_OPERATORS_CPUMATMUL_H
+
+#include "arm_compute/core/TensorInfo.h"
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuOperator.h"
+#include "src/cpu/kernels/CpuTransposeKernel.h"
+#include "src/cpu/operators/internal/CpuGemmAssemblyDispatch.h"
+
+namespace arm_compute
+{
+// Forward Declarations
+class MatMulInfo;
+class CpuMatMulSettings;
+
+namespace cpu
+{
+/** Function to execute the MatMul operation. This function calls the following functions/kernels:
+ *
+ * If the adjoint/adj flag is enabled for either input lhs or rhs (or both):
+ * -# @ref cpu::kernels::CpuTransposeKernel
+ * Then:
+ * -# @ref cpu::CpuGemmAssemblyDispatch
+ */
+class CpuMatMul : public ICpuOperator
+{
+public:
+ /** Constructor */
+ CpuMatMul();
+ /** Destructor */
+ ~CpuMatMul() = default;
+
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuMatMul);
+ /** Configure operator for a given list of arguments
+ *
+ * @note Check the documentation of @ref NEMatMul for a list of supported data types and layouts
+ *
+ * @param[in] lhs Left-hand side tensor info.
+ * @param[in] rhs Right-hand side tensor info.
+ * @param[out] dst Output tensor to store the result of the batched matrix multiplication. Data types supported: same as @p lhs / @p rhs.
+ * @param[in] info Contains MatMul operation information described in @ref MatMulInfo.
+ * @param[in] settings The settings for the matmul operation (e.g. fast math)
+ * @param[in] act_info Class containing information about fused activation function.
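+ *
+ * A minimal configuration sketch (illustrative only: the shapes, the
+ * default-constructed MatMulInfo / CpuMatMulSettings and the surrounding
+ * memory management are assumptions, not prescribed by this patch):
+ * @code
+ * TensorInfo lhs_info(TensorShape(8U, 4U), 1, DataType::F32);  // [K, M]
+ * TensorInfo rhs_info(TensorShape(16U, 8U), 1, DataType::F32); // [N, K]
+ * TensorInfo dst_info(TensorShape(16U, 4U), 1, DataType::F32); // [N, M]
+ *
+ * cpu::CpuMatMul matmul;
+ * if (bool(cpu::CpuMatMul::validate(&lhs_info, &rhs_info, &dst_info, MatMulInfo(), CpuMatMulSettings())))
+ * {
+ *     matmul.configure(&lhs_info, &rhs_info, &dst_info, MatMulInfo(), CpuMatMulSettings());
+ * }
+ * @endcode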
+ */
+ void configure(ITensorInfo *lhs,
+ ITensorInfo *rhs,
+ ITensorInfo *dst,
+ const MatMulInfo &info,
+ const CpuMatMulSettings &settings,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuMatMul::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *dst,
+ const MatMulInfo &info,
+ const CpuMatMulSettings &settings,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+ experimental::MemoryRequirements workspace() const override;
+
+private:
+ enum InternalTensorIdx
+ {
+ /* Slots 0 - 2 reserved for CpuGemmAssemblyDispatch */
+ TransposeLHS = 3,
+ TransposeRHS,
+ Count
+ };
+
+ // Unique pointers to the kernels/operators used by matmul
+ std::unique_ptr<kernels::CpuTransposeKernel> _transpose_kernel_lhs{nullptr};
+ std::unique_ptr<kernels::CpuTransposeKernel> _transpose_kernel_rhs{nullptr};
+ std::unique_ptr<CpuGemmAssemblyDispatch> _asm_glue{nullptr};
+
+ // TensorInfo for tensors stored in auxiliary memory
+ TensorInfo _lhs_transposed{};
+ TensorInfo _rhs_transposed{};
+
+ // Original tensor shapes prior to reshaping tensors and collapsing dimensions
+ TensorShape _original_lhs_shape{};
+ TensorShape _original_rhs_shape{};
+ TensorShape _original_dst_shape{};
+
+ // Note: adj_lhs is equivalent to transposing lhs
+ bool _adj_lhs{false};
+ bool _adj_rhs{false};
+ bool _fast_math{false};
+ AsmGemmInfo _gemm_info{};
+ experimental::MemoryRequirements _aux_mem{Count};
+};
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // ACL_SRC_CPU_OPERATORS_CPUMATMUL_H
diff --git a/src/cpu/operators/CpuMaxUnpooling.cpp b/src/cpu/operators/CpuMaxUnpooling.cpp
new file mode 100644
index 0000000000..697fc40ab3
--- /dev/null
+++ b/src/cpu/operators/CpuMaxUnpooling.cpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2018-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuMaxUnpooling.h"
+
+#include "src/common/utils/Log.h"
+#include "src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void CpuMaxUnpooling::configure(const ITensorInfo *src,
+ const ITensorInfo *indices,
+ ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info)
+{
+ ARM_COMPUTE_LOG_PARAMS(src, indices, dst, pool_info);
+ auto k = std::make_unique<kernels::CpuMaxUnpoolingLayerKernel>();
+ k->configure(src, indices, dst, pool_info);
+ _kernel = std::move(k);
+}
+
+Status CpuMaxUnpooling::validate(const ITensorInfo *src,
+ const ITensorInfo *indices,
+ const ITensorInfo *dst,
+ const PoolingLayerInfo &pool_info)
+{
+ return kernels::CpuMaxUnpoolingLayerKernel::validate(src, indices, dst, pool_info);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuMaxUnpooling.h b/src/cpu/operators/CpuMaxUnpooling.h
new file mode 100644
index 0000000000..5dc00bce9e
--- /dev/null
+++ b/src/cpu/operators/CpuMaxUnpooling.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_MAXUNPOOLING_H
+#define ARM_COMPUTE_CPU_MAXUNPOOLING_H
+
+#include "src/cpu/ICpuOperator.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to run @ref kernels::CpuMaxUnpoolingLayerKernel */
+class CpuMaxUnpooling : public ICpuOperator
+{
+public:
+ /** Configure operator for a given list of arguments
+ *
+ * @param[in] src Source tensor to unpool. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] indices Tensor containing the offset to store the src elements in the dst tensor.
+ * A max pooling function configured to output indices (e.g. @ref CpuPool2d) should precede this function in order to
+ * properly reconstruct the output tensor.
+ * The tensor shape of this tensor has to be equal to the src tensor shape. Data type supported: U32.
+ * @param[out] dst Destination tensor. Data types supported: Same as @p src
+ * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
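+ *
+ * A usage sketch (illustrative only: the NHWC shapes and pooling parameters
+ * below are assumptions; the indices must come from a max pooling function
+ * run with the same pool_info):
+ * @code
+ * PoolingLayerInfo pool_info(PoolingType::MAX, Size2D(2, 2), DataLayout::NHWC, PadStrideInfo(2, 2, 0, 0));
+ * TensorInfo pooled_info(TensorShape(3U, 2U, 2U), 1, DataType::F32, DataLayout::NHWC);
+ * TensorInfo indices_info(TensorShape(3U, 2U, 2U), 1, DataType::U32, DataLayout::NHWC);
+ * TensorInfo unpooled_info(TensorShape(3U, 4U, 4U), 1, DataType::F32, DataLayout::NHWC);
+ *
+ * cpu::CpuMaxUnpooling unpool;
+ * unpool.configure(&pooled_info, &indices_info, &unpooled_info, pool_info);
+ * @endcode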
+ */ + void + configure(const ITensorInfo *src, const ITensorInfo *indices, ITensorInfo *dst, const PoolingLayerInfo &pool_info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuMaxUnpooling::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, + const ITensorInfo *indices, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info); +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_MAXUNPOOLING_H */ diff --git a/src/cpu/operators/CpuMul.cpp b/src/cpu/operators/CpuMul.cpp new file mode 100644 index 0000000000..ac9847111d --- /dev/null +++ b/src/cpu/operators/CpuMul.cpp @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2016-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/operators/CpuMul.h" + +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuMulKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +Status CpuMul::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); + return kernels::CpuMulKernel::validate(src1, src2, dst, scale, overflow_policy, rounding_policy); +} + +void CpuMul::configure(ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_UNUSED(act_info); + ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info); + + auto k = std::make_unique<kernels::CpuMulKernel>(); + k->configure(src1, src2, dst, scale, overflow_policy, rounding_policy); + _kernel = std::move(k); +} + +void CpuMul::run(ITensorPack &tensors) +{ + ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); + auto split_dimension = static_cast<kernels::CpuMulKernel *>(_kernel.get())->get_split_dimension_hint(); + NEScheduler::get().schedule_op(_kernel.get(), split_dimension, _kernel->window(), tensors); +} + +Status CpuComplexMul::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); + return kernels::CpuComplexMulKernel::validate(src1, src2, dst); +} + +void CpuComplexMul::configure(ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_UNUSED(act_info); + ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info); + + auto k = std::make_unique<kernels::CpuComplexMulKernel>(); + k->configure(src1, src2, dst); + _kernel = std::move(k); +} + +void CpuComplexMul::run(ITensorPack &tensors) +{ + ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); + NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuMul.h b/src/cpu/operators/CpuMul.h new file mode 100644 index 0000000000..82b309830b --- /dev/null +++ b/src/cpu/operators/CpuMul.h @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2016-2021, 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_MUL_H
+#define ARM_COMPUTE_CPU_MUL_H
+
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/cpu/ICpuOperator.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to run @ref kernels::CpuMulKernel */
+class CpuMul : public ICpuOperator
+{
+public:
+ /** Initialise the kernel's inputs, dst and conversion policy.
+ *
+ * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
+ * For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
+ *
+ * @param[in, out] src1 First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
+ * This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+ * @param[in, out] src2 Second input tensor info. Data types supported: U8, QASYMM8 (only if @p src1 is QASYMM8), QASYMM8_SIGNED (only if @p src1 is QASYMM8_SIGNED), S16, S32, QSYMM16 (only if @p src1 is QSYMM16), F16 (only if @p src1 is F16), F32 (only if @p src1 is F32).
+ * This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+ * @param[out] dst Destination tensor info. Data types supported:
+ * - U8, only if both inputs are U8.
+ * - QASYMM8, only if both inputs are QASYMM8.
+ * - QASYMM8_SIGNED, only if @p src1 is QASYMM8_SIGNED.
+ * - S16.
+ * - QSYMM16, only if both inputs are QSYMM16.
+ * - S32, only if both inputs are S32 or both are QSYMM16.
+ * - F16, only if @p src1 is F16.
+ * - F32, only if both inputs are F32.
+ * @param[in] scale Scale to apply after multiplication.
+ * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
+ * If @p src1, @p src2 and @p dst are all of data type S32, scale cannot be 1/255.
+ * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if any of the inputs is of quantized data type.
+ * @param[in] rounding_policy Rounding policy.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
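+ *
+ * A minimal configuration sketch (illustrative only: the shape, scale and
+ * policies below are assumptions):
+ * @code
+ * TensorInfo src1_info(TensorShape(16U), 1, DataType::F32);
+ * TensorInfo src2_info(TensorShape(16U), 1, DataType::F32);
+ * TensorInfo dst_info(TensorShape(16U), 1, DataType::F32);
+ *
+ * cpu::CpuMul mul;
+ * mul.configure(&src1_info, &src2_info, &dst_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+ * @endcode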
+ */
+ void configure(ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuMul::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+};
+
+/** Basic function to run @ref kernels::CpuComplexMulKernel */
+class CpuComplexMul : public ICpuOperator
+{
+public:
+ /** Initialise the kernel's inputs and dst.
+ *
+ * @param[in, out] src1 First input tensor. Data types supported: F32. Number of channels supported: 2 (complex tensor).
+ * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+ * @param[in, out] src2 Second input tensor. Data types supported: same as @p src1. Number of channels supported: same as @p src1.
+ * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+ * @param[out] dst The dst tensor. Data types supported: same as @p src1. Number of channels: same as @p src1.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
+ */
+ void configure(ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuComplexMul::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_MUL_H */
diff --git a/src/cpu/operators/CpuPRelu.h b/src/cpu/operators/CpuPRelu.h
new file mode 100644
index 0000000000..084474e2ba
--- /dev/null
+++ b/src/cpu/operators/CpuPRelu.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_PRELU_H
+#define ARM_COMPUTE_CPU_PRELU_H
+
+#include "src/cpu/operators/CpuElementwise.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Alias to run @ref cpu::kernels::CpuArithmeticKernel for the PRelu operation */
+using CpuPRelu = CpuElementwiseArithmetic<ArithmeticOperation::PRELU>;
+} // namespace cpu
+} // namespace arm_compute
+
+#endif /* ARM_COMPUTE_CPU_PRELU_H */
\ No newline at end of file diff --git a/src/cpu/operators/CpuPermute.cpp b/src/cpu/operators/CpuPermute.cpp new file mode 100644 index 0000000000..25acc92d00 --- /dev/null +++ b/src/cpu/operators/CpuPermute.cpp @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/operators/CpuPermute.h" + +#include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuPermuteKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +void CpuPermute::configure(const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm) +{ + ARM_COMPUTE_LOG_PARAMS(src, dst, perm); + auto k = std::make_unique<kernels::CpuPermuteKernel>(); + k->configure(src, dst, perm); + _kernel = std::move(k); +} + +Status CpuPermute::validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm) +{ + return kernels::CpuPermuteKernel::validate(src, dst, perm); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuPermute.h b/src/cpu/operators/CpuPermute.h new file mode 100644 index 0000000000..0e0f3ae8db --- /dev/null +++ b/src/cpu/operators/CpuPermute.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#ifndef ARM_COMPUTE_CPU_PERMUTE_H
+#define ARM_COMPUTE_CPU_PERMUTE_H
+
+#include "src/cpu/ICpuOperator.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to run @ref kernels::CpuPermuteKernel */
+class CpuPermute : public ICpuOperator
+{
+public:
+ /** Configure operator for a given list of arguments
+ *
+ * @note Arbitrary permutation vectors are supported with rank not greater than 4
+ *
+ * @param[in] src Source tensor to permute. Data types supported: All
+ * @param[out] dst Destination tensor. Data types supported: Same as @p src
+ * @param[in] perm Permutation vector
+ */
+ void configure(const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuPermute::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm);
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_PERMUTE_H */
diff --git a/src/cpu/operators/CpuPool2d.cpp b/src/cpu/operators/CpuPool2d.cpp
new file mode 100644
index 0000000000..b72bde6978
--- /dev/null
+++ b/src/cpu/operators/CpuPool2d.cpp
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuPool2d.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/cpu/kernels/CpuPool2dKernel.h"
+#include "src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h"
+
+using namespace arm_compute::experimental;
+
+namespace arm_compute
+{
+namespace cpu
+{
+CpuPool2d::CpuPool2d()
+ : _pooling_layer_kernel(),
+ _asm_glue(),
+ _is_global_pooling_layer(false),
+ _use_kernel_indices(false),
+ _data_layout(DataLayout::NCHW),
+ _aux_mem(1)
+{
+}
+
+CpuPool2d::~CpuPool2d() = default;
+
+void CpuPool2d::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices)
+{
+ ARM_COMPUTE_LOG_PARAMS(src, dst, pool_info, indices);
+
+ // Check if we can run assembly kernels.
Currently, indices are not supported by those kernels + const bool run_optimised = + bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr); + + // Get data layout + _data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; + + // Check if we have Global Pooling Layer + const unsigned int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); + const unsigned int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); + _is_global_pooling_layer = (src->dimension(idx_width) == pool_info.pool_size.width) && + (src->dimension(idx_height) == pool_info.pool_size.height); + _use_kernel_indices = pool_info.use_kernel_indices; + + if (run_optimised) + { + const CPUInfo &ci = NEScheduler::get().cpu_info(); + const unsigned int num_threads = NEScheduler::get().num_threads(); + + auto pooling_wrapper = std::make_unique<kernels::CpuPool2dAssemblyWrapperKernel>(); + ARM_COMPUTE_ERROR_ON(pooling_wrapper == nullptr); + pooling_wrapper->configure(src, dst, pool_info, ci); + + // Get kernel's memory requirements + constexpr size_t alignment = 4096; + const size_t workspace_size = pooling_wrapper->get_working_size(num_threads); + _aux_mem[0] = MemoryInfo(TensorType::ACL_INT_0, MemoryLifetime::Temporary, workspace_size, alignment); + + _asm_glue = std::move(pooling_wrapper); + } + else + { + // Configure pooling kernel + auto k = std::make_unique<kernels::CpuPool2dKernel>(); + k->configure(src, dst, pool_info, indices); + _pooling_layer_kernel = std::move(k); + } +} + +Status CpuPool2d::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices) +{ + const bool run_optimised = + bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr); + + if (run_optimised) + { + return Status{}; + } + + return kernels::CpuPool2dKernel::validate(src, dst, pool_info, indices); +} + +void CpuPool2d::run(ITensorPack &tensors) +{ + ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No tensors provided"); + + if (_asm_glue) + { + const auto hints = (_is_global_pooling_layer) ? Window::DimX : Window::DimY; + NEScheduler::get().schedule_op(_asm_glue.get(), hints, _asm_glue->window(), tensors); + } + else + { + switch (_data_layout) + { + case DataLayout::NCHW: + NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), + _is_global_pooling_layer ? Window::DimZ : Window::DimY, + _pooling_layer_kernel->window(), tensors); + break; + case DataLayout::NHWC: + NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), + (_use_kernel_indices ? Window::DimY : Window::DimX), + _pooling_layer_kernel->window(), tensors); + break; + default: + ARM_COMPUTE_ERROR("Data layout not supported"); + } + } +} + +experimental::MemoryRequirements CpuPool2d::workspace() const +{ + return _aux_mem; +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuPool2d.h b/src/cpu/operators/CpuPool2d.h new file mode 100644 index 0000000000..ea73e3f335 --- /dev/null +++ b/src/cpu/operators/CpuPool2d.h @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2021, 2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_POOL2D_H +#define ARM_COMPUTE_CPU_POOL2D_H + +#include "arm_compute/core/experimental/Types.h" + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuOperator.h" + +#include <memory> + +namespace arm_compute +{ +// Forward Declarations +struct PoolingLayerInfo; + +namespace cpu +{ +/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following kernels: + * + * -# @ref NEFillBorderKernel (executed if padding size is different from zero) + * -# @ref kernels::CpuPool2dKernel + * -# @ref kernels::CpuPool2dAssemblyWrapperKernel + */ +class CpuPool2d : public ICpuOperator +{ +public: + CpuPool2d(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPool2d); + ~CpuPool2d(); + /** Set the src and dst tensors. + * + * @note F16 is supported for pool sizes 2 and 3 only + * + * @param[in, out] src Source tensor info. (Written to only when padding != 0) Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[out] dst Destination tensor info. Data types supported: same as @p src. + * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. + * @param[out] indices (optional) The indices of the maximal values. Data type supported: U32. + */ + void + configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuPool2d::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices = nullptr); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + experimental::MemoryRequirements workspace() const override; + +private: + std::unique_ptr<INEKernel> _pooling_layer_kernel; + std::unique_ptr<INEKernel> _asm_glue; + + bool _is_global_pooling_layer; + bool _use_kernel_indices; + DataLayout _data_layout; + experimental::MemoryRequirements _aux_mem{}; +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_POOL2D_H */ diff --git a/src/cpu/operators/CpuPool3d.cpp b/src/cpu/operators/CpuPool3d.cpp new file mode 100644 index 0000000000..7fa78c1f80 --- /dev/null +++ b/src/cpu/operators/CpuPool3d.cpp @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/operators/CpuPool3d.h" + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/runtime/Scheduler.h" + +#include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuPool3dKernel.h" + +using namespace arm_compute::experimental; + +namespace arm_compute +{ +namespace cpu +{ +CpuPool3d::CpuPool3d() : _aux_mem(1) +{ +} + +CpuPool3d::~CpuPool3d() = default; + +void CpuPool3d::configure(const ITensorInfo *src, ITensorInfo *dst, const Pooling3dLayerInfo &pool_info) +{ + ARM_COMPUTE_LOG_PARAMS(src, dst, pool_info); + + // Configure pooling kernel + auto k = std::make_unique<kernels::CpuPool3dKernel>(); + k->configure(src, dst, pool_info); + _kernel = std::move(k); +} + +Status CpuPool3d::validate(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &pool_info) +{ + return kernels::CpuPool3dKernel::validate(src, dst, pool_info); +} + +void CpuPool3d::run(ITensorPack &tensors) +{ + ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No tensors provided"); + + Scheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors); +} + +experimental::MemoryRequirements CpuPool3d::workspace() const +{ + return _aux_mem; +} + +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuPool3d.h b/src/cpu/operators/CpuPool3d.h new file mode 100644 index 0000000000..235d798095 --- /dev/null +++ b/src/cpu/operators/CpuPool3d.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_POOL3D_H +#define ARM_COMPUTE_CPU_POOL3D_H + +#include "arm_compute/core/experimental/Types.h" + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuOperator.h" + +#include <memory> + +namespace arm_compute +{ +namespace cpu +{ +/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following kernels: + * + * -# @ref kernels::CpuPool3dKernel + */ +class CpuPool3d : public ICpuOperator +{ +public: + CpuPool3d(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPool3d); + ~CpuPool3d(); + /** Set the src and dst tensors. + * + * + * @param[in] src Source tensor info. Data types supported: F16/F32/QASYMM8/QASYMM8_SIGNED. + * @param[out] dst Destination tensor info. Data types supported: same as @p src. + * @param[in] pool_info Contains pooling operation information described in @ref Pooling3dLayerInfo. + */ + void configure(const ITensorInfo *src, ITensorInfo *dst, const Pooling3dLayerInfo &pool_info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to CpuPool3d::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &pool_info); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + experimental::MemoryRequirements workspace() const override; + +private: + experimental::MemoryRequirements _aux_mem{}; +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_POOL3D_H */ diff --git a/src/cpu/operators/CpuQuantize.cpp b/src/cpu/operators/CpuQuantize.cpp new file mode 100644 index 0000000000..4a3f1827c7 --- /dev/null +++ b/src/cpu/operators/CpuQuantize.cpp @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2021, 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#include "src/cpu/operators/CpuQuantize.h"
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/cpu/kernels/CpuQuantizeKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+Status CpuQuantize::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuQuantizeKernel::validate(src, dst));
+ return Status{};
+}
+
+void CpuQuantize::configure(const ITensorInfo *src, ITensorInfo *dst)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_LOG_PARAMS(src, dst);
+
+ // Configure quantize kernel
+ auto k = std::make_unique<kernels::CpuQuantizeKernel>();
+ k->configure(src, dst);
+ _kernel = std::move(k);
+}
+
+void CpuQuantize::run(ITensorPack &tensors)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+ auto split_dimension = static_cast<kernels::CpuQuantizeKernel *>(_kernel.get())->get_split_dimension_hint();
+ NEScheduler::get().schedule_op(_kernel.get(), split_dimension, _kernel->window(), tensors);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuQuantize.h b/src/cpu/operators/CpuQuantize.h
new file mode 100644
index 0000000000..ec1134fee4
--- /dev/null
+++ b/src/cpu/operators/CpuQuantize.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_QUANTIZE_H
+#define ARM_COMPUTE_CPU_QUANTIZE_H
+
+#include "src/cpu/ICpuOperator.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to run @ref kernels::CpuQuantizeKernel that quantizes an input tensor */
+class CpuQuantize : public ICpuOperator
+{
+public:
+ /** Set the input and output tensors.
+ *
+ * @param[in] src Source tensor info. The dimensions over the third will be interpreted as batches. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16.
+ * @param[out] dst Destination tensor info with the same dimensions as the input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16
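+ *
+ * A minimal configuration sketch (illustrative only: the shape and the
+ * quantization parameters below are assumptions):
+ * @code
+ * TensorInfo src_info(TensorShape(32U), 1, DataType::F32);
+ * TensorInfo dst_info(TensorShape(32U), 1, DataType::QASYMM8, QuantizationInfo(1.f / 128.f, 10));
+ *
+ * cpu::CpuQuantize quantize;
+ * quantize.configure(&src_info, &dst_info);
+ * @endcode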
+ */
+ void configure(const ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref CpuQuantize::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_QUANTIZE_H */
diff --git a/src/cpu/operators/CpuReshape.cpp b/src/cpu/operators/CpuReshape.cpp
new file mode 100644
index 0000000000..a423abb49a
--- /dev/null
+++ b/src/cpu/operators/CpuReshape.cpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/operators/CpuReshape.h"
+
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/cpu/kernels/CpuReshapeKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void CpuReshape::configure(const ITensorInfo *src, ITensorInfo *dst)
+{
+ ARM_COMPUTE_LOG_PARAMS(src, dst);
+ auto k = std::make_unique<kernels::CpuReshapeKernel>();
+ k->configure(src, dst);
+ _kernel = std::move(k);
+}
+
+Status CpuReshape::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ return kernels::CpuReshapeKernel::validate(src, dst);
+}
+
+void CpuReshape::run(ITensorPack &tensors)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+ if (!_is_prepared)
+ {
+ static_cast<kernels::CpuReshapeKernel *>(_kernel.get())->prepare(tensors);
+ _is_prepared = true;
+ }
+ const auto split_dimension = static_cast<kernels::CpuReshapeKernel *>(_kernel.get())->get_split_dimension();
+ NEScheduler::get().schedule_op(_kernel.get(), split_dimension, _kernel->window(), tensors);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuReshape.h b/src/cpu/operators/CpuReshape.h
new file mode 100644
index 0000000000..33da792319
--- /dev/null
+++ b/src/cpu/operators/CpuReshape.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_RESHAPE_H +#define ARM_COMPUTE_CPU_RESHAPE_H + +#include "arm_compute/core/Window.h" + +#include "src/cpu/ICpuOperator.h" + +namespace arm_compute +{ +namespace cpu +{ +/** Basic function to run @ref kernels::CpuReshapeKernel */ +class CpuReshape : public ICpuOperator +{ +public: + /** Configure operator for a given list of arguments + * + * @param[in] src Source tensor info. Data type supported: All + * @param[out] dst Destination info. Data type supported: Same as @p src + */ + void configure(const ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuReshape::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + +private: + bool _is_prepared{false}; +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_RESHAPE_H */ diff --git a/src/cpu/operators/CpuScale.cpp b/src/cpu/operators/CpuScale.cpp new file mode 100644 index 0000000000..7df9296931 --- /dev/null +++ b/src/cpu/operators/CpuScale.cpp @@ -0,0 +1,289 @@ +/* + * Copyright (c) 2021-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/operators/CpuScale.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/core/utils/ScaleUtils.h" +#include "src/cpu/kernels/CpuScaleKernel.h" +#include "support/Rounding.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace +{ +void precompute_dx_dy_offsets( + ITensor *dx, ITensor *dy, ITensor *offsets, float wr, float hr, SamplingPolicy sampling_policy, bool align_corners) +{ + ARM_COMPUTE_ERROR_ON(offsets == nullptr); + float sampling_offset = 0.0f; + if (sampling_policy == SamplingPolicy::CENTER) + { + sampling_offset = 0.5f; + } + + Window win; + win.set(Window::DimX, Window::Dimension(0, offsets->info()->dimension(0), 1)); + win.set(Window::DimY, Window::Dimension(0, offsets->info()->dimension(1), 1)); + + if (dx != nullptr && dy != nullptr) + { + // Pre-compute the offset and pixel's distance for BILINEAR interpolation + Iterator offsets_it(offsets, win); + Iterator dx_it(dx, win); + Iterator dy_it(dy, win); + + execute_window_loop( + win, + [&](const Coordinates &id) + { + const float in_x = (id.x() + sampling_offset) * wr - sampling_offset; + const float in_y = (id.y() + sampling_offset) * hr - sampling_offset; + const int in_xi = std::floor(in_x); + const int in_yi = std::floor(in_y); + + *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi; + *reinterpret_cast<float *>(dx_it.ptr()) = in_x - in_xi; + *reinterpret_cast<float *>(dy_it.ptr()) = in_y - in_yi; + }, + offsets_it, dx_it, dy_it); + } + else + { + // Pre-compute the offset for NEAREST interpolation + Iterator offsets_it(offsets, win); + + execute_window_loop( + win, + [&](const Coordinates &id) + { + const float float_in_xi = (id.x() + sampling_offset) * wr; + const auto in_xi = static_cast<size_t>( + align_corners ? arm_compute::utils::rounding::round_half_away_from_zero(float_in_xi) + : std::floor(float_in_xi)); + *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi; + }, + offsets_it); + } +} +} // namespace + +void CpuScale::configure(ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(CpuScale::validate(src, dst, info)); + ARM_COMPUTE_LOG_PARAMS(src, dst, info); + + _scale_info = info; + _is_prepared = false; + + // Get data layout and width/height indices + _data_layout = _scale_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : _scale_info.data_layout; + const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); + + // Compute the ratio between source width/height and destination width/height + const bool is_align_corners_used = + _scale_info.align_corners && + arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(_scale_info.sampling_policy); + const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_width), + dst->dimension(idx_width), is_align_corners_used); + const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_height), + dst->dimension(idx_height), is_align_corners_used); + + // Area interpolation behaves as Nearest Neighbour in case of up-sampling + InterpolationPolicy policy_to_use = + (_scale_info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) + ? 
InterpolationPolicy::NEAREST_NEIGHBOR + : _scale_info.interpolation_policy; + + // Get the tensor shape + TensorShape shape(dst->dimension(idx_width)); + shape.set(1, dst->dimension(idx_height), false); + + TensorInfo tensor_info_offsets(shape, Format::S32); + TensorInfo tensor_info_dxdy(shape, Format::F32); + + auto dx = std::make_unique<TensorInfo>(tensor_info_dxdy); + auto dy = std::make_unique<TensorInfo>(tensor_info_dxdy); + auto offsets = std::make_unique<TensorInfo>(tensor_info_offsets); + auto scale_kernel = std::make_unique<kernels::CpuScaleKernel>(); + switch (policy_to_use) + { + case InterpolationPolicy::NEAREST_NEIGHBOR: + { + scale_kernel->configure(src, nullptr, nullptr, offsets.get(), dst, info); + break; + } + case InterpolationPolicy::BILINEAR: + { + scale_kernel->configure(src, dx.get(), dy.get(), offsets.get(), dst, info); + break; + } + case InterpolationPolicy::AREA: + { + scale_kernel->configure(src, nullptr, nullptr, nullptr, dst, info); + break; + } + default: + ARM_COMPUTE_ERROR("Unsupported interpolation mode"); + } + _kernel = std::move(scale_kernel); +} + +Status CpuScale::validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON(info.sampling_policy != SamplingPolicy::CENTER && + info.sampling_policy != SamplingPolicy::TOP_LEFT); + + ITensorInfo *offsets = nullptr; + ITensorInfo *dx = nullptr; + ITensorInfo *dy = nullptr; + + // Get data layout and width/height indices + const DataLayout data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout; + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + + // Compute the ratio between source width/height and destination width/height + const bool is_align_corners_used = + info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy); + const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_width), + dst->dimension(idx_width), is_align_corners_used); + const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_height), + dst->dimension(idx_height), is_align_corners_used); + + // Area interpolation behaves as Nearest Neighbour in case of up-sampling + InterpolationPolicy policy_to_use = + (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) + ? 
InterpolationPolicy::NEAREST_NEIGHBOR
+            : info.interpolation_policy;
+
+    // Get the tensor shape of auxiliary buffers
+    const TensorShape shape(dst->dimension(idx_width), dst->dimension(idx_height));
+    TensorInfo        tensor_info_offsets(shape, Format::S32);
+    TensorInfo        tensor_info_dx(shape, Format::F32);
+    TensorInfo        tensor_info_dy(shape, Format::F32);
+    switch (policy_to_use)
+    {
+        case InterpolationPolicy::NEAREST_NEIGHBOR:
+            offsets = &tensor_info_offsets;
+            break;
+        case InterpolationPolicy::BILINEAR:
+            offsets = &tensor_info_offsets;
+            dx      = &tensor_info_dx;
+            dy      = &tensor_info_dy;
+            break;
+        default:
+            break;
+    }
+
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        kernels::CpuScaleKernel::validate(src->clone().get(), dx, dy, offsets, dst->clone().get(), info));
+    return Status{};
+}
+
+void CpuScale::prepare(ITensorPack &tensors)
+{
+    if (!_is_prepared)
+    {
+        _is_prepared   = true;
+        const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
+        auto       dst = tensors.get_tensor(TensorType::ACL_DST);
+        auto       dx      = tensors.get_tensor(TensorType::ACL_INT_0);
+        auto       dy      = tensors.get_tensor(TensorType::ACL_INT_1);
+        auto       offsets = tensors.get_tensor(TensorType::ACL_INT_2);
+
+        // Get data layout and width/height indices
+        const int idx_width  = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+        const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+
+        // Compute the ratio between source width/height and destination width/height
+        const bool is_align_corners_used =
+            _scale_info.align_corners &&
+            arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(_scale_info.sampling_policy);
+        const auto wr = arm_compute::scale_utils::calculate_resize_ratio(
+            src->info()->dimension(idx_width), dst->info()->dimension(idx_width), is_align_corners_used);
+        const auto hr = arm_compute::scale_utils::calculate_resize_ratio(
+            src->info()->dimension(idx_height), dst->info()->dimension(idx_height), is_align_corners_used);
+
+        // Area interpolation behaves as Nearest Neighbour in case of up-sampling
+        InterpolationPolicy policy_to_use =
+            (_scale_info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f)
+                ?
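+                // (Editor's aside, not part of the patch) A minimal usage sketch of this
+                // operator from a hypothetical caller; the shapes and the ITensor* variables
+                // (src_t, dst_t, dx_t, dy_t, offsets_t) are illustrative assumptions:
+                //
+                //   TensorInfo src_info(TensorShape(128U, 128U, 3U), 1, DataType::F32);
+                //   TensorInfo dst_info(TensorShape(64U, 64U, 3U), 1, DataType::F32);
+                //   ScaleKernelInfo info{InterpolationPolicy::BILINEAR, BorderMode::REPLICATE};
+                //   if (bool(CpuScale::validate(&src_info, &dst_info, info)))
+                //   {
+                //       CpuScale op;
+                //       op.configure(&src_info, &dst_info, info);
+                //       // run() reads the dx/dy/offsets workspace from slots ACL_INT_0..2
+                //       // (see prepare() above) in addition to the usual src/dst pair.
+                //       ITensorPack pack{{TensorType::ACL_SRC, src_t},  {TensorType::ACL_DST, dst_t},
+                //                        {TensorType::ACL_INT_0, dx_t}, {TensorType::ACL_INT_1, dy_t},
+                //                        {TensorType::ACL_INT_2, offsets_t}};
+                //       op.run(pack);
+                //   }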
InterpolationPolicy::NEAREST_NEIGHBOR + : _scale_info.interpolation_policy; + const SamplingPolicy sampling_policy = _scale_info.sampling_policy; + + bool precompute_indices_weights = arm_compute::scale_utils::is_precomputation_required( + _data_layout, src->info()->data_type(), policy_to_use, _scale_info.border_mode); + + if (precompute_indices_weights) + { + switch (policy_to_use) + { + case InterpolationPolicy::NEAREST_NEIGHBOR: + { + // Pre-compute offsets for nearest interpolation + precompute_dx_dy_offsets(nullptr, nullptr, offsets, wr, hr, sampling_policy, is_align_corners_used); + break; + } + case InterpolationPolicy::BILINEAR: + { + // Pre-compute dx, dy and offsets for bilinear interpolation + precompute_dx_dy_offsets(dx, dy, offsets, wr, hr, sampling_policy, is_align_corners_used); + break; + } + case InterpolationPolicy::AREA: + { + break; + } + default: + ARM_COMPUTE_ERROR("Unsupported interpolation mode"); + } + } + else + { + if (policy_to_use != InterpolationPolicy::NEAREST_NEIGHBOR && + policy_to_use != InterpolationPolicy::BILINEAR && policy_to_use != InterpolationPolicy::AREA) + { + ARM_COMPUTE_ERROR("Unsupported interpolation mode"); + } + } + } +} + +void CpuScale::run(ITensorPack &tensors) +{ + ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); + prepare(tensors); + NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuScale.h b/src/cpu/operators/CpuScale.h new file mode 100644 index 0000000000..c12a8e733a --- /dev/null +++ b/src/cpu/operators/CpuScale.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_SCALE_H +#define ARM_COMPUTE_CPU_SCALE_H + +#include "arm_compute/core/experimental/Types.h" +#include "arm_compute/core/ITensorInfo.h" +#include "arm_compute/core/KernelDescriptors.h" + +#include "src/cpu/ICpuKernel.h" +#include "src/cpu/ICpuOperator.h" + +#include <memory> + +namespace arm_compute +{ +namespace cpu +{ +/** Basic function to compute Scale */ +class CpuScale : public ICpuOperator +{ +public: + /** Initialize the function's source, destination, interpolation type and border_mode. + * + * @param[in, out] src Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32. 
(Written to only for @p border_mode != UNDEFINED) + * @param[out] dst Destination tensor info. Data type supported: Same as @p src. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane. + * @param[in] info @ref ScaleKernelInfo to be used for configuration + * + * @note Using S8 data type only supports NHWC, @p border_mode Replicate, and @p policy Bilinear + */ + void configure(ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuScale::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info); + + // Inherited methods overridden: + void prepare(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; + +private: + ScaleKernelInfo _scale_info{InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::UNDEFINED}; + DataLayout _data_layout{DataLayout::UNKNOWN}; + bool _is_prepared{false}; +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_SCALE_H */ diff --git a/src/cpu/operators/CpuSoftmax.cpp b/src/cpu/operators/CpuSoftmax.cpp new file mode 100644 index 0000000000..fecee7d765 --- /dev/null +++ b/src/cpu/operators/CpuSoftmax.cpp @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2021, 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/operators/CpuSoftmax.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/core/helpers/SoftmaxHelpers.h" +#include "src/cpu/kernels/CpuSoftmaxKernel.h" +#include "src/cpu/utils/CpuAuxTensorHandler.h" + +using namespace arm_compute::experimental; + +namespace arm_compute +{ +namespace cpu +{ +CpuSoftmaxGeneric::CpuSoftmaxGeneric() : _softmax_kernel(), _tmp(), _aux_mem(InternalTensorIdx::COUNT) +{ +} + +void CpuSoftmaxGeneric::configure(const ITensorInfo *src, ITensorInfo *dst, float beta, int32_t axis, bool is_log) +{ + // Perform validation step + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(CpuSoftmaxGeneric::validate(src, dst, beta, axis)); + ARM_COMPUTE_LOG_PARAMS(src, dst, beta, axis); + + const unsigned int actual_axis = + static_cast<unsigned int>(wrap_around(axis, static_cast<int32_t>(src->num_dimensions()))); + + _axis = actual_axis; + + const ITensorInfo *tmp_input = src; + + TensorInfo tensor_info_tmp; + if (is_data_type_quantized_asymmetric(src->data_type())) + { + // Create intermediate tensors shapes + const TensorInfo input_info = tmp_input->clone()->reset_padding().set_is_resizable(true); + tensor_info_tmp = input_info.clone()->set_data_type(DataType::F32); + } + + // Init intermediate tensors + _tmp = TensorInfo(tensor_info_tmp); + + // Configure kernels + auto sm = std::make_unique<kernels::CpuSoftmaxKernel>(); + + // Softmax 2D case + sm->configure(tmp_input, dst, beta, is_log, actual_axis, &_tmp); + + _softmax_kernel = std::move(sm); + + if (_tmp.total_size() > 0) + { + _aux_mem[InternalTensorIdx::TMP] = + MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp.total_size()); + } +} + +Status +CpuSoftmaxGeneric::validate(const ITensorInfo *src, const ITensorInfo *dst, float beta, int32_t axis, bool is_log) +{ + // Perform validation step + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->num_dimensions() > 4, "Only up to 4 dimensions are supported"); + ARM_COMPUTE_UNUSED(beta); + ARM_COMPUTE_RETURN_ERROR_ON(axis < static_cast<int32_t>(-src->num_dimensions()) || + static_cast<int32_t>(src->num_dimensions()) <= axis); + + // Create intermediate tensor info + TensorInfo tensor_info_tmp; + + if (is_data_type_quantized_asymmetric(src->data_type())) + { + tensor_info_tmp = src->clone()->set_data_type(DataType::F32).set_is_resizable(true); + } + const unsigned int actual_axis = + static_cast<unsigned int>(wrap_around(axis, static_cast<int32_t>(src->num_dimensions()))); + + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::CpuSoftmaxKernel::validate(src, dst, beta, actual_axis, is_log, &tensor_info_tmp)); + + return Status{}; +} + +void CpuSoftmaxGeneric::run(ITensorPack &tensors) +{ + ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); + + auto src = tensors.get_const_tensor(TensorType::ACL_SRC); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + CpuAuxTensorHandler tmp(offset_int_vec(InternalTensorIdx::TMP), _tmp, tensors, true); + + ITensorPack softmax_pack; + + softmax_pack = {{TensorType::ACL_SRC_0, src}, {TensorType::ACL_DST_0, dst}, {TensorType::ACL_DST_1, tmp.get()}}; + + if (_axis == 0) + { + NEScheduler::get().schedule_op(_softmax_kernel.get(), Window::DimY, 
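+                                    // (Editor's note) Scheduling choice: when the reduction runs
+                                    // along the x axis (_axis == 0) every row is independent, so
+                                    // the work is split across rows (DimY); for any other axis the
+                                    // split is along DimX instead. The axis itself was normalised
+                                    // in configure() via wrap_around(), e.g. for a 3-D input,
+                                    // axis = -1 becomes actual_axis = wrap_around(-1, 3) = 2.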
_softmax_kernel->window(), softmax_pack);
+    }
+    else
+    {
+        NEScheduler::get().schedule_op(_softmax_kernel.get(), Window::DimX, _softmax_kernel->window(), softmax_pack);
+    }
+}
+
+experimental::MemoryRequirements CpuSoftmaxGeneric::workspace() const
+{
+    return _aux_mem;
+}
+
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuSoftmax.h b/src/cpu/operators/CpuSoftmax.h
new file mode 100644
index 0000000000..6ba3476eff
--- /dev/null
+++ b/src/cpu/operators/CpuSoftmax.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_OPERATORS_CPUSOFTMAX_H
+#define ACL_SRC_CPU_OPERATORS_CPUSOFTMAX_H
+
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/core/TensorInfo.h"
+
+#include "src/cpu/ICpuKernel.h"
+#include "src/cpu/ICpuOperator.h"
+#include "src/cpu/operators/CpuPermute.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cpu
+{
+class CpuSoftmaxKernel;
+
+/** Basic function to compute a SoftmaxLayer and a Log SoftmaxLayer.
+ *
+ * Softmax is calculated by :
+ * @f[ out = exp((x - max(x)) * beta) / sum(exp((x - max(x)) * beta)) @f]
+ *
+ * Log Softmax is calculated by :
+ * @f[ out = (x - max(x)) * beta - log(\sum{e^{(x - max(x)) * beta}}) @f]
+ *
+ * This function runs the following function/kernels:
+ * -# If axis is not 0:
+ *    -# @ref CpuPermute
+ * -# @ref kernels::CpuSoftmaxKernel
+ */
+class CpuSoftmaxGeneric : public ICpuOperator
+{
+public:
+    CpuSoftmaxGeneric();
+    /** Set the input and output tensors.
+     *
+     * @param[in,out] src  Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[out]    dst  Destination tensor info. Data types supported: same as @p src.
+     * @param[in]     beta (Optional) A scaling factor for the exponent.
+     * @param[in]     axis (Optional) The dimension in which to apply the function. E.g. for input of shape 4x5x6 and
+     *                     axis=1, softmax will be applied to 4x6=24 vectors of size 5.
Defaults to 0 + * @param[in] is_log True if the operation is log-softmax + */ + void configure(const ITensorInfo *src, ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0, bool is_log = false); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuSoftmaxGeneric::configure() + * + * @return a status + */ + static Status + validate(const ITensorInfo *src, const ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0, bool is_log = false); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + experimental::MemoryRequirements workspace() const override; + +private: + enum InternalTensorIdx + { + TMP = 0, + PERMUTED_SRC, + PERMUTED_DST, + COUNT + }; + + std::unique_ptr<ICPPKernel> _softmax_kernel; + + TensorInfo _tmp; + + experimental::MemoryRequirements _aux_mem{}; + + unsigned int _axis = 0; +}; + +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_OPERATORS_CPUSOFTMAX_H diff --git a/src/cpu/operators/CpuSub.cpp b/src/cpu/operators/CpuSub.cpp new file mode 100644 index 0000000000..7d27efbc96 --- /dev/null +++ b/src/cpu/operators/CpuSub.cpp @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/operators/CpuSub.h" + +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuSubKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +void CpuSub::configure(const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_UNUSED(act_info); + ARM_COMPUTE_LOG_PARAMS(src0, src1, dst, policy); + auto k = std::make_unique<kernels::CpuSubKernel>(); + k->configure(src0, src1, dst, policy); + _kernel = std::move(k); +} + +Status CpuSub::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); + return kernels::CpuSubKernel::validate(src0, src1, dst, policy); +} + +void CpuSub::run(ITensorPack &tensors) +{ + const auto split_dimension = static_cast<kernels::CpuSubKernel *>(_kernel.get())->get_split_dimension(); + + NEScheduler::get().schedule_op(_kernel.get(), split_dimension, _kernel->window(), tensors); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuSub.h b/src/cpu/operators/CpuSub.h new file mode 100644 index 0000000000..d1782a1d3c --- /dev/null +++ b/src/cpu/operators/CpuSub.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2021-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_SUB_H +#define ARM_COMPUTE_CPU_SUB_H + +#include "arm_compute/function_info/ActivationLayerInfo.h" + +#include "src/cpu/ICpuOperator.h" + +namespace arm_compute +{ +namespace cpu +{ +/** Basic function to run @ref kernels::CpuSubKernel */ +class CpuSub : public ICpuOperator +{ +public: + /** Initialise the kernel's inputs, dst and conversion policy. + * + * Valid configurations (src0,src1) -> dst : + * + * - (U8,U8) -> U8 + * - (QASYMM8, QASYMM8) -> QASYMM8 + * - (QASYMM8_SIGNED, QASYMM8_SIGNED) -> QASYMM8_SIGNED + * - (S16,S16) -> S16 + * - (S32,S32) -> S32 + * - (F16,F16) -> F16 + * - (F32,F32) -> F32 + * + * @param[in] src0 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 + * @param[in] src1 Second tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 + * @param[out] dst Output tensor info. 
Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32 + * @param[in] policy Policy to use to handle overflow. Convert policy cannot be WRAP if datatype is quantized. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. + */ + void configure(const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuSub::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; +}; +} // namespace cpu +} // namespace arm_compute +#endif /* ARM_COMPUTE_CPU_SUB_H */ diff --git a/src/cpu/operators/CpuTranspose.cpp b/src/cpu/operators/CpuTranspose.cpp new file mode 100644 index 0000000000..ea548e0511 --- /dev/null +++ b/src/cpu/operators/CpuTranspose.cpp @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/operators/CpuTranspose.h" + +#include "src/common/utils/Log.h" +#include "src/cpu/kernels/CpuTransposeKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +void CpuTranspose::configure(const ITensorInfo *src, ITensorInfo *dst) +{ + ARM_COMPUTE_LOG_PARAMS(src, dst); + auto k = std::make_unique<kernels::CpuTransposeKernel>(); + k->configure(src, dst); + _kernel = std::move(k); +} + +Status CpuTranspose::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + return kernels::CpuTransposeKernel::validate(src, dst); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/CpuTranspose.h b/src/cpu/operators/CpuTranspose.h new file mode 100644 index 0000000000..8934481ef6 --- /dev/null +++ b/src/cpu/operators/CpuTranspose.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_TRANSPOSE_H
+#define ARM_COMPUTE_CPU_TRANSPOSE_H
+
+#include "src/cpu/ICpuOperator.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to run @ref kernels::CpuTransposeKernel */
+class CpuTranspose : public ICpuOperator
+{
+public:
+    /** Configure operator for a given list of arguments
+     *
+     * @param[in]  src Source tensor to permute. Data types supported: All
+     * @param[out] dst Destination tensor. Data types supported: Same as @p src
+     */
+    void configure(const ITensorInfo *src, ITensorInfo *dst);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref CpuTranspose::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_TRANSPOSE_H */
diff --git a/src/cpu/operators/CpuWinogradConv2d.cpp b/src/cpu/operators/CpuWinogradConv2d.cpp
new file mode 100644
index 0000000000..7d81aee0e9
--- /dev/null
+++ b/src/cpu/operators/CpuWinogradConv2d.cpp
@@ -0,0 +1,478 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ +#include "src/cpu/operators/CpuWinogradConv2d.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/FunctionDescriptors.h" +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/kernels/assembly/winograd.hpp" +#include "src/core/NEON/kernels/convolution/common/tensor.hpp" +#include "src/core/NEON/kernels/convolution/common/utils.hpp" +#include "src/core/utils/AssemblyUtils.h" +#include "src/cpu/kernels/assembly/arm_gemm.hpp" +#include "src/cpu/kernels/CpuWinogradConv2dKernel.h" +#include "src/cpu/operators/CpuActivation.h" +#include "src/cpu/operators/CpuPermute.h" +#include "src/cpu/utils/CpuAuxTensorHandler.h" +#include "support/Cast.h" + +namespace arm_compute +{ +namespace cpu +{ +using namespace arm_compute::experimental; +using namespace arm_compute::utils::cast; + +namespace +{ +inline Tensor4DShape internal_get_shape(const ITensorInfo *in) +{ + const DataLayout data_layout = in->data_layout(); + const int in_width = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH)); + const int in_height = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT)); + const int in_channels = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL)); + const int in_batches = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES)); + + return Tensor4DShape{in_batches, in_height, in_width, in_channels}; +} + +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info) +{ + ARM_COMPUTE_UNUSED(dst, weights); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, + "Winograd layer only supports unit strides."); + if (biases != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases); + ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); + } + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); + return Status{}; +} + +bool get_winograd_kernel_implementation(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math, + arm_conv::winograd::WinogradImpl *winograd_impl, + std::unique_ptr<arm_conv::ConvolutionArgs> &conv_args) +{ + arm_conv::winograd::WinogradConfig winograd_cfg; + arm_gemm::GemmConfig cfg; + + const DataType data_type = src->data_type(); + Tensor4DShape in_shape{internal_get_shape(src)}; + Tensor4DShape out_shape{internal_get_shape(dst)}; + Tensor4DShape kernel_shape{internal_get_shape(weights)}; + uint32_t nthreads = NEScheduler::get().num_threads(); + // Get configuration arguments for Winograd + winograd_cfg.output_rows = 0; + winograd_cfg.output_cols = 0; + conv_args = std::make_unique<arm_conv::ConvolutionArgs>( + in_shape.n_batches, + arm_conv::Shape2D{static_cast<uint32_t>(in_shape.n_rows), 
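+        // (Editor's note) ConvolutionArgs packs, in this order: batch count, input H/W shape,
+        // input channels, top/left padding, output H/W shape, output channels, kernel H/W shape,
+        // and the (optionally fused) activation; that is everything the assembly Winograd
+        // selector below needs to choose an implementation for this problem size.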
static_cast<uint32_t>(in_shape.n_cols)},
+        in_shape.n_channels, conv_info.pad_top(), conv_info.pad_left(),
+        arm_conv::Shape2D{static_cast<uint32_t>(out_shape.n_rows), static_cast<uint32_t>(out_shape.n_cols)},
+        out_shape.n_channels,
+        arm_conv::Shape2D{static_cast<uint32_t>(kernel_shape.n_rows), static_cast<uint32_t>(kernel_shape.n_cols)},
+        assembly_utils::map_to_arm_gemm_activation(act_info));
+
+    bool success = false;
+    if (data_type == DataType::F32)
+    {
+        success = arm_conv::winograd::get_implementation<float>(*winograd_impl, &CPUInfo::get(), *conv_args, nthreads,
+                                                                enable_fast_math, &winograd_cfg, nullptr);
+    }
+#if defined(__aarch64__) && defined(ENABLE_FP16_KERNELS)
+    else if (data_type == DataType::F16)
+    {
+        success = arm_conv::winograd::get_implementation<__fp16>(*winograd_impl, &CPUInfo::get(), *conv_args, nthreads,
+                                                                 enable_fast_math, &winograd_cfg, nullptr);
+    }
+#endif // defined(__aarch64__) && defined(ENABLE_FP16_KERNELS)
+    else
+    {
+        success = false;
+    }
+    return success;
+}
+inline bool fuse_function_supported(const ActivationLayerInfo &act_info)
+{
+    return act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ||
+           act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU;
+}
+} // namespace
+
+CpuWinogradConv2d::CpuWinogradConv2d()
+    : _gemm_function(std::make_unique<CpuGemm>()),
+      _activation_func(std::make_unique<CpuActivation>()),
+      _transform_input_kernel(nullptr),
+      _transform_output_kernel(nullptr),
+      _permute_input(std::make_unique<CpuPermute>()),
+      _permute_output(std::make_unique<CpuPermute>()),
+      _permute_weights(std::make_unique<CpuPermute>()),
+      _aux_mem(AuxTensorIdx::Count),
+      _conv_args{nullptr},
+      _winograd_impl{},
+      _data_layout(),
+      _winograd_transformed_input{},
+      _winograd_transformed_output{},
+      _winograd_transformed_weights{},
+      _input_workspace(),
+      _output_workspace(),
+      _weights_hwio(),
+      _input_nhwc(),
+      _output_nhwc(),
+      _is_prepared{false},
+      _run_activation{false}
+{
+}
+
+CpuWinogradConv2d::~CpuWinogradConv2d() = default;
+
+void CpuWinogradConv2d::configure(const ITensorInfo         *src,
+                                  const ITensorInfo         *weights,
+                                  const ITensorInfo         *biases,
+                                  ITensorInfo               *dst,
+                                  const PadStrideInfo       &conv_info,
+                                  const ActivationLayerInfo &act_info,
+                                  bool                       enable_fast_math)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
+    ARM_COMPUTE_ERROR_THROW_ON(validate(src, weights, biases, dst, conv_info, act_info, enable_fast_math));
+    ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, act_info, enable_fast_math);
+    ARM_COMPUTE_UNUSED(biases);
+    const DataType data_type = src->data_type();
+    uint32_t       nthreads  = NEScheduler::get().num_threads();
+    _data_layout             = src->data_layout();
+    const Tensor4DShape kernel_shape{internal_get_shape(weights)};
+
+    bool success = get_winograd_kernel_implementation(src, weights, dst, conv_info, act_info, enable_fast_math,
+                                                      &_winograd_impl, _conv_args);
+
+    ARM_COMPUTE_EXIT_ON_MSG_VAR(!success, "Unsupported kernel size: %d x %d.\n", kernel_shape.n_rows,
+                                kernel_shape.n_cols);
+    ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using input transform: %s\n",
+                                        _winograd_impl.input_transform->get_name().c_str());
+    ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using weight transform: %s\n",
+                                        _winograd_impl.weight_transform->get_name().c_str());
+    ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using output transform: %s\n",
+                                        _winograd_impl.output_transform->get_name().c_str());
+
+    const bool has_impl
= ((_winograd_impl.input_transform != nullptr) && + (_winograd_impl.output_transform != nullptr) && (_winograd_impl.gemm_args != nullptr)); + if (has_impl) + { + // Determine how much working space is required, allocate it. + const size_t input_workspace_size = + _winograd_impl.input_transform->get_working_space_size(*_conv_args, nthreads); + const size_t output_workspace_size = + _winograd_impl.output_transform->get_working_space_size(*_conv_args, nthreads); + + TensorInfo input_workspace_info(TensorShape(input_workspace_size), 1, DataType::U8); + TensorInfo output_workspace_info(TensorShape(output_workspace_size), 1, DataType::U8); + _input_workspace = input_workspace_info; + _output_workspace = output_workspace_info; + + const auto &wds = _winograd_impl.winograd_spec; + + // Preparing winograd transformed input tensor + const size_t data_type_size = src->element_size(); + const uint32_t m = _winograd_impl.gemm_args->_Msize; // Total number of tiles + const uint32_t k = _winograd_impl.gemm_args->_Ksize; // Input channels + const uint32_t n = _winograd_impl.gemm_args->_Nsize; // Output channels + const uint32_t n_gemms = _winograd_impl.gemm_args->_nmulti; + const uint32_t n_batches = _winograd_impl.gemm_args->_nbatches; + constexpr size_t storage_alignment = 64; + + const TensorShape a_shape(k, m, n_batches, n_gemms); + Strides a_strides(data_type_size); + a_strides.set(1, data_type_size * _winograd_impl.winograd_spec.input_ld_row); + a_strides.set(2, data_type_size * _winograd_impl.winograd_spec.input_ld_batch); + a_strides.set(3, data_type_size * _winograd_impl.winograd_spec.input_ld_matrix); + + const TensorShape b_shape(n, k, n_gemms); + Strides b_strides(data_type_size); + b_strides.set(1, data_type_size * _winograd_impl.winograd_spec.weight_ld_row); + b_strides.set(2, data_type_size * _winograd_impl.winograd_spec.weight_ld_matrix); + + const TensorShape d_shape(n, m, n_batches, n_gemms); + Strides d_strides(data_type_size); + d_strides.set(1, data_type_size * _winograd_impl.winograd_spec.output_ld_row); + d_strides.set(2, data_type_size * _winograd_impl.winograd_spec.output_ld_batch); + d_strides.set(3, data_type_size * _winograd_impl.winograd_spec.output_ld_matrix); + + TensorInfo a_info{}; + TensorInfo b_info{}; + TensorInfo d_info{}; + a_info.init(a_shape, 1, data_type, a_strides, 0, wds.input_matrix_size_bytes); + b_info.init(b_shape, 1, data_type, b_strides, 0, wds.weight_matrix_size_bytes); + d_info.init(d_shape, 1, data_type, d_strides, 0, wds.output_matrix_size_bytes); + + _winograd_transformed_input = a_info; + _winograd_transformed_weights = b_info; + _winograd_transformed_output = d_info; + + PermutationVector weights_permutation_vector(3U, 0U, 1U, 2U); + + // Configure the kernel to transform the input tensor from NCHW -> NHWC + if (_data_layout == DataLayout::NCHW) + { + _permute_input->configure(src, &_input_nhwc, PermutationVector(2U, 0U, 1U)); + weights_permutation_vector = PermutationVector(3U, 2U, 0U, 1U); + } + + // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map] + _permute_weights->configure(weights, &_weights_hwio, weights_permutation_vector); + + // Reorder the convoluted output to ACL's ordering NCHW + if (_data_layout == DataLayout::NCHW) + { + // configure and allocate dst tensor to be used to convert from winograd domain to spatial domain when calling to reshape_output() + TensorInfo info(TensorShape(dst->dimension(2), dst->dimension(0), dst->dimension(1), 
dst->dimension(3)), 1, + dst->data_type()); + _output_nhwc = info; + _permute_output->configure(&_output_nhwc, dst, PermutationVector(1U, 2U, 0U)); + } + + // Configure input transform kernel + _transform_input_kernel = + std::make_unique<CpuWinogradConv2dTransformInputKernel>(_winograd_impl, *_conv_args, nthreads); + + // Configure GEMM function + _gemm_function->configure(&_winograd_transformed_input, &_winograd_transformed_weights, nullptr, + &_winograd_transformed_output, 1.0f, 0.f); + + // Configure output transform kernel + _transform_output_kernel = + std::make_unique<CpuWinogradConv2dTransformOutputKernel>(_winograd_impl, *_conv_args, nthreads); + + //Configure Activation Layer + _run_activation = act_info.enabled() && !fuse_function_supported(act_info); + if (_run_activation) + { + _activation_func->configure(dst, nullptr, act_info); + } + + const auto mm_mem_req = _gemm_function->workspace(); + for (unsigned int slot = 0; slot < mm_mem_req.size(); ++slot) + { + _aux_mem[slot] = mm_mem_req[slot]; + } + + // Request temporary memory. Overlap memory needed for Input/Output transformations as they run on different non-overlapping time-steps. + _aux_mem[TransformedInput] = MemoryInfo(offset_int_vec(TransformedInput), MemoryLifetime::Temporary, + wds.input_matrix_size_bytes, storage_alignment); + _aux_mem[TransformedOutput] = MemoryInfo(offset_int_vec(TransformedOutput), MemoryLifetime::Temporary, + wds.output_matrix_size_bytes, storage_alignment); + _aux_mem[WorkspaceIO] = MemoryInfo(offset_int_vec(WorkspaceIO), MemoryLifetime::Temporary, + std::max(input_workspace_size, output_workspace_size)); + _aux_mem[PermutedWeights] = + MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, _weights_hwio.total_size()); + _aux_mem[TransformedWeights] = MemoryInfo(offset_int_vec(TransformedWeights), MemoryLifetime::Persistent, + wds.weight_matrix_size_bytes, storage_alignment); + if (_data_layout == DataLayout::NCHW) + { + _aux_mem[PermutedInput].merge(offset_int_vec(PermutedInput), src->total_size()); + _aux_mem[PermutedOutput].merge(offset_int_vec(PermutedOutput), dst->total_size()); + } + } +} +Status CpuWinogradConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info)); + + // Disable winograd for fp16 if fast math is false. 
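+    // (Editor's note) In other words, fp16 Winograd is only reachable when the caller opts in
+    // to fast math, accepting the possible accuracy trade-off. A hypothetical caller (all names
+    // below are illustrative) would validate an F16 configuration like so:
+    //
+    //   Status s = CpuWinogradConv2d::validate(&src_f16, &weights_f16, &bias_f16, &dst_f16,
+    //                                          PadStrideInfo(1, 1, 1, 1), ActivationLayerInfo(),
+    //                                          /* enable_fast_math */ true);
+    //
+    // With enable_fast_math == false, the same F16 call is rejected by the check below.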
+    if (!enable_fast_math)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32);
+    }
+
+    const Tensor4DShape              kernel_shape{internal_get_shape(weights)};
+    arm_conv::winograd::WinogradImpl winograd_impl{};
+
+    std::unique_ptr<arm_conv::ConvolutionArgs> conv_args;
+    const bool success = get_winograd_kernel_implementation(src, weights, dst, conv_info, act_info, enable_fast_math,
+                                                            &winograd_impl, conv_args);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(success == false, "Unsupported kernel size: %d x %d.\n", kernel_shape.n_rows,
+                                        kernel_shape.n_cols);
+    ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using input transform: %s\n",
+                                        winograd_impl.input_transform->get_name().c_str());
+    ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using weight transform: %s\n",
+                                        winograd_impl.weight_transform->get_name().c_str());
+    ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using output transform: %s\n",
+                                        winograd_impl.output_transform->get_name().c_str());
+    return Status{};
+}
+
+void CpuWinogradConv2d::run(ITensorPack &tensors)
+{
+    prepare(tensors);
+    auto   src    = tensors.get_const_tensor(ACL_SRC_0);
+    auto   biases = tensors.get_const_tensor(ACL_SRC_2);
+    auto   output = tensors.get_tensor(ACL_DST);
+    Window win;
+
+    const uint32_t nthreads = NEScheduler::get().num_threads();
+
+    // The Winograd transform implementation does fine-grain threading inside the transforms. Just pass thread_id and nthreads.
+    win.set(Window::DimX, Window::Dimension(0, nthreads, 1));
+
+    // Wrap the winograd-domain tensorInfos created in configuration in tensors and allocate the required memory.
+    CpuAuxTensorHandler input_nhwc(offset_int_vec(PermutedInput), _input_nhwc, tensors, true);
+    CpuAuxTensorHandler winograd_input_transformed(offset_int_vec(TransformedInput), _winograd_transformed_input,
+                                                   tensors, true);
+    CpuAuxTensorHandler input_workspace(offset_int_vec(WorkspaceIO), _input_workspace, tensors, true);
+    const bool          is_nchw = _data_layout == DataLayout::NCHW;
+    if (is_nchw)
+    {
+        // Bring channels to the front as the Winograd code expects the tensor to be in the format NHWC
+        ITensorPack pack{{ACL_SRC, src}, {ACL_DST, input_nhwc.get()}};
+        _permute_input->run(pack);
+    }
+
+    CpuAuxTensorHandler winograd_output_transformed(offset_int_vec(TransformedOutput), _winograd_transformed_output,
+                                                    tensors, true);
+    CpuAuxTensorHandler output_workspace(offset_int_vec(WorkspaceIO), _output_workspace, tensors, true);
+    CpuAuxTensorHandler output_nhwc(offset_int_vec(PermutedOutput), _output_nhwc, tensors, true);
+
+    ITensorPack transform_input_pack{{ACL_SRC, is_nchw ?
input_nhwc.get() : src},
+                                     {ACL_DST, winograd_input_transformed.get()},
+                                     {ACL_INT, input_workspace.get()}};
+    NEScheduler::get().schedule_op(_transform_input_kernel.get(), Window::DimX, win, transform_input_pack);
+
+    CpuAuxTensorHandler winograd_weights_transformed(offset_int_vec(TransformedWeights), _winograd_transformed_weights,
+                                                     tensors, true);
+
+    // Run 16 GEMMs in multiple threads, each kernel runs one or more GEMMs
+    ITensorPack gemm_pack = tensors;
+    gemm_pack.add_const_tensor(ACL_SRC, winograd_input_transformed.get());
+    gemm_pack.add_const_tensor(ACL_SRC_1, winograd_weights_transformed.get());
+    gemm_pack.add_const_tensor(ACL_BIAS, nullptr);
+    gemm_pack.add_tensor(ACL_DST, winograd_output_transformed.get());
+    _gemm_function->run(gemm_pack);
+
+    // Output transform
+    ITensorPack transform_output_pack{{ACL_SRC_0, winograd_output_transformed.get()},
+                                      {ACL_DST, is_nchw ? output_nhwc.get() : output},
+                                      {ACL_SRC_1, biases},
+                                      {ACL_INT, output_workspace.get()}};
+    NEScheduler::get().schedule_op(_transform_output_kernel.get(), Window::DimX, win, transform_output_pack);
+    if (is_nchw)
+    {
+        // Reorder the convoluted output to ACL's ordering NCHW
+        ITensorPack pack{{ACL_SRC, output_nhwc.get()}, {ACL_DST, output}};
+        _permute_output->run(pack);
+    }
+    if (_run_activation)
+    {
+        ITensorPack pack{{ACL_SRC, output}, {ACL_DST, output}};
+        _activation_func->run(pack);
+    }
+}
+
+void CpuWinogradConv2d::prepare(ITensorPack &tensors)
+{
+    if (!_is_prepared)
+    {
+        const ITensor *weights = tensors.get_const_tensor(ACL_SRC_1);
+        ITensor       *weights_aux =
+            utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(PermutedWeights)));
+
+        CpuAuxTensorHandler permuted_weights(_weights_hwio, *weights_aux);
+        ITensorPack         permute_tensors{{ACL_SRC, weights}, {ACL_DST, permuted_weights.get()}};
+        _permute_weights->run(permute_tensors);
+        const int element_size_in_bytes = permuted_weights.get()->info()->element_size();
+        // The weights started out in OHWI format; the permute above rearranged them into
+        // "permuted_weights", which is in HWIO format.
+        const unsigned int height_idx  = 3; // H in HWIO
+        const unsigned int width_idx   = 2; // W in HWIO
+        const unsigned int channel_idx = 1; // I in HWIO
+
+        const int permuted_weight_row_stride =
+            permuted_weights.get()->info()->strides_in_bytes()[height_idx] / element_size_in_bytes;
+        const int permuted_weight_col_stride =
+            permuted_weights.get()->info()->strides_in_bytes()[width_idx] / element_size_in_bytes;
+        const int permuted_weight_channel_stride =
+            permuted_weights.get()->info()->strides_in_bytes()[channel_idx] / element_size_in_bytes;
+
+        // Wrap the winograd-domain transformed weight TensorInfo in Auxiliary tensor and allocate the required memory.
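+        // (Editor's note) Conceptually, for every (output channel, input channel) pair the
+        // weight transform lifts the small spatial kernel g into the Winograd domain as
+        //
+        //   U = G * g * G^T
+        //
+        // where G is the fixed matrix of the selected F(m, r) variant. The matching input
+        // transform (B^T * d * B) and output transform run on every call to run(), whereas this
+        // weight transform executes once here in prepare(), since its result is persistent.
+        // Note also that the strides computed above are in elements, not bytes, hence the
+        // division by element_size_in_bytes.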
+        ITensor *weights_transf =
+            utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(TransformedWeights)));
+        ARM_COMPUTE_ERROR_ON_NULLPTR(weights_transf);
+        CpuAuxTensorHandler winograd_transformed_weights(_winograd_transformed_weights, *weights_transf);
+
+        const void *permuted_weights_ptr;
+        void       *win_wght_transf_ptr;
+
+        permuted_weights_ptr = reinterpret_cast<const void *>(
+            permuted_weights.get()->buffer() + permuted_weights.get()->info()->offset_first_element_in_bytes());
+        win_wght_transf_ptr =
+            reinterpret_cast<void *>(winograd_transformed_weights.get()->buffer() +
+                                     winograd_transformed_weights.get()->info()->offset_first_element_in_bytes());
+
+        // Prepare Weights
+        _winograd_impl.weight_transform->execute(
+            *_conv_args, permuted_weights_ptr, permuted_weight_row_stride, permuted_weight_col_stride,
+            permuted_weight_channel_stride, win_wght_transf_ptr, _winograd_impl.winograd_spec, 0, 1 // Thread 1 of 1
+        );
+        ITensorPack gemm_pack = tensors;
+        gemm_pack.add_const_tensor(ACL_SRC_1, winograd_transformed_weights.get());
+        _gemm_function->prepare(gemm_pack);
+        _is_prepared = true;
+    }
+}
+experimental::MemoryRequirements CpuWinogradConv2d::workspace() const
+{
+    return _aux_mem;
+}
+
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuWinogradConv2d.h b/src/cpu/operators/CpuWinogradConv2d.h
new file mode 100644
index 0000000000..03bfc51a46
--- /dev/null
+++ b/src/cpu/operators/CpuWinogradConv2d.h
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_OPERATORS_CPUWINOGRADCONV2D_H
+#define ACL_SRC_CPU_OPERATORS_CPUWINOGRADCONV2D_H
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/FunctionDescriptors.h"
+
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuOperator.h"
+#include "src/cpu/kernels/assembly/gemm_common.hpp"
+#include "src/cpu/kernels/CpuWinogradConv2dKernel.h"
+#include "src/cpu/operators/CpuActivation.h"
+#include "src/cpu/operators/CpuGemm.h"
+#include "src/cpu/operators/CpuPermute.h"
+#include "src/cpu/operators/internal/CpuGemmAssemblyDispatch.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+class CpuWinogradConv2d : public ICpuOperator
+{
+public:
+    /** Constructor */
+    CpuWinogradConv2d();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuWinogradConv2d);
+    /** Destructor */
+    ~CpuWinogradConv2d();
+
+    /** Set the input and output tensors.
+ * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:--------------|:------|:--------------| + * |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 | + * + * @param[in] src Source tensor Info. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. + * Data types supported: F16/F32. + * @param[in] weights Weights tensor Info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input. + * For supported kernel sizes, see @ref arm_compute::NEWinogradConvolutionLayer + * @param[in] biases Biases tensor Info. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights. + * @param[out] dst Destination tensor Info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. + * Data types supported: Same as @p input. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. Currently only unit strides are supported. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation + * available which may introduce a drop of accuracy as well. Default is false + */ + void configure(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false); + /** Static function to check if given info will lead to a valid configuration of @ref CpuWinogradConv2d + * + * Similar to CpuWinogradConv2d::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &constants) override; + experimental::MemoryRequirements workspace() const override; + +private: + enum AuxTensorIdx + { + /** Slot 0 - 6 reserved for CpuGemm */ + TransformedInput = 7, + TransformedOutput, + WorkspaceIO, + TransformedWeights, + PermutedWeights, + Count, + PermutedInput = TransformedOutput, + PermutedOutput = TransformedInput + }; + std::unique_ptr<CpuGemm> _gemm_function; + std::unique_ptr<CpuActivation> _activation_func; + std::unique_ptr<ICPPKernel> _transform_input_kernel; + std::unique_ptr<ICPPKernel> _transform_output_kernel; + std::unique_ptr<CpuPermute> _permute_input; + std::unique_ptr<CpuPermute> _permute_output; + std::unique_ptr<CpuPermute> _permute_weights; + experimental::MemoryRequirements _aux_mem{Count}; + std::unique_ptr<arm_conv::ConvolutionArgs> + _conv_args; // Make it unique ptr because this type does not have a default constructor + arm_conv::winograd::WinogradImpl _winograd_impl; + DataLayout _data_layout; + TensorInfo _winograd_transformed_input; + TensorInfo _winograd_transformed_output; + TensorInfo _winograd_transformed_weights; + TensorInfo _input_workspace; + TensorInfo _output_workspace; + TensorInfo _weights_hwio; + TensorInfo _input_nhwc; + TensorInfo 
_output_nhwc; + bool _is_prepared; + bool _run_activation; +}; +} // namespace cpu +} // namespace arm_compute + +#endif // ACL_SRC_CPU_OPERATORS_CPUWINOGRADCONV2D_H diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp new file mode 100644 index 0000000000..a4c856bb8f --- /dev/null +++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp @@ -0,0 +1,1140 @@ +/* + * Copyright (c) 2018-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/cpu/operators/internal/CpuGemmAssemblyDispatch.h" + +#include "arm_compute/runtime/NEON/NEScheduler.h" + +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" +#include "src/core/utils/AssemblyUtils.h" +#include "src/cpu/kernels/assembly/arm_gemm.hpp" +#include "src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h" +#include "src/cpu/operators/CpuTranspose.h" +#include "src/cpu/utils/CpuAuxTensorHandler.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace cpu +{ +namespace +{ +/** Run pretranspose_B_array in parallel (1D static scheduling) + * + * @tparam TypeInput + * @tparam TypeOutput + * + * @param[in] gemm_asm GemmCommon kernel to run + * @param[in] dst Pretransposed B array + * @param[in] src B array to be pretransposed + * @param[in] src_ld Stride in y + * @param[in] src_multi_stride Stride in z ("multi") + * @param[in] num_threads Number of threads to run this method. 
Must be >= 1 + */ +template <typename TypeInput, typename TypeOutput> +void run_parallel_pretranspose_B_array(arm_gemm::GemmCommon<TypeInput, TypeOutput> *gemm_asm, + ITensor *dst, + const TypeInput *src, + int src_ld, + int src_multi_stride, + unsigned int num_threads, + bool transpose) +{ + ARM_COMPUTE_ERROR_ON(gemm_asm == nullptr); + ARM_COMPUTE_ERROR_ON(num_threads == 0); + // The window size is also the total workload size + const unsigned int wsize = gemm_asm->get_B_pretranspose_window_size(); + + std::vector<IScheduler::Workload> workloads(num_threads); + for (unsigned int t = 0; t < num_threads; ++t) + { + workloads[t] = [=](const ThreadInfo &info) + { + const unsigned int start = (info.thread_id * wsize) / num_threads; + const unsigned int end = ((info.thread_id + 1) * wsize) / num_threads; + + if (start < end) + { + gemm_asm->pretranspose_B_array_part(dst->buffer(), src, src_ld, src_multi_stride, transpose, start, + end); + } + }; + } + NEScheduler::get().run_tagged_workloads(workloads, "CpuGemmAssemblyDispatch/pretranspose_B_array"); +} +} // namespace + +using namespace arm_compute::experimental; + +namespace +{ +struct free_delete +{ + void operator()(void *x) + { + free(x); + } +}; + +struct Params +{ + unsigned int M; + unsigned int N; + unsigned int K; + unsigned int batches; + unsigned int multis; + unsigned int sections; + bool indirect; +}; + +Params extract_parameters(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, const AsmGemmInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d); + Params p; + p.M = d->tensor_shape().y(); + p.K = a->tensor_shape().x(); + p.N = d->tensor_shape().x(); + p.batches = 1; + p.multis = 1; + p.sections = 1; + p.indirect = false; + + if (info.method == AsmConvMethod::Conv || info.method == AsmConvMethod::Indirect) + { + p.indirect = true; + p.sections = b->tensor_shape()[2] * b->tensor_shape()[3]; + } + else + { + p.multis = b->tensor_shape().z(); + p.batches = d->tensor_shape().total_size_upper(2) / p.multis; + } + + // Update M in case of GEMM3D for output + if (info.depth_output_gemm3d != 0) + { + p.M = d->tensor_shape().y() * d->tensor_shape().z(); + p.batches = d->tensor_shape().total_size_upper(3) / p.multis; + } + + return p; +} + +IScheduler::Hints scheduling_hint_heuristic(arm_gemm::GemmMethod method, DataType data_type) +{ + // Schedule assembly kernel + const int granule_threshold = 200; + IScheduler::Hints scheduling_hint = IScheduler::Hints(Window::DimX); + if (method == arm_gemm::GemmMethod::GEMM_INTERLEAVED && data_type == DataType::F32) + { + scheduling_hint = IScheduler::Hints(Window::DimX, IScheduler::StrategyHint::DYNAMIC, granule_threshold); + } + else if (method == arm_gemm::GemmMethod::GEMM_INTERLEAVED_2D && + (data_type == DataType::F32 || data_type == DataType::F16 || data_type == DataType::U8 || + data_type == DataType::S8)) + { + //GEMM_INTERLEAVED supports 2D parallelism, IScheduler::split_dimensions_all signals to parallelise over all window dimensions + scheduling_hint = + IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold); + } + else if (method == arm_gemm::GemmMethod::QUANTIZE_WRAPPER_2D && + (data_type == DataType::QASYMM8 || data_type == DataType::QASYMM8_SIGNED)) + { + //special case for QASYMM8 to support 2D parallelism, scheduler here may be tweaked differently compared to FP32 case + scheduling_hint = + IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold); + } + + return 
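+    // (Editor's note, summarising the heuristic above) Plain GEMM_INTERLEAVED is split
+    // one-dimensionally along DimX (dynamically for F32), while the *_2D methods report
+    // split_dimensions_all so the scheduler may tile the work across every window dimension;
+    // granule_threshold is a granularity hint that discourages over-splitting small problems.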
+
+/** Fallback in case ACL doesn't have a function */
+template <typename TypeInput, typename TypeOutput, class OutputStage = arm_gemm::Nothing>
+class Fallback : public CpuGemmAssemblyDispatch::IFallback
+{
+public:
+    /** Destructor */
+    ~Fallback() = default;
+
+    /** Initialise the function's input and output.
+     *
+     * @param[in]  a         Input tensor containing the Matrix A.
+     * @param[in]  b         Input tensor containing the Matrix B.
+     * @param[in]  c         Input tensor containing the Matrix C.
+     * @param[out] d         Output tensor to store the result of matrix multiplication.
+     * @param[in]  args      Matrix multiplication information.
+     * @param[in]  gemm_info GEMM meta-data
+     * @param[in]  os        Output stage meta-data.
+     */
+    void configure(const ITensorInfo *a,
+                   const ITensorInfo *b,
+                   const ITensorInfo *c,
+                   ITensorInfo       *d,
+                   arm_gemm::GemmArgs args,
+                   const AsmGemmInfo &gemm_info,
+                   const OutputStage &os = {});
+
+    /** Set requantization data to be used
+     *
+     * @param[in] shifts      Requantization shifts
+     * @param[in] multipliers Requantization multipliers
+     *
+     * @return A tuple of (whether left shifts are needed, pointer to the left shifts, pointer to the right shifts, pointer to the multipliers)
+     */
+    std::tuple<bool, const int32_t *, const int32_t *, const int32_t *>
+    set_requantize_data(const std::vector<int32_t> &shifts, const std::vector<int32_t> &multipliers);
+
+    // Inherited methods overridden:
+    void                             run(ITensorPack &tensors) override;
+    void                             prepare(ITensorPack &tensors) override;
+    bool                             is_configured() const override;
+    experimental::MemoryRequirements workspace() const override;
+    bool                             isVarWeightsKernel() const override
+    {
+        if (!_gemm_kernel_asm)
+            return false;
+        const arm_compute::WeightFormat wf =
+            assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format);
+        return wf != arm_compute::WeightFormat::UNSPECIFIED && wf != arm_compute::WeightFormat::ANY;
+    }
+
+private:
+    enum AuxTensorIdx
+    {
+        AsmGemmWorkspace = 0,
+        PrePretransposedB, /* Transposed B (rhs) before being passed to gemm or pretranspose_B_array */
+        Pretranspose,
+        Count
+    };
+
+    /** Configure the indirect buffer
+     *
+     * @param[in]  a Input tensor containing the Matrix A.
+     * @param[in]  b Input tensor containing the Matrix B.
+     * @param[out] d Output tensor to store the result of matrix multiplication.
+ * @param[in] info GEMM meta-data + */ + void configure_indirect(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, const AsmGemmInfo &info); + /** Prepare the indirect buffer */ + void prepare_indirect_buffer(ITensorPack &tensors); + + /** Operator to transpose B before gemm or pretranspose_B_array*/ + std::unique_ptr<CpuTranspose> _pre_pretranspose_b{nullptr}; + /** Assembly Gemm kernel */ + std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> _gemm_kernel_asm{nullptr}; + /** Optimised Arm® Neon™ kernel */ + std::unique_ptr<INEKernel> _optimised_kernel{nullptr}; + /** Assembly GEMM workspace tensor info */ + TensorInfo _workspace_info{}; + /** Pre-pre-transposed B tensor info */ + TensorInfo _pre_pretransposed_b_info{}; + /** Pre-transpose tensor info */ + TensorInfo _pretranspose_info{}; + /** Prepared flag */ + bool _is_prepared{false}; + /** GEMM meta-data */ + AsmGemmInfo _gemm_info{}; + /** GEMM kernel description */ + arm_gemm::KernelDescription _kernel_info{}; + /** Per channel quantization shifts */ + std::vector<int32_t> _shifts{}; + std::vector<int32_t> right_shifts{}; + std::vector<int32_t> left_shifts{}; + /** Per channel quantization multipliers */ + std::vector<int32_t> _multipliers{}; + /** Indirect buffer */ + std::unique_ptr<const TypeInput *const *, free_delete> _indirect_arg{}; + std::unique_ptr<const TypeInput *, free_delete> _indirect_buf{}; + std::vector<TypeInput> _indirect_pad{}; + arm_gemm::ConvolutionParameters _cp{}; + experimental::MemoryRequirements _aux_mem{Count}; + bool _B_pretranspose_required{false}; + bool _is_b_constant{true}; + bool _is_c_constant{true}; + bool _run_pre_pretranspose_b{false}; + bool _B_pre_pretranspose_required{false}; +}; + +template <typename TypeInput, typename TypeOutput, class OutputStage> +std::tuple<bool, const int32_t *, const int32_t *, const int32_t *> +Fallback<TypeInput, TypeOutput, OutputStage>::set_requantize_data(const std::vector<int32_t> &shifts, + const std::vector<int32_t> &multipliers) +{ + _multipliers = multipliers; + _shifts = shifts; + bool need_left = false; + for (const auto s : _shifts) + { + left_shifts.push_back(std::max(-s, int32_t(0))); + right_shifts.push_back(std::min(-s, int32_t(0))); + if (s < 0 && !need_left) + { + need_left = true; + } + } + return std::make_tuple(need_left, left_shifts.data(), right_shifts.data(), _multipliers.data()); +} + +template <typename TypeInput, typename TypeOutput, class OutputStage> +void Fallback<TypeInput, TypeOutput, OutputStage>::prepare_indirect_buffer(ITensorPack &tensors) +{ + auto a = tensors.get_const_tensor(TensorType::ACL_SRC_0); + const TypeInput *A_ptr = reinterpret_cast<TypeInput *>(a->buffer()); + const int multis = 1; + const int batches = a->info()->tensor_shape().total_size_upper(3); + const size_t stride_A = a->info()->strides_in_bytes().y() / sizeof(TypeInput); + const size_t batch_stride_A = a->info()->strides_in_bytes()[3] / sizeof(TypeInput); + const size_t multi_stride_A = a->info()->strides_in_bytes()[4] / sizeof(TypeInput); + + const size_t output_hw = _cp.output_height * _cp.output_width; + const int batch_size = _cp.kernel_height * _cp.kernel_width * output_hw * sizeof(TypeInput); + const size_t batch_stride = batch_size / sizeof(TypeInput); + const int multi_size = batch_size * batches; + const size_t multi_stride = multi_size / sizeof(TypeInput); + + for (int64_t m = 0; m < multis; m++) + { + for (int64_t b = 0; b < batches; b++) + { + for (int64_t output_y = 0; output_y < _cp.output_height; output_y++) + { + for 
(int64_t output_x = 0; output_x < _cp.output_width; output_x++) + { + int64_t output_xy = (output_y * _cp.output_width) + output_x; + + for (int64_t kernel_y = 0; kernel_y < _cp.kernel_height; kernel_y++) + { + for (int64_t kernel_x = 0; kernel_x < _cp.kernel_width; kernel_x++) + { + int64_t input_x = (output_x * _cp.output_stride_w) + kernel_x - _cp.padding_left; + int64_t input_y = (output_y * _cp.output_stride_h) + kernel_y - _cp.padding_top; + int64_t kernel_xy = (kernel_y * _cp.kernel_width) + kernel_x; + int64_t input_xy = (input_y * _cp.input_width) + input_x; + + if (input_x < 0 || input_x >= _cp.input_width || input_y < 0 || input_y >= _cp.input_height) + { + _indirect_buf + .get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] = + _indirect_pad.data(); + } + else + { + _indirect_buf + .get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] = + A_ptr + (m * multi_stride_A + b * batch_stride_A + input_xy * stride_A); + } + } + } + } + } + } + } +} + +template <typename TypeInput, typename TypeOutput, class OutputStage> +void Fallback<TypeInput, TypeOutput, OutputStage>::configure_indirect(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *d, + const AsmGemmInfo &info) +{ + ARM_COMPUTE_ERROR_ON(!(info.method == AsmConvMethod::Conv || info.method == AsmConvMethod::Indirect)); + + float zeropad = 0.f; + if (is_data_type_quantized(a->data_type())) + { + zeropad = a->quantization_info().uniform().offset; + } + + const int64_t input_width = static_cast<int64_t>(a->tensor_shape()[1]); + const int64_t input_height = static_cast<int64_t>(a->tensor_shape()[2]); + const int64_t input_channels = static_cast<int64_t>(a->tensor_shape()[0]); + const int64_t kernel_width = static_cast<int64_t>(b->tensor_shape()[2]); + const int64_t kernel_height = static_cast<int64_t>(b->tensor_shape()[3]); + const int64_t output_width = static_cast<int64_t>(d->tensor_shape()[1]); + const int64_t output_height = static_cast<int64_t>(d->tensor_shape()[2]); + + _cp = {input_width, + input_height, + input_channels, + kernel_width, + kernel_height, + output_width, + output_height, + info.ps_info.stride().first, + info.ps_info.stride().second, + info.padding_top, + info.padding_left, + zeropad}; + + if (info.method == AsmConvMethod::Conv) + { + _gemm_kernel_asm->set_convolution_parameters(_cp); + } + + if (info.method == AsmConvMethod::Indirect) + { + const unsigned int multis = 1; + const unsigned int batches = a->tensor_shape().total_size_upper(3); + const unsigned int kernel_hw = _cp.kernel_width * _cp.kernel_height; + const unsigned int output_hw = _cp.output_width * _cp.output_height; + + using TypeInputPtr = TypeInput *; + const int batch_size = kernel_hw * output_hw * sizeof(TypeInputPtr); + const size_t batch_stride = batch_size / sizeof(TypeInputPtr); + const int multi_size = batch_size * batches; + const size_t multi_stride = multi_size / sizeof(TypeInputPtr); + + _indirect_buf = std::unique_ptr<const TypeInput *, free_delete>( + reinterpret_cast<const TypeInput **>(malloc(multi_size * multis))); + _indirect_arg = std::unique_ptr<const TypeInput *const *, free_delete>( + reinterpret_cast<const TypeInput *const **>(malloc(sizeof(TypeInput **) * kernel_hw * multis * batches))); + _indirect_pad = std::vector<TypeInput>(_cp.input_channels, TypeInput(zeropad)); + + // Set indirect argument + int64_t pos = 0; + for (int64_t m = 0; m < multis; m++) + { + for (int64_t b = 0; b < batches; b++) + { + for (int64_t kernel_xy = 0; kernel_xy < kernel_hw; 
kernel_xy++)
+                {
+                    (_indirect_arg.get())[pos++] =
+                        _indirect_buf.get() + m * multi_stride + b * batch_stride + kernel_xy * output_hw;
+                }
+            }
+        }
+
+        _gemm_kernel_asm->set_indirect_parameters(a->tensor_shape()[0], _indirect_arg.get());
+    }
+}
+
+template <typename TypeInput, typename TypeOutput, class OutputStage>
+void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo *a,
+                                                             const ITensorInfo *b,
+                                                             const ITensorInfo *c,
+                                                             ITensorInfo       *d,
+                                                             arm_gemm::GemmArgs args,
+                                                             const AsmGemmInfo &gemm_info,
+                                                             const OutputStage &os)
+{
+    _is_b_constant = b->are_values_constant();
+    _is_c_constant = c ? c->are_values_constant() : true;
+
+    _gemm_kernel_asm = arm_gemm::gemm<TypeInput, TypeOutput, OutputStage>(args, os);
+    if (_gemm_kernel_asm == nullptr)
+    {
+        // Configuration not supported: leave the function unconfigured
+        return;
+    }
+
+    arm_gemm::GemmConfig gemm_cfg = _gemm_kernel_asm->get_config();
+
+    // arm_compute wrapper for the Gemm object (see above)
+    auto acl_gemm_wrapper = std::make_unique<kernel::CpuGemmAssemblyWrapperKernel<TypeInput, TypeOutput>>();
+    ARM_COMPUTE_ERROR_ON(acl_gemm_wrapper == nullptr);
+    acl_gemm_wrapper->configure(_gemm_kernel_asm.get(), gemm_cfg.filter);
+    const size_t       workspace_size = _gemm_kernel_asm->get_working_size();
+    const unsigned int alignment      = 4096;
+    _workspace_info                   = TensorInfo(TensorShape(workspace_size), 1, DataType::U8);
+    _aux_mem[AsmGemmWorkspace] =
+        MemoryInfo(offset_int_vec(AsmGemmWorkspace), MemoryLifetime::Temporary, workspace_size, alignment);
+
+    // If the code in the braces below is disabled, ConvLayer deadlocks when threads > 1 and
+    // the shapes are In=1x1x1024 Weights=1x1x1024x1001 Biases=1001 Out=1x1x1001
+    {
+        const unsigned int window_size = _gemm_kernel_asm->get_window_size().total_size();
+        if (window_size < static_cast<unsigned int>(args._maxthreads))
+        {
+            _gemm_kernel_asm->set_nthreads(window_size);
+        }
+    }
+
+    _optimised_kernel = std::move(acl_gemm_wrapper);
+    _gemm_info        = gemm_info;
+
+    // Check if we need to pre-pretranspose B. Fixed format kernels need no pre-pretranspose.
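+    // Illustrative summary of the flag logic below (assuming a non-fixed-format kernel):
+    //
+    //   transpose_b | kernel pretransposes B | pretranspose can fuse transpose | extra CpuTranspose run
+    //   ------------+------------------------+---------------------------------+-----------------------
+    //   false       | any                    | any                             | no
+    //   true        | yes                    | yes                             | no (transpose is fused)
+    //   true        | yes                    | no                              | yes
+    //   true        | no                     | -                               | yes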
+    _B_pre_pretranspose_required = _gemm_info.transpose_b && !isVarWeightsKernel();
+    _B_pretranspose_required     = _gemm_kernel_asm->B_pretranspose_required();
+
+    const bool kernel_supports_transpose = _gemm_kernel_asm->B_pretranspose_supports_transpose();
+    const bool kernel_can_fuse_transpose = _B_pretranspose_required && kernel_supports_transpose;
+    _run_pre_pretranspose_b              = _B_pre_pretranspose_required && !kernel_can_fuse_transpose;
+
+    if (_run_pre_pretranspose_b)
+    {
+        _pre_pretranspose_b = std::make_unique<CpuTranspose>();
+        _pre_pretranspose_b->configure(b, &_pre_pretransposed_b_info);
+        MemoryLifetime lifetime;
+        if (_is_b_constant)
+        {
+            if (_B_pretranspose_required)
+            {
+                // The PrePretransposedB tensor is only used in prepare(), but is then succeeded by Pretranspose,
+                // so PrePretransposedB can be freed inside prepare()
+                lifetime = MemoryLifetime::Prepare;
+            }
+            else
+            {
+                // The PrePretransposedB tensor is only used in prepare(), but is the final transformation of B,
+                // so PrePretransposedB needs to persist beyond prepare()
+                lifetime = MemoryLifetime::Persistent;
+            }
+        }
+        else
+        {
+            // The PrePretransposedB tensor is always used in run() and doesn't need to persist
+            lifetime = MemoryLifetime::Temporary;
+        }
+        // Forcing 128-byte alignment (required by 32-bit kernels)
+        const unsigned int alignment = 128;
+        _aux_mem[PrePretransposedB] =
+            MemoryInfo(offset_int_vec(PrePretransposedB), lifetime, _pre_pretransposed_b_info.total_size(), alignment);
+    }
+
+    // Check for pre-transposed support
+    if (_B_pretranspose_required)
+    {
+        // Fixed format kernels need no pretranspose.
+        ARM_COMPUTE_ERROR_ON(arm_compute::is_fixed_format(
+            assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format)));
+        // Forcing 128-byte alignment (required by 32-bit kernels)
+        const unsigned int alignment           = 128;
+        const size_t       B_pretranspose_size = _gemm_kernel_asm->get_B_pretransposed_array_size();
+        _pretranspose_info                     = TensorInfo(TensorShape(B_pretranspose_size), 1, DataType::U8);
+        _aux_mem[Pretranspose] =
+            MemoryInfo(offset_int_vec(Pretranspose), MemoryLifetime::Persistent, B_pretranspose_size, alignment);
+    }
+
+    // Handle indirect GEMM convolution
+    if (gemm_info.method == AsmConvMethod::Conv || gemm_info.method == AsmConvMethod::Indirect)
+    {
+        configure_indirect(a, b, d, gemm_info);
+    }
+
+    if (std::is_same<OutputStage, arm_gemm::DequantizeFloat>::value)
+    {
+        // Output dequantization is just the two src scales multiplied together
+        _gemm_kernel_asm->set_dequantize_scale(a->quantization_info().uniform().scale *
+                                               b->quantization_info().uniform().scale);
+    }
+}
+
+template <typename TypeInput, typename TypeOutput, class OutputStage>
+void Fallback<TypeInput, TypeOutput, OutputStage>::prepare(ITensorPack &tensors)
+{
+    if (!_is_prepared)
+    {
+        auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+        auto c = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+        ARM_COMPUTE_ERROR_ON_NULLPTR(b);
+
+        // Set up the matrix bias in the assembly kernel; it's just a pointer to matrix C.
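+        // Note: for quantized GEMMs the bias is an S32 tensor that arm_gemm consumes in its
+        // requantization stage, so only a raw, offset-adjusted pointer is handed over here;
+        // no copy of C is taken at this point.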
+ if (c && c->info()->data_type() == DataType::S32) + { + _gemm_kernel_asm->set_quantized_bias( + reinterpret_cast<const int32_t *>(c->buffer() + c->info()->offset_first_element_in_bytes()), 0); + } + const ITensor *b_to_use = b; + + // Pre-pretranspose B if required + CpuAuxTensorHandler pre_pretransposed_b( + offset_int_vec(PrePretransposedB), _pre_pretransposed_b_info, tensors, + /*pack_inject: no need to inject into tensors*/ + false, + /*bypass_alloc: no need to allocate if pre-pretranspose B is not required as this handle will not be used*/ + !_run_pre_pretranspose_b); + + if (_run_pre_pretranspose_b) + { + ARM_COMPUTE_ERROR_ON(_pre_pretranspose_b == nullptr); + ITensorPack pre_pretranspose_pack{{ACL_SRC, b_to_use}, {ACL_DST, pre_pretransposed_b.get()}}; + _pre_pretranspose_b->run(pre_pretranspose_pack); + b_to_use = pre_pretransposed_b.get(); + } + + // Pretranspose B if required + if (_B_pretranspose_required) + { + // Fixed format kernels need no pretranspose. + ARM_COMPUTE_ERROR_ON(arm_compute::is_fixed_format( + assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format))); + const int ldb = b_to_use->info()->strides_in_bytes().y() / b_to_use->info()->element_size(); + const auto in1_ptr = reinterpret_cast<const TypeInput *>(b_to_use->buffer() + + b_to_use->info()->offset_first_element_in_bytes()); + const int multi_stride_b = b_to_use->info()->strides_in_bytes().z() / b_to_use->info()->element_size(); + + CpuAuxTensorHandler pretranspose(offset_int_vec(Pretranspose), _pretranspose_info, tensors, false); + + ARM_COMPUTE_ERROR_ON(pretranspose.get()->buffer() == nullptr); + + const bool kernel_supports_transpose = _gemm_kernel_asm->B_pretranspose_supports_transpose(); + run_parallel_pretranspose_B_array<TypeInput, TypeOutput>( + _gemm_kernel_asm.get(), pretranspose.get(), in1_ptr, ldb, multi_stride_b, + NEScheduler::get().num_threads(), _B_pre_pretranspose_required && kernel_supports_transpose); + + b->mark_as_unused(); + // Note that we don't need to mark b_to_use as unused, as if it's been assigned to pre_pretransposed_b, + // its memory will be auto-managed by the handler + } + + if (_gemm_info.method == AsmConvMethod::Indirect) + { + prepare_indirect_buffer(tensors); + } + + _is_prepared = true; + } +} + +template <typename TypeInput, typename TypeOutput, class OutputStage> +bool Fallback<TypeInput, TypeOutput, OutputStage>::is_configured() const +{ + return _optimised_kernel != nullptr; +} + +template <typename TypeInput, typename TypeOutput, class OutputStage> +experimental::MemoryRequirements Fallback<TypeInput, TypeOutput, OutputStage>::workspace() const +{ + return _aux_mem; +} + +template <typename TypeInput, typename TypeOutput, class OutputStage> +void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors) +{ + auto a = tensors.get_const_tensor(TensorType::ACL_SRC_0); + auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto c = tensors.get_const_tensor(TensorType::ACL_SRC_2); + auto d = tensors.get_tensor(TensorType::ACL_DST); + ARM_COMPUTE_ERROR_ON_NULLPTR(a, d); + + // Only update at runtime if the src quantization is dynamic + if (std::is_same<OutputStage, arm_gemm::DequantizeFloat>::value && + (a->info()->quantization_info().is_dynamic() || b->info()->quantization_info().is_dynamic())) + { + // Output dequantization is just the two src scales multiplied together + _gemm_kernel_asm->set_dequantize_scale(a->info()->quantization_info().uniform().scale * + b->info()->quantization_info().uniform().scale); + } 
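+
+    // The leading dimensions and strides computed below are in elements, not bytes.
+    // Purely illustrative example: for an F32 matrix a with a padded row pitch of
+    // 272 bytes, lda = 272 / sizeof(float) = 68 elements.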
+    int       lda = a->info()->strides_in_bytes().y() / a->info()->element_size();
+    int       ldb = 0;
+    const int ldd = d->info()->strides_in_bytes().y() / d->info()->element_size();
+
+    const size_t a_batch_idx = _gemm_info.reinterpret_input_as_3d != 0 ? 3 : 2;
+    const size_t a_multi_idx = a_batch_idx + 1;
+    const size_t d_batch_idx = _gemm_info.depth_output_gemm3d != 0 ? 3 : 2;
+    const size_t d_multi_idx = d_batch_idx + 1;
+
+    int       batch_stride_a = a->info()->strides_in_bytes()[a_batch_idx] / a->info()->element_size();
+    const int batch_stride_d = d->info()->strides_in_bytes()[d_batch_idx] / d->info()->element_size();
+
+    int       multi_stride_a = a->info()->strides_in_bytes()[a_multi_idx] / a->info()->element_size();
+    int       multi_stride_b = 0;
+    const int multi_stride_d = d->info()->strides_in_bytes()[d_multi_idx] / d->info()->element_size();
+
+    auto in0_ptr = reinterpret_cast<const TypeInput *>(a->buffer() + a->info()->offset_first_element_in_bytes());
+    const TypeInput *in1_ptr = nullptr;
+    auto out_ptr = reinterpret_cast<TypeOutput *>(d->buffer() + d->info()->offset_first_element_in_bytes());
+
+    const ITensor *b_to_use = b;
+
+    // Pre-pretranspose B if required
+    CpuAuxTensorHandler pre_pretransposed_b(
+        offset_int_vec(PrePretransposedB), _pre_pretransposed_b_info, tensors,
+        false /*pack_inject: no need to inject into tensors*/,
+        !_run_pre_pretranspose_b /*bypass_alloc: no need to allocate if pre-pretranspose B is not required as this handle will not be used*/);
+    if (b_to_use && !_is_b_constant && _run_pre_pretranspose_b)
+    {
+        ARM_COMPUTE_ERROR_ON(_pre_pretranspose_b == nullptr);
+        ITensorPack pre_pretranspose_pack{{ACL_SRC, b_to_use}, {ACL_DST, pre_pretransposed_b.get()}};
+        _pre_pretranspose_b->run(pre_pretranspose_pack);
+        b_to_use = pre_pretransposed_b.get();
+    }
+
+    // Check if B is pre-transposed and de-reference it if not
+    if (b_to_use && !_gemm_kernel_asm->B_is_pretransposed())
+    {
+        ldb            = b_to_use->info()->strides_in_bytes().y() / b_to_use->info()->element_size();
+        multi_stride_b = b_to_use->info()->strides_in_bytes().z() / b_to_use->info()->element_size();
+        in1_ptr =
+            reinterpret_cast<const TypeInput *>(b_to_use->buffer() + b_to_use->info()->offset_first_element_in_bytes());
+    }
+
+    // If either the weights or the biases are non-constant, the pretranspose has to be re-run on every invocation
+    if ((b_to_use && !_is_b_constant) || (c && !_is_c_constant && c->info()->data_type() == DataType::S32))
+    {
+        if (c && c->info()->data_type() == DataType::S32)
+        {
+            _gemm_kernel_asm->set_quantized_bias(
+                reinterpret_cast<const int32_t *>(c->buffer() + c->info()->offset_first_element_in_bytes()), 0);
+        }
+
+        // Pretranspose B if required
+        if (b_to_use && _B_pretranspose_required)
+        {
+            // Fixed format kernels need no pretranspose.
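+            // What follows is effectively two paths: for constant B only the bias
+            // contribution is refreshed (requantize_bias()), while for non-constant B
+            // the full pretranspose is redone in parallel on every run.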
+            ARM_COMPUTE_ERROR_ON(arm_compute::is_fixed_format(
+                assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format)));
+            const int  ldb   = b_to_use->info()->strides_in_bytes().y() / b_to_use->info()->element_size();
+            const auto b_ptr = reinterpret_cast<const TypeInput *>(b_to_use->buffer() +
+                                                                   b_to_use->info()->offset_first_element_in_bytes());
+            const int multi_stride_b = b_to_use->info()->strides_in_bytes().z() / b_to_use->info()->element_size();
+
+            CpuAuxTensorHandler pretranspose(offset_int_vec(Pretranspose), _pretranspose_info, tensors, true);
+            ARM_COMPUTE_ERROR_ON(pretranspose.get()->buffer() == nullptr);
+
+            if (_is_b_constant)
+            {
+                _gemm_kernel_asm->requantize_bias(pretranspose.get()->buffer(), b_ptr, ldb, multi_stride_b);
+            }
+            else
+            {
+                const bool kernel_supports_transpose = _gemm_kernel_asm->B_pretranspose_supports_transpose();
+                run_parallel_pretranspose_B_array<TypeInput, TypeOutput>(
+                    _gemm_kernel_asm.get(), pretranspose.get(), b_ptr, ldb, multi_stride_b,
+                    NEScheduler::get().num_threads(), _B_pre_pretranspose_required && kernel_supports_transpose);
+            }
+        }
+    }
+
+    const auto scheduling_hint = scheduling_hint_heuristic(_kernel_info.method, d->info()->data_type());
+
+    // Set the workspace if needed and reset the number of threads, as the buffer manager gets re-created with max_threads
+    CpuAuxTensorHandler workspace(offset_int_vec(AsmGemmWorkspace), _workspace_info, tensors, false);
+    if (workspace.get()->buffer() != nullptr)
+    {
+        _gemm_kernel_asm->set_working_space(reinterpret_cast<void *>(workspace.get()->buffer()));
+        const unsigned int split_dim   = scheduling_hint.split_dimension();
+        const unsigned int window_size = _gemm_kernel_asm->get_window_size().total_size();
+        unsigned int       num_threads = NEScheduler::get().num_threads();
+        if (window_size < num_threads)
+        {
+            num_threads = window_size;
+        }
+        if (split_dim != IScheduler::split_dimensions_all)
+        {
+            // Make sure the kernel does not expect more threads than we can actually spawn
+            const unsigned int num_iterations = _optimised_kernel.get()->window().num_iterations(split_dim);
+            num_threads                       = std::min(num_iterations, num_threads);
+        }
+        _gemm_kernel_asm->set_nthreads(num_threads);
+    }
+
+    // Prepare the assembly kernel
+    prepare(tensors);
+
+    // Set up the matrix bias in the assembly kernel; it's just a pointer to matrix C.
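+    // A non-S32 bias (e.g. an F32 bias for a float GEMM) is passed straight through
+    // set_arrays() below; the S32 quantized-bias case has already been handled via
+    // set_quantized_bias().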
+ TypeOutput *bias = nullptr; + if (c && c->info()->data_type() != DataType::S32) + { + bias = reinterpret_cast<TypeOutput *>(c->buffer() + c->info()->offset_first_element_in_bytes()); + } + + if (_gemm_info.method == AsmConvMethod::Indirect) + { + in0_ptr = nullptr; + lda = 0; + batch_stride_a = 0; + multi_stride_a = 0; + } + + // Set gemm parameters + _gemm_kernel_asm->set_arrays(in0_ptr, lda, batch_stride_a, multi_stride_a, in1_ptr, ldb, multi_stride_b, out_ptr, + ldd, batch_stride_d, multi_stride_d, bias, 0); + // Schedule + NEScheduler::get().schedule(_optimised_kernel.get(), scheduling_hint); +} + +template <typename TypeInput, typename TypeOutput> +void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + ITensorInfo *d, + arm_gemm::Activation activation, + const AsmGemmInfo &info) +{ + Params p = extract_parameters(a, b, d, info); + const CPUInfo &ci = NEScheduler::get().cpu_info(); + unsigned int num_threads = NEScheduler::get().num_threads(); + + arm_gemm::GemmConfig cfg; + cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format); + arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads, + info.fixed_format, info.fast_mode, info.accumulate, &cfg); + + // Create arm_gemm fallback + auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput>>(); + fallback->configure(a, b, c, d, args, info); + arm_gemm = std::move(fallback); +} + +template <typename TypeInput, typename TypeOutput> +void create_arm_gemm_dequant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + ITensorInfo *d, + arm_gemm::Activation activation, + const AsmGemmInfo &info) +{ + ARM_COMPUTE_UNUSED(activation); + + Params p = extract_parameters(a, b, d, info); + const CPUInfo &ci = NEScheduler::get().cpu_info(); + const unsigned int num_threads = NEScheduler::get().num_threads(); + + arm_gemm::GemmConfig cfg; + cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format); + arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads, + info.fixed_format, info.fast_mode, info.accumulate, &cfg); + + // Create arm_gemm fallback + auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput, arm_gemm::DequantizeFloat>>(); + + // Configure requantization info + const GEMMLowpOutputStageInfo os_info = info.output_stage; + + arm_gemm::DequantizeFloat gemm_dequant_info{}; + gemm_dequant_info = arm_gemm::DequantizeFloat(d->quantization_info().uniform().scale); + + fallback->configure(a, b, c, d, args, info, gemm_dequant_info); + arm_gemm = std::move(fallback); +} + +template <typename TypeInput, typename TypeOutput> +void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + ITensorInfo *d, + arm_gemm::Activation activation, + const AsmGemmInfo &info) +{ + ARM_COMPUTE_UNUSED(activation); + Params p = extract_parameters(a, b, d, info); + const CPUInfo &ci = NEScheduler::get().cpu_info(); + const unsigned int num_threads = NEScheduler::get().num_threads(); + + arm_gemm::GemmConfig cfg; + cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format); + arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, 
num_threads, + info.fixed_format, info.fast_mode, info.accumulate, &cfg); + + // Create arm_gemm fallback + auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput, arm_gemm::Requantize32>>(); + + // Configure requantization info + const int32_t negation = info.negated_offsets ? 1 : -1; + const int32_t a_offset = -a->quantization_info().uniform().offset * negation; + const int32_t b_offset = -b->quantization_info().uniform().offset * negation; + const GEMMLowpOutputStageInfo os_info = info.output_stage; + + arm_gemm::Requantize32 gemm_requant_info{}; + if (os_info.gemmlowp_shifts.size() > 1) + { + const auto requantize_data = + fallback->set_requantize_data(os_info.gemmlowp_shifts, os_info.gemmlowp_multipliers); + gemm_requant_info = arm_gemm::Requantize32( + nullptr, 0, a_offset, b_offset, os_info.gemmlowp_offset, + (std::get<0>(requantize_data)) ? std::get<1>(requantize_data) : nullptr, std::get<2>(requantize_data), + std::get<3>(requantize_data), os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound); + } + else + { + gemm_requant_info = + arm_gemm::Requantize32(nullptr, 0, a_offset, b_offset, os_info.gemmlowp_offset, -os_info.gemmlowp_shift, + os_info.gemmlowp_multiplier, os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound); + } + + // Configure fallback + fallback->configure(a, b, c, d, args, info, gemm_requant_info); + arm_gemm = std::move(fallback); +} +} //namespace + +CpuGemmAssemblyDispatch::CpuGemmAssemblyDispatch() : _arm_gemm(nullptr) +{ +} + +Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *d, + const AsmGemmInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d); + ARM_COMPUTE_UNUSED(c); + arm_gemm::Activation act = assembly_utils::map_to_arm_gemm_activation(info.activation_info); + Params p = extract_parameters(a, b, d, info); + const CPUInfo &ci = NEScheduler::get().cpu_info(); + unsigned int num_threads = NEScheduler::get().num_threads(); + arm_gemm::GemmConfig cfg; + cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format); + arm_gemm::WeightFormat arm_gemm_expected_wf = assembly_utils::map_to_arm_gemm_weight_format(expected_weight_format); + arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, act, num_threads, + info.fixed_format, info.fast_mode, info.accumulate, &cfg); + // TODO: Incorporate info.transpose_b COMPMID-6595 + switch (a->data_type()) + { + case DataType::F32: + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !(arm_gemm::has_opt_gemm<float, float, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})), + "We could not find an optimized kernel for F32 input"); + break; +#ifdef __aarch64__ + case DataType::U8: + case DataType::QASYMM8: + if (d->data_type() == DataType::S32) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !(arm_gemm::has_opt_gemm<uint8_t, uint32_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})), + "We could not find an optimized kernel for U8/QASYMM8 input and U32 output"); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !(arm_gemm::has_opt_gemm<uint8_t, uint8_t, arm_gemm::Requantize32>(arm_gemm_expected_wf, args, {})), + "We could not find an optimized kernel for U8 input and U8 output"); + } + break; + case DataType::S8: + case DataType::QASYMM8_SIGNED: + if (d->data_type() == DataType::S32) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !(arm_gemm::has_opt_gemm<int8_t, int32_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})), + "We could not 
find an optimized kernel for S8/QASYMM8_SIGNED input and S32 output");
+            }
+            else
+            {
+                ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+                    !(arm_gemm::has_opt_gemm<int8_t, int8_t, arm_gemm::Requantize32>(arm_gemm_expected_wf, args, {})),
+                    "We could not find an optimized kernel for S8 input and S8 output");
+            }
+            break;
+#endif /* __aarch64__ */
+
+#if defined(ARM_COMPUTE_ENABLE_BF16)
+        case DataType::BFLOAT16:
+        {
+            if (d->data_type() == DataType::BFLOAT16)
+            {
+                ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+                    !(arm_gemm::has_opt_gemm<bfloat16, bfloat16, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
+                    "We could not find an optimized kernel for BFLOAT16 input and BFLOAT16 output");
+            }
+            else
+            {
+                ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+                    !(arm_gemm::has_opt_gemm<bfloat16, float, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
+                    "We could not find an optimized kernel for BFLOAT16 input and F32 output");
+            }
+            break;
+        }
+#endif /* defined(ARM_COMPUTE_ENABLE_BF16) */
+
+#if defined(ENABLE_FP16_KERNELS)
+        case DataType::F16:
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+                !(arm_gemm::has_opt_gemm<float16_t, float16_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
+                "We could not find an optimized kernel for F16 input and F16 output");
+            break;
+#endif /* ENABLE_FP16_KERNELS */
+        default:
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(true, "Unsupported type. Could not find a kernel");
+            break;
+    }
+    expected_weight_format = assembly_utils::map_to_arm_compute_weight_format(arm_gemm_expected_wf);
+
+    return Status{};
+}
+
+Status CpuGemmAssemblyDispatch::validate(
+    const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info)
+{
+    ARM_COMPUTE_UNUSED(c, info);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(a, b, d);
+    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(a);
+    ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(a);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(info.reshape_b_only_on_first_run),
+                                    "Assembly kernel will not be executed when reshape_b_only_on_first_run is false");
+
+#ifndef __aarch64__
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->element_size() == 1, "8bit integer types only supported for aarch64");
+#endif /* __aarch64__ */
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8, DataType::QASYMM8,
+                                                         DataType::QASYMM8_SIGNED, DataType::S8, DataType::BFLOAT16,
+                                                         DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+        b, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::S8,
+        DataType::BFLOAT16, DataType::F16, DataType::F32);
+    if (is_data_type_quantized_per_channel(b->data_type()))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8_SIGNED, DataType::S8);
+    }
+    else if (is_fixed_format_fast_math(info.weight_format))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(a, DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(b, DataType::BFLOAT16);
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
+    }
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F32 && d->data_type() != DataType::F32,
+                                    "Only F32 output supported for F32 input");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F16 && d->data_type() != DataType::F16,
+                                    "Only F16 output supported for F16 input");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::BFLOAT16 &&
+                                        (d->data_type() != DataType::F32 && d->data_type() != DataType::BFLOAT16),
+                                    "Only F32/BFLOAT16 output supported for BFLOAT16 input");
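+    // The remaining checks pin down the integer combinations: U8 -> U32, S8 -> S32 and
+    // QASYMM8 -> {QASYMM8, S32}; anything else is rejected here rather than at
+    // kernel-selection time.
+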
ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::U8 && d->data_type() != DataType::U32, + "Only U32 output supported for U8 input"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::S8 && d->data_type() != DataType::S32, + "Only S32 output supported for S8 input"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::QASYMM8 && + (d->data_type() != DataType::QASYMM8 && d->data_type() != DataType::S32), + "Only QASYMM8/S32 output supported for QASYMM8 input"); + arm_compute::WeightFormat expected_weight_format = arm_compute::WeightFormat::UNSPECIFIED; + const Status ret = CpuGemmAssemblyDispatch::has_opt_impl(expected_weight_format, a, b, c, d, info); + if ((bool)ret && expected_weight_format != arm_compute::WeightFormat::ANY) + { + // Correctness check: if the format expected by the kernel is + // not "any", make sure that the one found matches the format + // intended by the caller. + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (expected_weight_format != info.weight_format), + "The format expected by the kernel does not correspond with the one requested by the user."); + } + return ret; +} + +bool CpuGemmAssemblyDispatch::is_activation_supported(const ActivationLayerInfo &activation) +{ + arm_gemm::Activation act = assembly_utils::map_to_arm_gemm_activation(activation); + return act.type != arm_gemm::Activation::Type::None; +} + +void CpuGemmAssemblyDispatch::configure( + const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d); + arm_gemm::Activation act = assembly_utils::map_to_arm_gemm_activation(info.activation_info); + + //If we don't support a combination of data types, silently return: it is the caller's responsibility to check if configure() was successful via is_configured() + if (!CpuGemmAssemblyDispatch::validate(a, b, c, d, info)) + { + return; + } + + switch (a->data_type()) + { + case DataType::F32: + create_arm_gemm<float, float>(_arm_gemm, a, b, c, d, act, info); + break; +#ifdef __aarch64__ + case DataType::U8: + case DataType::QASYMM8: + if (d->data_type() == DataType::S32) + { + create_arm_gemm<uint8_t, uint32_t>(_arm_gemm, a, b, c, d, act, info); + } + else + { + create_arm_gemm_quant<uint8_t, uint8_t>(_arm_gemm, a, b, c, d, act, info); + } + break; + case DataType::S8: + case DataType::QASYMM8_SIGNED: + if (d->data_type() == DataType::S32) + { + create_arm_gemm<int8_t, int32_t>(_arm_gemm, a, b, c, d, act, info); + } + else if (d->data_type() == DataType::F32) + { + create_arm_gemm_dequant<int8_t, float>(_arm_gemm, a, b, c, d, act, info); + } + else + { + create_arm_gemm_quant<int8_t, int8_t>(_arm_gemm, a, b, c, d, act, info); + } + break; +#endif /* __aarch64__ */ +#if defined(ARM_COMPUTE_ENABLE_BF16) + case DataType::BFLOAT16: + if (d->data_type() == DataType::BFLOAT16) + { + create_arm_gemm<bfloat16, bfloat16>(_arm_gemm, a, b, c, d, act, info); + } + else + { + create_arm_gemm<bfloat16, float>(_arm_gemm, a, b, c, d, act, info); + } + break; +#endif /* defined(ARM_COMPUTE_ENABLE_BF16) */ +#ifdef ENABLE_FP16_KERNELS + case DataType::F16: + create_arm_gemm<float16_t, float16_t>(_arm_gemm, a, b, c, d, act, info); + break; +#endif /* ENABLE_FP16_KERNELS */ + default: + break; + } +} + +void CpuGemmAssemblyDispatch::prepare(ITensorPack &tensors) +{ + ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr); + _arm_gemm->prepare(tensors); +} + +bool CpuGemmAssemblyDispatch::is_configured() const +{ + return _arm_gemm && _arm_gemm->is_configured(); +} + +void 
CpuGemmAssemblyDispatch::run(ITensorPack &tensors) +{ + ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr); + _arm_gemm->run(tensors); +} + +experimental::MemoryRequirements CpuGemmAssemblyDispatch::workspace() const +{ + ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr); + return _arm_gemm->workspace(); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h new file mode 100644 index 0000000000..44c5c189a5 --- /dev/null +++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2018-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_CPU_OPERATORS_INTERNAL_CPUGEMMASSEMBLYDISPATCH_H +#define ACL_SRC_CPU_OPERATORS_INTERNAL_CPUGEMMASSEMBLYDISPATCH_H + +#include "arm_compute/function_info/ActivationLayerInfo.h" + +#include "src/core/common/Macros.h" +#include "src/cpu/ICpuOperator.h" + +namespace arm_compute +{ +namespace cpu +{ +/* Convolution method supported by the assembly gemm interface */ +enum class AsmConvMethod +{ + Im2Col, + Indirect, + Conv +}; + +struct AsmGemmInfo +{ + AsmConvMethod method{AsmConvMethod::Im2Col}; + PadStrideInfo ps_info{}; + ActivationLayerInfo activation_info{}; + GEMMLowpOutputStageInfo output_stage{}; + bool negated_offsets{true}; + bool reinterpret_input_as_3d{false}; + bool depth_output_gemm3d{false}; + int64_t padding_top{0}; + int64_t padding_left{0}; + float padding_value{0.f}; + bool fast_mode{false}; + bool fixed_format{false}; + arm_compute::WeightFormat weight_format{arm_compute::WeightFormat::UNSPECIFIED}; + bool reshape_b_only_on_first_run{true}; + bool accumulate{false}; + /** Whether we want to perform an additional transpose of b before passing it to gemm or pretranspose_B_array + * @note This transpose b operation is also considered a form of "reshape" or "transform", so should be counted for + * by the reshape_b_only_on_first_run flag + * @note This flag will be silently ignored (assumed to be false) when the weight_format is a fixed format. 
This is because
+     * fixed format kernels do not accept weights (B) with any prior transformations.
+     */
+    bool transpose_b{false};
+};
+
+/** Assembly kernel glue */
+class CpuGemmAssemblyDispatch : public ICpuOperator
+{
+public:
+    /** Constructor */
+    CpuGemmAssemblyDispatch();
+    /** Default destructor */
+    ~CpuGemmAssemblyDispatch() = default;
+
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmAssemblyDispatch);
+
+    class IFallback
+    {
+    public:
+        virtual void                             run(ITensorPack &tensors)     = 0;
+        virtual void                             prepare(ITensorPack &tensors) = 0;
+        virtual experimental::MemoryRequirements workspace() const             = 0;
+        virtual bool                             is_configured() const         = 0;
+        virtual bool                             isVarWeightsKernel() const    = 0;
+        virtual ~IFallback()                                                   = default;
+    };
+
+public:
+    /** If supported, create a Compute Library function, else fall back to the arm_gemm function.
+     *
+     * @note Configuring "batches"
+     * The shapes of @p a @p b and @p d are arranged as follows:
+     *     Lowest dimension <-> Highest dimension
+     * a: [K, M, Batch, Multi]
+     * b: [N, K, Multi]
+     * d: [N, M, Batch, Multi]
+     *
+     * "Batch" means that "Batch" number of MxK slices of tensor a are each multiplied with a single KxN slice of b
+     * "Multi" means that "Multi" number of individual multiplications of a with b are performed
+     *
+     * E.g. the following are some example input shape configurations
+     *
+     * (1) Normal 2D gemm
+     * a: [K=3, M=4]
+     * b: [N=5, K=3]
+     * d: [N=5, M=4]
+     *
+     * (2) Batches of a sharing b (e.g. gemm-based batched convolution where b is the shared weights)
+     * a: [K=3, M=4, Batch=9]
+     * b: [N=5, K=3]
+     * d: [N=5, M=4, Batch=9]
+     *
+     * (3) "Batches" of independent gemm (e.g. batched matmul)
+     * a: [K=3, M=4, Batch=1, Multi=7]
+     * b: [N=5, K=3, Multi=7]
+     * d: [N=5, M=4, Batch=1, Multi=7]
+     *
+     * (4) "Batches" of independent gemm where b is also shared
+     * a: [K=3, M=4, Batch=4, Multi=7]
+     * b: [N=5, K=3, Multi=7]
+     * d: [N=5, M=4, Batch=4, Multi=7]
+     *
+     * @param[in]  a    Input tensor (Matrix A)
+     * @param[in]  b    Input tensor (Matrix B)
+     * @param[in]  c    Input tensor (Matrix C) used to pass the bias for quantized calculations
+     * @param[out] d    Output tensor to store the result of matrix multiplication. Data type supported: same as @p a.
+     * @param[in]  info GEMM meta-data
+     */
+    void configure(
+        const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info);
+
+    /** Indicates whether or not this function can be used to process the given parameters.
+     *
+     * @param[in] a    Input tensor info (Matrix A)
+     * @param[in] b    Input tensor info (Matrix B)
+     * @param[in] c    Input tensor info (Matrix C) used to pass the bias for quantized calculations
+     * @param[in] d    Output tensor to store the result of matrix multiplication. Data type supported: same as @p a.
+     * @param[in] info GEMM meta-data
+     *
+     * @return a status.
+     */
+    static Status validate(const ITensorInfo *a,
+                           const ITensorInfo *b,
+                           const ITensorInfo *c,
+                           const ITensorInfo *d,
+                           const AsmGemmInfo &info);
+
+    /** Indicates whether or not there is an optimal assembly implementation that can be used to process the given parameters.
+     *
+     * This method has the same use as @ref
+     * NEGEMMConvolutionLayer::has_opt_impl, with the only caveat that
+     * the value of arm_compute::WeightFormat needs to be passed via the
+     * parameter info.
+     *
+     * @return a status.
+     */
+    static Status has_opt_impl(arm_compute::WeightFormat &weight_format,
+                               const ITensorInfo         *a,
+                               const ITensorInfo         *b,
+                               const ITensorInfo         *c,
+                               const ITensorInfo         *d,
+                               const AsmGemmInfo         &info);
+    /** Checks if activation is supported by the gemm assembly dispatcher
+     *
+     * @param[in] activation Activation to check
+     *
+     * @return True if activation is supported else false
+     */
+    static bool is_activation_supported(const ActivationLayerInfo &activation);
+    /** Was the function successfully configured?
+     *
+     * @return True if the function is configured and ready to run
+     */
+    bool is_configured() const;
+    /** Indicates if the convolution executes in variable weights mode.
+     *
+     * Similar to @ref CpuGemm::isVarWeightsKernel
+     */
+    bool isVarWeightsKernel() const
+    {
+        return _arm_gemm && _arm_gemm->isVarWeightsKernel();
+    }
+
+    // Inherited methods overridden:
+    void                             prepare(ITensorPack &tensors) override;
+    void                             run(ITensorPack &tensors) override;
+    experimental::MemoryRequirements workspace() const override;
+
+private:
+    std::unique_ptr<IFallback> _arm_gemm; /**< Interface for the arm_gemm fallback */
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif // ACL_SRC_CPU_OPERATORS_INTERNAL_CPUGEMMASSEMBLYDISPATCH_H