Diffstat (limited to 'src/gpu/cl/operators')
76 files changed, 10369 insertions, 0 deletions
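Every operator added in this series follows the same three-step pattern, visible in ClActivation.cpp below: a static validate() that checks tensor metadata, a configure() that builds the kernel from a compile context and ITensorInfo descriptors, and a run() that binds actual memory late through an ITensorPack. A minimal sketch of that flow (illustrative only, not part of the patch; it assumes the CL backend is set up via CLScheduler::get().default_init() and reuses the default compile context, as ClContext::create_activation does below):

    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "src/gpu/cl/operators/ClActivation.h"

    using namespace arm_compute;

    void run_activation_example()
    {
        CLScheduler::get().default_init(); // one-time CL backend setup

        // Operators are configured on tensor metadata only
        TensorInfo src_info(TensorShape(16U, 16U), 1, DataType::F32);
        TensorInfo dst_info(TensorShape(16U, 16U), 1, DataType::F32);
        ActivationLayerInfo act(ActivationLayerInfo::ActivationFunction::RELU);

        opencl::ClActivation op;
        ARM_COMPUTE_ERROR_THROW_ON(opencl::ClActivation::validate(&src_info, &dst_info, act));
        op.configure(CLKernelLibrary::get().get_compile_context(), &src_info, &dst_info, act);

        // Memory is bound at run time through an ITensorPack
        CLTensor src, dst;
        src.allocator()->init(src_info);
        dst.allocator()->init(dst_info);
        src.allocator()->allocate();
        dst.allocator()->allocate();

        ITensorPack pack;
        pack.add_tensor(TensorType::ACL_SRC, &src);
        pack.add_tensor(TensorType::ACL_DST, &dst);
        op.run(pack);
        CLScheduler::get().sync();
    }

Because configuration touches only metadata, a configured operator can be reused with any tensors whose infos match the ones it was configured with.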
diff --git a/src/gpu/cl/operators/ClActivation.cpp b/src/gpu/cl/operators/ClActivation.cpp new file mode 100644 index 0000000000..66877ebcec --- /dev/null +++ b/src/gpu/cl/operators/ClActivation.cpp @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2016-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/operators/ClActivation.h" + +#include "src/common/IOperator.h" +#include "src/common/utils/LegacySupport.h" +#include "src/common/utils/Log.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/ClContext.h" +#include "src/gpu/cl/kernels/ClActivationKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +void ClActivation::configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_LOG_PARAMS(src, dst, act_info); + auto k = std::make_unique<kernels::ClActivationKernel>(); + k->configure(compile_context, src, dst, act_info); + _kernel = std::move(k); +} + +Status ClActivation::validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +{ + return kernels::ClActivationKernel::validate(src, dst, act_info); +} +} // namespace opencl + +namespace gpu +{ +namespace opencl +{ +std::tuple<IOperator *, StatusCode> ClContext::create_activation(const AclTensorDescriptor &src, + const AclTensorDescriptor &dst, + const AclActivationDescriptor &act, + bool is_validate) +{ + TensorInfo src_info = detail::convert_to_legacy_tensor_info(src); + TensorInfo dst_info = detail::convert_to_legacy_tensor_info(dst); + auto info = detail::convert_to_activation_info(act); + + if (is_validate && !bool(arm_compute::opencl::ClActivation::validate(&src_info.set_is_resizable(false), + &dst_info.set_is_resizable(false), info))) + { + return std::make_tuple(nullptr, StatusCode::UnsupportedConfig); + } + + auto act_op = std::make_unique<arm_compute::opencl::ClActivation>(); + act_op->configure(CLKernelLibrary::get().get_compile_context(), &src_info, &dst_info, info); + + auto op = new arm_compute::IOperator(static_cast<IContext *>(this)); + if (op == nullptr) + { + ARM_COMPUTE_LOG_ERROR_ACL("Couldn't allocate internal resources"); + return std::make_tuple(nullptr, StatusCode::OutOfMemory); + } + op->set_internal_operator(std::move(act_op)); + + return std::make_tuple(op, StatusCode::Success); +} +} // namespace opencl +} // namespace gpu +} // namespace arm_compute diff --git 
a/src/gpu/cl/operators/ClActivation.h b/src/gpu/cl/operators/ClActivation.h new file mode 100644 index 0000000000..4f25bb5f24 --- /dev/null +++ b/src/gpu/cl/operators/ClActivation.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2021, 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_ACTIVATION_H +#define ARM_COMPUTE_CL_ACTIVATION_H + +#include "arm_compute/function_info/ActivationLayerInfo.h" + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClOperator.h" + +namespace arm_compute +{ +namespace opencl +{ +/** Basic function to run @ref kernels::ClActivationKernel */ +class ClActivation : public IClOperator +{ +public: + /** Configure operator for a given list of arguments + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32. + * @param[out] dst Destination tensor info. Data type supported: same as @p src + * @param[in] activation_info Activation layer parameters. + */ + void configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const ActivationLayerInfo &activation_info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClActivation::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info); +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_ACTIVATION_H */ diff --git a/src/gpu/cl/operators/ClAdd.cpp b/src/gpu/cl/operators/ClAdd.cpp new file mode 100644 index 0000000000..b58d0df58d --- /dev/null +++ b/src/gpu/cl/operators/ClAdd.cpp @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/operators/ClAdd.h" + +#include "src/common/utils/Log.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/ClElementwiseKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +void ClAdd::configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, policy, act_info); + auto k = std::make_unique<kernels::ClSaturatedArithmeticKernel>(); + k->configure(compile_context, ArithmeticOperation::ADD, src1, src2, dst, policy, act_info); + _kernel = std::move(k); +} + +Status ClAdd::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) +{ + return kernels::ClSaturatedArithmeticKernel::validate(ArithmeticOperation::ADD, src1, src2, dst, policy, act_info); +} +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClAdd.h b/src/gpu/cl/operators/ClAdd.h new file mode 100644 index 0000000000..7aed902f5d --- /dev/null +++ b/src/gpu/cl/operators/ClAdd.h @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2021, 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_ADD_H +#define ARM_COMPUTE_CL_ADD_H + +#include "arm_compute/function_info/ActivationLayerInfo.h" + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClOperator.h" + +namespace arm_compute +{ +namespace opencl +{ +/** Basic function to run arithmetic addition + * + * @note The tensor data type for the inputs must be U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. + * @note The function performs an arithmetic addition between two tensors. + */ +class ClAdd : public IClOperator +{ +public: + /** Configure function for a given list of arguments. 
+ * + * Valid configurations (src1,src2) -> dst : + * + * - (U8,U8) -> U8 + * - (U8,U8) -> S16 + * - (S16,U8) -> S16 + * - (U8,S16) -> S16 + * - (S16,S16) -> S16 + * - (S32,S32) -> S32 + * - (F16,F16) -> F16 + * - (F32,F32) -> F32 + * - (QASYMM8,QASYMM8) -> QASYMM8 + * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED + * - (QSYMM16,QSYMM16) -> QSYMM16 + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] src1 First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. + * The source tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[in, out] src2 Second source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. + * The source tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[out] dst Destination tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. + * @param[in] policy Policy to use to handle overflow. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + */ + void configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClAdd::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_ADD_H */ diff --git a/src/gpu/cl/operators/ClCast.cpp b/src/gpu/cl/operators/ClCast.cpp new file mode 100644 index 0000000000..8f26ef003d --- /dev/null +++ b/src/gpu/cl/operators/ClCast.cpp @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/gpu/cl/operators/ClCast.h" + +#include "src/common/utils/Log.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/ClCastKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +void ClCast::configure(const ClCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + ConvertPolicy policy) +{ + ARM_COMPUTE_LOG_PARAMS(src, dst, policy); + auto k = std::make_unique<kernels::ClCastKernel>(); + k->configure(compile_context, src, dst, policy); + _kernel = std::move(k); +} + +Status ClCast::validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy) +{ + return kernels::ClCastKernel::validate(src, dst, policy); +} +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClCast.h b/src/gpu/cl/operators/ClCast.h new file mode 100644 index 0000000000..25d2293673 --- /dev/null +++ b/src/gpu/cl/operators/ClCast.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_CAST_H +#define ARM_COMPUTE_CL_CAST_H + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClOperator.h" + +namespace arm_compute +{ +namespace opencl +{ +/** Basic function to run @ref kernels::ClCastKernel */ +class ClCast : public IClOperator +{ +public: + /** Configure operator for a given list of arguments + * + * @note Input data type must be different than output data type. + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------------------------------| + * |U8 | S8, U16, S16, U32, S32, F16, F32 | + * |U16 | U8, S8, S16, U32, S32, F16, F32 | + * |S16 | U8, S8, U16, U32, S32, F16, F32 | + * |U32 | U8, S8, U16, S16, S32, F16, F32 | + * |S32 | U8, S8, U16, S16, U32, F16, F32 | + * |F16 | U8, S8, U16, S16, U32, F32 | + * |F32 | U8, S8, U16, S16, U32, F16 | + * + * @param[in] compile_context The compile context to be used. + * @param[in] src The source tensor to convert. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32. + * @param[out] dst The destinatio tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32. + * @param[in] policy Conversion policy. 
+ */ + void + configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClCast::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy); +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_CAST_H */ diff --git a/src/gpu/cl/operators/ClConcatenate.cpp b/src/gpu/cl/operators/ClConcatenate.cpp new file mode 100644 index 0000000000..31018b9768 --- /dev/null +++ b/src/gpu/cl/operators/ClConcatenate.cpp @@ -0,0 +1,255 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/gpu/cl/operators/ClConcatenate.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/gpu/cl/kernels/ClBatchConcatenateKernel.h" +#include "src/gpu/cl/kernels/ClDepthConcatenateKernel.h" +#include "src/gpu/cl/kernels/ClHeightConcatenateKernel.h" +#include "src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h" +#include "src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h" +#include "src/gpu/cl/kernels/ClWidthConcatenateKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +void ClConcatenate::configure(const CLCompileContext &compile_context, + const std::vector<ITensorInfo *> &src_vector, + ITensorInfo *dst, + size_t axis) +{ + ARM_COMPUTE_ERROR_ON(dst == nullptr); + ARM_COMPUTE_LOG_PARAMS(src_vector, dst, axis); + _axis = axis; + _num_inputs = src_vector.size(); + + TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(src_vector, _axis); + std::vector<const ITensorInfo *> const_src_vector(src_vector.size()); + std::transform(src_vector.begin(), src_vector.end(), const_src_vector.begin(), + [](ITensorInfo *t) + { + ARM_COMPUTE_ERROR_ON_NULLPTR(t); + return t; + }); + + // dst auto inizialitation if not yet initialized + auto_init_if_empty(*dst, dst_shape, 1, src_vector[0]->data_type()); + ARM_COMPUTE_ERROR_THROW_ON(ClConcatenate::validate(const_src_vector, dst, axis)); + + unsigned int offset = 0; + switch (_axis) + { + case Window::DimX: + { + switch (_num_inputs) + { + case 2: + { + // Configure WidthConcatenate2Tensors kernel + auto kernel = std::make_unique<kernels::ClWidthConcatenate2TensorsKernel>(); + kernel->configure(compile_context, src_vector.at(0), src_vector.at(1), dst); + _concat_kernels.emplace_back(std::move(kernel)); + break; + } + case 4: + { + // Configure WidthConcatenate4Tensors kernel + auto kernel = std::make_unique<kernels::ClWidthConcatenate4TensorsKernel>(); + kernel->configure(compile_context, src_vector.at(0), src_vector.at(1), src_vector.at(2), + src_vector.at(3), dst); + _concat_kernels.emplace_back(std::move(kernel)); + break; + } + default: + { + // Configure generic case WidthConcatenate kernels + for (unsigned int i = 0; i < _num_inputs; ++i) + { + auto kernel = std::make_unique<kernels::ClWidthConcatenateKernel>(); + kernel->configure(compile_context, src_vector.at(i), offset, dst); + offset += src_vector.at(i)->dimension(_axis); + _concat_kernels.emplace_back(std::move(kernel)); + } + break; + } + } + break; + } + case Window::DimY: + { + for (unsigned int i = 0; i < _num_inputs; ++i) + { + auto kernel = std::make_unique<kernels::ClHeightConcatenateKernel>(); + kernel->configure(compile_context, src_vector.at(i), offset, dst); + offset += src_vector.at(i)->dimension(_axis); + _concat_kernels.emplace_back(std::move(kernel)); + } + break; + } + case Window::DimZ: + { + for (unsigned int i = 0; i < _num_inputs; ++i) + { + auto kernel = std::make_unique<kernels::ClDepthConcatenateKernel>(); + kernel->configure(compile_context, src_vector.at(i), offset, dst); + offset += src_vector.at(i)->dimension(_axis); + _concat_kernels.emplace_back(std::move(kernel)); + } + break; + } + case 3: + { + for (unsigned int i = 0; i < _num_inputs; ++i) + { + auto kernel = std::make_unique<kernels::ClBatchConcatenateKernel>(); + 
kernel->configure(compile_context, src_vector.at(i), offset, dst); + offset += src_vector.at(i)->dimension(_axis); + _concat_kernels.emplace_back(std::move(kernel)); + } + break; + } + default: + ARM_COMPUTE_ERROR("Axis not supported"); + } +} + +Status ClConcatenate::validate(const std::vector<const ITensorInfo *> &src_vector, const ITensorInfo *dst, size_t axis) +{ + ARM_COMPUTE_RETURN_ERROR_ON(dst == nullptr); + const unsigned int num_inputs = src_vector.size(); + + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst); + ARM_COMPUTE_RETURN_ERROR_ON(num_inputs < 2); + + unsigned int offset = 0; + switch (axis) + { + case Window::DimX: + { + switch (num_inputs) + { + case 2: + // Validate WidthConcatenate2Tensors kernels if there are 2 inputs + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src_vector[0], src_vector[1]); + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::ClWidthConcatenate2TensorsKernel::validate(src_vector[0], src_vector[1], dst)); + break; + case 4: + // Validate WidthConcatenate4Tensors kernels if there are 4 inputs + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src_vector[0], src_vector[1], src_vector[2], src_vector[3]); + ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenate4TensorsKernel::validate( + src_vector[0], src_vector[1], src_vector[2], src_vector[3], dst)); + break; + default: + // Validate generic case of WidthConcatenate kernel + for (const auto &src : src_vector) + { + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); + ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenateKernel::validate(src, offset, dst)); + offset += src->dimension(axis); + } + break; + } + break; + } + case Window::DimY: + { + for (const auto &src : src_vector) + { + ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClHeightConcatenateKernel::validate(src, offset, dst)); + offset += src->dimension(axis); + } + break; + } + case Window::DimZ: + { + for (const auto &src : src_vector) + { + ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClDepthConcatenateKernel::validate(src, offset, dst)); + offset += src->dimension(axis); + } + break; + } + case 3: + { + for (const auto &src : src_vector) + { + ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClBatchConcatenateKernel::validate(src, offset, dst)); + offset += src->dimension(axis); + } + break; + } + default: + ARM_COMPUTE_ERROR("Axis not supported"); + } + + if (dst->total_size() != 0) + { + TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(src_vector, axis); + ARM_COMPUTE_RETURN_ERROR_ON(dst_shape.total_size() != dst->tensor_shape().total_size()); + } + + return Status{}; +} + +void ClConcatenate::run(ITensorPack &tensors) +{ + if (tensors.empty()) + { + ARM_COMPUTE_ERROR("No inputs provided"); + } + + if (static_cast<int>(tensors.size()) - 1 != static_cast<int>(_num_inputs)) + { + ARM_COMPUTE_ERROR("Configured with different number of inputs"); + } + + if (_axis == Window::DimX && (_num_inputs == 2 || _num_inputs == 4)) + { + ARM_COMPUTE_ERROR_ON(_concat_kernels.empty()); + CLScheduler::get().enqueue_op(*_concat_kernels.at(0), tensors, true); + } + else + { + int i = 0; + for (auto &k : _concat_kernels) + { + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(ACL_SRC_VEC + i)); + pack.add_tensor(TensorType::ACL_DST, tensors.get_tensor(ACL_DST)); + CLScheduler::get().enqueue_op(*k, pack, true); + ++i; + } + } +} +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClConcatenate.h b/src/gpu/cl/operators/ClConcatenate.h new file mode 100644 index 0000000000..d8ce9d2a5c --- /dev/null +++ 
b/src/gpu/cl/operators/ClConcatenate.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_CONCATENATE_H +#define ARM_COMPUTE_CL_CONCATENATE_H + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClKernel.h" +#include "src/gpu/cl/IClOperator.h" + +#include <vector> + +namespace arm_compute +{ +namespace opencl +{ +/** Basic function to concatenate tensors along a given axis. This function calls the following kernels: + * + * -# @ref kernels::ClWidthConcatenateKernel (if underlying concatenation axis is 0). + * -# @ref kernels::ClHeightConcatenateKernel (if underlying concatenation axis is 1). + * -# @ref kernels::ClDepthConcatenateKernel (if underlying concatenation axis is 2). + * -# @ref kernels::ClBatchConcatenateKernel (if underlying concatenation axis is 3). + */ +class ClConcatenate : public IClOperator +{ +public: + ClConcatenate() = default; + /** Initialise the kernel's inputs vector and dst. + * + * @note Input and dst tensor dimension preconditions differ depending on the concatenation axis. + * @note Preconditions can be found respectively at @ref kernels::ClWidthConcatenateKernel, + * @ref kernels::ClHeightConcatenateKernel and @ref kernels::ClDepthConcatenateKernel. + * + * + * @param[in] compile_context The compile context to be used. + * @param[in,out] src_vector The vector containing all the tensor infos to concatenate. Data types supported: All + * @param[out] dst Destination tensor info. Data types supported: same as @p src_vector. + * @param[in] axis Concatenation axis. Supported underlying concatenation axes are 0, 1, 2 and 3.
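+ *
+ * Example (illustrative sketch only, not part of this patch; the shapes and the default compile context are assumptions):
+ * @code
+ * TensorInfo    a(TensorShape(32U, 8U), 1, DataType::F32);
+ * TensorInfo    b(TensorShape(16U, 8U), 1, DataType::F32);
+ * TensorInfo    dst; // auto-initialised by configure() to shape [48, 8]
+ * ClConcatenate concat;
+ * concat.configure(CLKernelLibrary::get().get_compile_context(), { &a, &b }, &dst, 0); // axis 0 = width
+ * @endcode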
+ */ + void configure(const ClCompileContext &compile_context, + const std::vector<ITensorInfo *> &src_vector, + ITensorInfo *dst, + size_t axis); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClConcatenate::configure() + * + * @return a status + */ + static Status validate(const std::vector<const ITensorInfo *> &src_vector, const ITensorInfo *dst, size_t axis); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + +private: + std::vector<std::unique_ptr<IClKernel>> _concat_kernels{}; + unsigned int _num_inputs{0}; + unsigned int _axis{0}; +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_CONCATENATE_H */ diff --git a/src/gpu/cl/operators/ClConv2d.cpp b/src/gpu/cl/operators/ClConv2d.cpp new file mode 100644 index 0000000000..2c3b0214fa --- /dev/null +++ b/src/gpu/cl/operators/ClConv2d.cpp @@ -0,0 +1,432 @@ +/* + * Copyright (c) 2021-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/operators/ClConv2d.h" + +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h" + +#include "src/common/utils/Log.h" +#include "src/gpu/cl/operators/ClDirectConv2d.h" +#include "src/gpu/cl/operators/ClGemmConv2d.h" +#include "src/gpu/cl/operators/ClIndirectConv2d.h" +#include "src/gpu/cl/operators/ClWinogradConv2d.h" + +#include <memory> + +namespace +{ +/** Get the suitable kernel size for using direct convolution method with NHWC data layout. 
+ * + * @note Direct convolution should be executed when the kernel has the spatial dimensions greater than or equal to the value returned by this function + * + * @param[in] gpu_target GPU target + * + * @return the suitable kernel size for using direct convolution method with NHWC data layout + */ +size_t get_direct_conv_kernel_threshold_nhwc(arm_compute::GPUTarget gpu_target) +{ + switch (gpu_target) + { + case arm_compute::GPUTarget::G76: + case arm_compute::GPUTarget::G77: + case arm_compute::GPUTarget::G78: + return 5; + case arm_compute::GPUTarget::G71: + case arm_compute::GPUTarget::G72: + case arm_compute::GPUTarget::MIDGARD: + case arm_compute::GPUTarget::BIFROST: + return 7; + default: + return 5; + } +} +} // namespace + +namespace arm_compute +{ +namespace opencl +{ +using namespace arm_compute::misc::shape_calculator; + +ClConv2d::ClConv2d() : _operator() +{ +} + +ClConv2d::~ClConv2d() = default; + +void ClConv2d::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const Conv2dInfo &conv2d_info, + const WeightsInfo &weights_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); + ARM_COMPUTE_ERROR_THROW_ON( + ClConv2d::validate(src, weights, ((biases != nullptr) ? biases : nullptr), dst, conv2d_info, weights_info)); + ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv2d_info, weights_info); + + switch (ClConv2d::get_convolution_method(src, weights, dst, conv2d_info, weights_info, CLScheduler::get().target())) + { + case ConvolutionMethod::WINOGRAD: + { + ARM_COMPUTE_ERROR_ON(conv2d_info.num_groups != 1); + auto f = std::make_unique<ClWinogradConv2d>(); + f->configure(compile_context, src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info, + conv2d_info.enable_fast_math); + _operator = std::move(f); + break; + } + case ConvolutionMethod::DIRECT: + { + ARM_COMPUTE_ERROR_ON(conv2d_info.num_groups != 1); + auto f = std::make_unique<ClDirectConv2d>(); + f->configure(compile_context, src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info); + _operator = std::move(f); + break; + } + case ConvolutionMethod::INDIRECT: + { + ARM_COMPUTE_ERROR_ON(conv2d_info.num_groups != 1); + auto f = std::make_unique<ClIndirectConv2d>(); + f->configure(compile_context, src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info); + _operator = std::move(f); + break; + } + case ConvolutionMethod::GEMM: + { + auto f = std::make_unique<ClGemmConv2d>(); + f->configure(compile_context, src, weights, biases, dst, conv2d_info, weights_info); + _operator = std::move(f); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported."); + break; + } + _aux_mem = _operator->workspace(); +} + +Status ClConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const Conv2dInfo &conv2d_info, + const WeightsInfo &weights_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_layout() != DataLayout::NCHW), + "Grouping (num_groups != 1) with NHWC data layout is not supported"); + + const GPUTarget gpu_target = CLScheduler::get().target(); + + switch (ClConv2d::get_convolution_method(src, weights, dst, conv2d_info, weights_info, gpu_target)) + { + case ConvolutionMethod::WINOGRAD: + { + //Validate Winograd + ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, + "Grouping (num_groups != 1) with ClWinogradConv2d is not 
supported"); + ARM_COMPUTE_RETURN_ON_ERROR(ClWinogradConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, + conv2d_info.act_info, conv2d_info.enable_fast_math)); + break; + } + case ConvolutionMethod::DIRECT: + { + // Validate direct convolution layer + ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, + "Grouping (num_groups != 1) with ClDirectConv2d is not supported"); + ARM_COMPUTE_RETURN_ON_ERROR( + ClDirectConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info)); + break; + } + case ConvolutionMethod::INDIRECT: + { + // Validate indirect convolution layer + ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, + "Grouping (num_groups != 1) with ClIndirectConv2d is not supported"); + ARM_COMPUTE_RETURN_ON_ERROR( + ClIndirectConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info)); + break; + } + case ConvolutionMethod::GEMM: + { + // Validate gemm-based convolution layer + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmConv2d::validate(src, weights, biases, dst, conv2d_info, weights_info)); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported."); + break; + } + + return Status{}; +} + +ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const Conv2dInfo &conv2d_info, + const WeightsInfo &weights_info, + const GPUTarget gpu_target) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src); + ARM_COMPUTE_ERROR_ON_NULLPTR(dst); + ARM_COMPUTE_ERROR_ON_NULLPTR(weights); + ARM_COMPUTE_UNUSED(weights_info); + + const PadStrideInfo conv_info = conv2d_info.conv_info; + const ActivationLayerInfo act_info = conv2d_info.act_info; + const Size2D dilation = conv2d_info.dilation; + bool enable_fast_math = conv2d_info.enable_fast_math; + + const size_t idx_w = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH); + const size_t idx_h = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT); + const size_t idx_c = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL); + + /* Input spatial dims, kernel size, IFM/OFM, conv info*/ + using ConvolutionConfiguration = std::tuple<Size2D, Size2D, Size2D, PadStrideInfo, DataLayout>; + using ConfigurationMethod = std::pair<ConvolutionConfiguration, ConvolutionMethod>; + + const std::vector<ConfigurationMethod> known_configs = { + // Alexnet + ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U), + PadStrideInfo(1U, 1U, 2U, 2U), DataLayout::NCHW), + ConvolutionMethod::DIRECT), + // VGG16 / VGG19 + ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U), + PadStrideInfo(1U, 1U, 1U, 1U), DataLayout::NCHW), + ConvolutionMethod::DIRECT), + // Mobilenet 224 + ConfigurationMethod(ConvolutionConfiguration( + Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), + PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), + ConvolutionMethod::GEMM), + // Mobilenet 160 + ConfigurationMethod(ConvolutionConfiguration( + Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), + PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), + ConvolutionMethod::GEMM), + // Mobilenet 224 + ConfigurationMethod(ConvolutionConfiguration( + Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), + PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), + ConvolutionMethod::GEMM), + // 
Mobilenet 160 + ConfigurationMethod(ConvolutionConfiguration( + Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), + PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), + ConvolutionMethod::GEMM), + }; + + const auto find_config = [&](ConfigurationMethod c) + { + const ConvolutionConfiguration config = c.first; + const PadStrideInfo info = std::get<3>(config); + const DataLayout data_layout = std::get<4>(config); + + return std::get<0>(config) == Size2D(src->dimension(idx_w), src->dimension(idx_h)) && + std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) && + std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && + info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right() && + info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && + info.stride() == conv_info.stride() && (data_layout == src->data_layout()); + }; + + std::vector<ConfigurationMethod>::const_iterator found; + if ((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end()) + { + return (*found).second; + } + + if (dilation != Size2D(1U, 1U)) + { + return ConvolutionMethod::GEMM; + } + else + { + if (src->data_layout() == DataLayout::NCHW) + { + // SRGAN + if ((src->dimension(idx_h) > 720U) && (dst->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) && + (conv_info.pad_top() < 3) && + (ClDirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info))) + { + return ConvolutionMethod::DIRECT; + } + if ((weights->dimension(idx_h) > 5) && (src->dimension(idx_c) > dst->dimension(idx_c)) && + (CLFFTConvolutionLayer::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math))) + { + return ConvolutionMethod::FFT; + } + if (src->dimension(idx_c) < 16) + { + return ConvolutionMethod::GEMM; + } + return bool(ClWinogradConv2d::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math)) + ? 
ConvolutionMethod::WINOGRAD + : ConvolutionMethod::GEMM; + } + else + { + const bool is_direct_valid = + bool(ClDirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info)); + const bool is_wino_valid = + bool(ClWinogradConv2d::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math)); + const size_t kernel_sz_direct_conv_thr = get_direct_conv_kernel_threshold_nhwc(gpu_target); + + // SRGAN case + if ((src->dimension(idx_h) > 720U) && (dst->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) && + (conv_info.pad_top() < 3) && is_direct_valid) + { + return ConvolutionMethod::DIRECT; + } + + // Floating-point case: GeMM/Direct/Winograd + if (is_data_type_float(src->data_type())) + { + // Get dst shape + TensorShape output_shape = + misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info); + const bool is_large_kernel_sz = (weights->dimension(idx_w) >= kernel_sz_direct_conv_thr) && + (weights->dimension(idx_h) >= kernel_sz_direct_conv_thr); + const bool is_ifm_ge_8 = src->dimension(idx_c) >= 8; + const bool is_ifm_ge_16 = src->dimension(idx_c) >= 16; + const bool is_ofm_lte_8 = weights->dimension(3U) <= 8; + const bool is_ofm_lt_64 = weights->dimension(3U) < 64; + const bool workload_gte_8192 = (output_shape[0] * output_shape[1] * output_shape[2]) / 16 >= 8192; + const bool is_ifm_gt_ofm = src->dimension(idx_c) > weights->dimension(3U); + const bool is_m_one = output_shape[1] * output_shape[2] == 1; + const bool is_unit_stride = + (conv2d_info.conv_info.stride().first == 1) && (conv2d_info.conv_info.stride().second == 1); + const int32_t kernel_sz = weights->dimension(idx_w) * weights->dimension(idx_h); + + // Run Winograd if valid and IFM >= 8 + if (is_wino_valid && is_ifm_ge_8) + { + if (is_ofm_lte_8) + { + if (gpu_target == arm_compute::GPUTarget::G71 || gpu_target == arm_compute::GPUTarget::G72 || + get_arch_from_target(gpu_target) == arm_compute::GPUTarget::MIDGARD) + { + return ConvolutionMethod::WINOGRAD; + } + } + else + { + return ConvolutionMethod::WINOGRAD; + } + } + + // Direct convolution case + if (is_direct_valid) + { + if ((gpu_target == arm_compute::GPUTarget::G71 || gpu_target == arm_compute::GPUTarget::G72 || + get_arch_from_target(gpu_target) == arm_compute::GPUTarget::MIDGARD)) + { + if (is_large_kernel_sz && is_ifm_ge_16 && is_ifm_gt_ofm) + { + return ConvolutionMethod::DIRECT; + } + } + else if (gpu_target == arm_compute::GPUTarget::G76) + { + if ((is_large_kernel_sz && workload_gte_8192 && is_ifm_ge_16) || (is_ofm_lte_8 && is_ifm_ge_16)) + { + return ConvolutionMethod::DIRECT; + } + } + else + { + ConvolutionMethod preferred_conv_method = ConvolutionMethod::DIRECT; + + const bool is_indirect_valid = + bool(ClIndirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info)); + + // indirect conv2d should be called when: + // 1- When the kernel size is greater than 1x1 and less than or equal to 9x9 (81) + // 2- When the kernel size is odd + // 3- When the Gpu target is Arm Mali-G77 + if (is_indirect_valid) + { + const bool is_kernel_sz_odd = kernel_sz % 2; + const bool is_g77 = gpu_target == GPUTarget::G77; + preferred_conv_method = (kernel_sz > 1) && (kernel_sz <= 81) && is_kernel_sz_odd && is_g77 + ? 
ConvolutionMethod::INDIRECT + : ConvolutionMethod::DIRECT; + } + + // Direct/indirect convolution used for the first layer of the network + if (workload_gte_8192 && !is_ifm_ge_16 && !is_unit_stride && is_ofm_lt_64) + { + // In general, the question we should ask for the first convolution layer of a model is: + // when the execution time of im2col + gemm < direct?. Since im2col does not depend on the OFM, it means that + // when OFM is big enough, the contribution of im2col is small and the GEMM approach is preferable. + // From internal experiments, the OFM threshold is 64 (is_ofm_lt_64) + return preferred_conv_method; + } + + if ((is_large_kernel_sz || is_m_one) && workload_gte_8192 && is_ifm_ge_16) + { + return preferred_conv_method; + } + + // Direct convolution used for the last layer of the network + if (is_ofm_lte_8) + { + return preferred_conv_method; + } + } + } + + // Default case + return ConvolutionMethod::GEMM; + } + + // Generic case for quantized. Only GeMM + return ConvolutionMethod::GEMM; + } + } +} + +void ClConv2d::run(ITensorPack &tensors) +{ + prepare(tensors); + _operator->run(tensors); +} + +void ClConv2d::prepare(ITensorPack &tensors) +{ + _operator->prepare(tensors); +} + +experimental::MemoryRequirements ClConv2d::workspace() const +{ + return _aux_mem; +} +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClConv2d.h b/src/gpu/cl/operators/ClConv2d.h new file mode 100644 index 0000000000..0cf3cbc1ce --- /dev/null +++ b/src/gpu/cl/operators/ClConv2d.h @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CLCONV2D_H +#define ARM_COMPUTE_CLCONV2D_H + +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/FunctionDescriptors.h" + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClKernel.h" +#include "src/gpu/cl/IClOperator.h" + +namespace arm_compute +{ +namespace opencl +{ +/** Basic function to compute the convolution layer. 
This function calls the following OpenCL kernels/functions: + * + * -# @ref opencl::ClGemmConv2d + * -# @ref opencl::ClWinogradConv2d + * -# @ref opencl::ClIndirectConv2d + * -# @ref opencl::ClDirectConv2d + * -# @ref CLFFTConvolutionLayer + * + * The function selects one of the algorithms mentioned above based on: + * - The size of the kernel + * - Number of src/dst feature maps + * - Amount of memory needed + * + * Generally GEMM-based convolution is executed when neither Winograd nor FFT nor Direct convolution can be performed. + * + * FP32 Algorithm| Filter Size | Input/Output feature maps | + * --------------|-------------------------------------------------------------|-------------------------------------------| + * Winograd | 3x3 1x3 3x1 5x1 1x5 5x5(fast maths) 7x1 1x7 | Input channels is greater than 3 | + * FFT | Squared kernels and greater than 9x9 | Input feature maps > Output feature maps | + * DirectConv | 9x9 | | + * GEMM | Any size | | + * + * Winograd 5x5 requires fast maths enabled. + * + * FP16 Algorithm| Filter Size | Input/Output feature maps | + * --------------|----------------------------|-------------------------------------------| + * Winograd | 3x3 1x3 3x1 5x1 1x5 5x5 | Input channels is greater than 3 | + * FFT | Not supported | | + * DirectConv | 9x9 | | + * GEMM | Any size | | + * + * Winograd FP16 requires fast maths enabled. + * + */ +class ClConv2d : public IClOperator +{ +public: + /** Default constructor */ + ClConv2d(); + /** Default Destructor */ + ~ClConv2d(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + ClConv2d(const ClConv2d &) = delete; + /** Default move constructor */ + ClConv2d(ClConv2d &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + ClConv2d &operator=(const ClConv2d &) = delete; + /** Default move assignment operator */ + ClConv2d &operator=(ClConv2d &&) = default; + /** Set the src and dst tensors. + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:------------------|:------|:--------------| + * |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 | + * |QASYMM8 |QASYMM8 |S32 |QASYMM8 | + * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | + * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED | + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor info. 3 lower dimensions represent a single src [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of srcs. + * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. + * Data type supported: Same as @p src, also could be QSYMM8_PER_CHANNEL if src is QASYMM8/QASYMM8_SIGNED. + * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. + * Data type supported: Same as @p src, except for src of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type. + * @param[out] dst Destination tensor info. 3 lower dimensions represent a single dst [width, height, OFM], while the rest represent batch of dsts. + * Data types supported: Same as @p src. + * @param[in] conv2d_info Contains convolution 2d info described in @ref Conv2dInfo. 
+ * @param[in] weights_info Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel. Data type supported: Same as @p src. + */ + void configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const Conv2dInfo &conv2d_info, + const WeightsInfo &weights_info = WeightsInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref ClConv2d + * + * Similar to ClConv2d::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const Conv2dInfo &conv2d_info, + const WeightsInfo &weights_info = WeightsInfo()); + /** Static function to check if given info will return the convolution called by @ref ClConv2d + * + * @param[in] src Source tensor. 3 lower dimensions represent a single src [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of srcs. + * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. + * Data type supported: Same as @p src, also could be QSYMM8_PER_CHANNEL if src is QASYMM8/QASYMM8_SIGNED. + * @param[in] dst Destination tensor. 3 lower dimensions represent a single dst [width, height, OFM], while the rest represent batch of dsts. + * Data types supported: Same as @p src. + * @param[in] conv2d_info Contains convolution 2d info described in @ref Conv2dInfo. + * @param[in] weights_info Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel. + * @param[in] gpu_target Specifies the @p GPUTarget. + * + * @return the Convolution Method Hint + */ + static ConvolutionMethod get_convolution_method(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const Conv2dInfo &conv2d_info, + const WeightsInfo &weights_info, + const GPUTarget gpu_target); + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; + experimental::MemoryRequirements workspace() const override; + +private: + std::unique_ptr<IClOperator> _operator; + experimental::MemoryRequirements _aux_mem{}; +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CLCONV2D_H */ diff --git a/src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp b/src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp new file mode 100644 index 0000000000..cf24c68d21 --- /dev/null +++ b/src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/operators/ClConvertFullyConnectedWeights.h" + +#include "src/common/utils/Log.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +void ClConvertFullyConnectedWeights::configure(const ClCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const TensorShape &original_src_shape, + DataLayout data_layout) +{ + ARM_COMPUTE_LOG_PARAMS(src, dst, original_src_shape, data_layout); + auto k = std::make_unique<kernels::ClConvertFullyConnectedWeightsKernel>(); + k->configure(compile_context, src, dst, original_src_shape, data_layout); + _kernel = std::move(k); +} + +Status ClConvertFullyConnectedWeights::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const TensorShape &original_src_shape, + DataLayout data_layout) +{ + return kernels::ClConvertFullyConnectedWeightsKernel::validate(src, dst, original_src_shape, data_layout); +} +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClConvertFullyConnectedWeights.h b/src/gpu/cl/operators/ClConvertFullyConnectedWeights.h new file mode 100644 index 0000000000..c46152081c --- /dev/null +++ b/src/gpu/cl/operators/ClConvertFullyConnectedWeights.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_CONVERT_FULLYCONNECTED_WEIGHTS_H +#define ARM_COMPUTE_CL_CONVERT_FULLYCONNECTED_WEIGHTS_H + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClOperator.h" + +namespace arm_compute +{ +namespace opencl +{ +/** Basic function to run @ref kernels::ClConvertFullyConnectedWeightsKernel */ +class ClConvertFullyConnectedWeights : public IClOperator +{ +public: + /** Initialise the kernel's inputs and outputs + * + * @param[in] compile_context The compile context to be used. + * @param[in] src The src tensor info. 
Data types supported: All. + * @param[in] dst The dst tensor info. Data types supported: Same as @p src. + * @param[in] original_src_shape Shape of the original src tensor (the one entering the fully connected layer). + * @param[in] data_layout The data layout the weights have been trained in. + */ + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const TensorShape &original_src_shape, + DataLayout data_layout); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClConvertFullyConnectedWeights::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + const TensorShape &original_src_shape, + DataLayout data_layout); +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_CONVERT_FULLYCONNECTED_WEIGHTS_H */ diff --git a/src/gpu/cl/operators/ClCopy.cpp b/src/gpu/cl/operators/ClCopy.cpp new file mode 100644 index 0000000000..e2be7cebd4 --- /dev/null +++ b/src/gpu/cl/operators/ClCopy.cpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/operators/ClCopy.h" + +#include "src/common/utils/Log.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/ClCopyKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +void ClCopy::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, Window *dst_window) +{ + ARM_COMPUTE_LOG_PARAMS(src, dst, dst_window); + auto k = std::make_unique<kernels::ClCopyKernel>(); + k->configure(compile_context, src, dst, dst_window); + _kernel = std::move(k); +} + +Status ClCopy::validate(const ITensorInfo *src, const ITensorInfo *dst, Window *dst_window) +{ + return kernels::ClCopyKernel::validate(src, dst, dst_window); +} +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClCopy.h b/src/gpu/cl/operators/ClCopy.h new file mode 100644 index 0000000000..fe9b58c607 --- /dev/null +++ b/src/gpu/cl/operators/ClCopy.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2021 Arm Limited.
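A usage sketch for the weight-conversion interface above, assuming `using namespace arm_compute;` and an initialised CL backend; the 4x4x18 feature map feeding a 10-output fully connected layer is a hypothetical example:
// FC weights stored as 2D [num_inputs, num_outputs]: 4*4*18 = 288 inputs, 10 outputs.
TensorInfo src(TensorShape(288U, 10U), 1, DataType::F32);
TensorInfo dst(src.tensor_shape(), 1, DataType::F32);
const TensorShape original_src_shape(4U, 4U, 18U); // shape that entered the fully connected layer
if (bool(opencl::ClConvertFullyConnectedWeights::validate(&src, &dst, original_src_shape, DataLayout::NCHW)))
{
    opencl::ClConvertFullyConnectedWeights convert;
    convert.configure(CLKernelLibrary::get().get_compile_context(), &src, &dst, original_src_shape, DataLayout::NCHW);
}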
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_COPY_H +#define ARM_COMPUTE_CL_COPY_H + +#include "arm_compute/core/Window.h" + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClOperator.h" + +namespace arm_compute +{ +namespace opencl +{ +/** Basic function to run @ref kernels::ClCopyKernel */ +class ClCopy : public IClOperator +{ +public: + /** Initialise the function's source and destination. + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor info. Data types supported: All. + * @param[out] dst Output tensor info. Data types supported: Same as @p src. + * @param[in] dst_window (Optional) Window to be used in case only copying into part of a tensor. Default is nullptr. + * + */ + void configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + Window *dst_window = nullptr); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClCopy::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, Window *dst_window = nullptr); +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_COPY_H */ diff --git a/src/gpu/cl/operators/ClCrop.cpp b/src/gpu/cl/operators/ClCrop.cpp new file mode 100644 index 0000000000..6313e4fbb5 --- /dev/null +++ b/src/gpu/cl/operators/ClCrop.cpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
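The copy operator above follows the same validate-then-configure pattern; a minimal sketch with hypothetical shapes, assuming `using namespace arm_compute;` and an initialised CL backend:
TensorInfo src(TensorShape(8U, 8U), 1, DataType::U8);
TensorInfo dst(TensorShape(8U, 8U), 1, DataType::U8);
if (bool(opencl::ClCopy::validate(&src, &dst))) // dst_window left as nullptr: copy the full extent
{
    opencl::ClCopy copy;
    copy.configure(CLKernelLibrary::get().get_compile_context(), &src, &dst);
}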
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/operators/ClCrop.h" + +#include "src/common/utils/Log.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/ClCropKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +void ClCrop::configure(const ClCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value, + Window *dst_window) +{ + ARM_COMPUTE_LOG_PARAMS(src, dst, start, end, batch_index, extrapolation_value, dst_window); + auto k = std::make_unique<kernels::ClCropKernel>(); + k->configure(compile_context, src, dst, start, end, batch_index, extrapolation_value, dst_window); + _kernel = std::move(k); +} + +Status ClCrop::validate(const ITensorInfo *src, + const ITensorInfo *dst, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value, + Window *dst_window) +{ + return kernels::ClCropKernel::validate(src, dst, start, end, batch_index, extrapolation_value, dst_window); +} +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClCrop.h b/src/gpu/cl/operators/ClCrop.h new file mode 100644 index 0000000000..e845cf372c --- /dev/null +++ b/src/gpu/cl/operators/ClCrop.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_CROP_H +#define ARM_COMPUTE_CL_CROP_H + +#include "arm_compute/core/Window.h" + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClOperator.h" + +namespace arm_compute +{ +namespace opencl +{ +/** Basic function to run @ref kernels::ClCropKernel */ +class ClCrop : public IClOperator +{ +public: + /** Initialise the function's source and destination. + * + * @note Supported tensor rank: up to 4 + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor info. Data type supported: All. Data layouts supported: NHWC. + * @param[out] dst Destination tensor info. Data type supported: F32 + * @param[in] start Coordinates of where to start cropping the image. + * @param[in] end Coordinates of where to end cropping the image. 
+ * @param[in] batch_index Fourth dimension index of the 3D image to crop in @p src. + * @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0. + * @param[in] dst_window Output window to be used in case cropped image is being copied into a tensor. Default is nullptr. + */ + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value = 0, + Window *dst_window = nullptr); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClCrop::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value = 0, + Window *dst_window = nullptr); +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_CROP_H */ diff --git a/src/gpu/cl/operators/ClDequantize.cpp b/src/gpu/cl/operators/ClDequantize.cpp new file mode 100644 index 0000000000..eb6f9e7abb --- /dev/null +++ b/src/gpu/cl/operators/ClDequantize.cpp @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
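A validation sketch for the crop interface above, assuming `using namespace arm_compute;`; the shapes are hypothetical, and the end coordinate is treated as inclusive here, which should be checked against the kernel's convention:
// Crop a 4x4 region starting at (2, 2) from batch element 0 of an 8x8, 3-channel NHWC image.
TensorInfo src(TensorShape(3U, 8U, 8U, 2U), 1, DataType::U8, DataLayout::NHWC);
TensorInfo dst(TensorShape(3U, 4U, 4U), 1, DataType::F32); // crop always produces F32
const Status ok = opencl::ClCrop::validate(&src, &dst, Coordinates2D{2, 2}, Coordinates2D{5, 5}, 0U);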
+ */ +#include "src/gpu/cl/operators/ClDequantize.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/ClDequantizeKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +void ClDequantize::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst) +{ + ARM_COMPUTE_LOG_PARAMS(src, dst); + auto k = std::make_unique<kernels::ClDequantizeKernel>(); + k->configure(compile_context, src, dst); + _kernel = std::move(k); +} + +Status ClDequantize::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + return kernels::ClDequantizeKernel::validate(src, dst); +} + +void ClDequantize::run(ITensorPack &tensors) +{ + ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); + CLScheduler::get().enqueue_op(*_kernel.get(), tensors); +} +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClDequantize.h b/src/gpu/cl/operators/ClDequantize.h new file mode 100644 index 0000000000..ccaac2cd49 --- /dev/null +++ b/src/gpu/cl/operators/ClDequantize.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_DEQUANTIZE_H +#define ARM_COMPUTE_CL_DEQUANTIZE_H + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClOperator.h" + +namespace arm_compute +{ +namespace opencl +{ +/** Basic function to run @ref kernels::ClDequantizeKernel that dequantizes an input tensor */ +class ClDequantize : public IClOperator +{ +public: + /** Set the input and output tensors. + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16. + * @param[out] dst Destination tensor info with the same dimensions of @p src. Data type supported: F16/F32. 
+ */ + void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClDequantize::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); + + // Inherited method overridden + void run(ITensorPack &tensors) override; +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_DEQUANTIZE_H */ diff --git a/src/gpu/cl/operators/ClDirectConv2d.cpp b/src/gpu/cl/operators/ClDirectConv2d.cpp new file mode 100644 index 0000000000..17a196ce6b --- /dev/null +++ b/src/gpu/cl/operators/ClDirectConv2d.cpp @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2021-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
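A minimal sketch of the dequantize interface above, assuming `using namespace arm_compute;` and an initialised CL backend; the QASYMM8 input (scale 0.5, zero point 10) is hypothetical:
TensorInfo src(TensorShape(16U, 16U), 1, DataType::QASYMM8);
src.set_quantization_info(QuantizationInfo(0.5f, 10));
TensorInfo dst(TensorShape(16U, 16U), 1, DataType::F32);
if (bool(opencl::ClDequantize::validate(&src, &dst)))
{
    opencl::ClDequantize dequantize;
    dequantize.configure(CLKernelLibrary::get().get_compile_context(), &src, &dst);
}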
+ */ +#include "src/gpu/cl/operators/ClDirectConv2d.h" + +#include "arm_compute/core/KernelDescriptors.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/gpu/cl/kernels/ClActivationKernel.h" +#include "src/gpu/cl/kernels/ClDirectConv2dKernel.h" +#include "src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h" +#include "src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h" +#include "src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h" +#include "src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h" + +using namespace arm_compute::cl_direct_conv; + +namespace arm_compute +{ +namespace opencl +{ +namespace +{ +ITensorPack select_activation_src_dst(ITensorPack &tensors) +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, tensors.get_tensor(TensorType::ACL_DST)); + pack.add_tensor(TensorType::ACL_DST, tensors.get_tensor(TensorType::ACL_DST)); + return pack; +} + +DirectConvComputeKernelInfo +config_direct_convolution_nhwc(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info) +{ + // Get GPU target + GPUTarget gpu_target = CLScheduler::get().target(); + + std::unique_ptr<IClDirectConvKernelConfig> t = ClDirectConvKernelConfigurationFactory::create(gpu_target); + + return t->configure(src, weights, conv_info); +} + +} // namespace + +void ClDirectConv2d::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src); + ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, act_info); + + // Initialize the direct convolution descriptor + const DirectConvComputeKernelInfo desc = config_direct_convolution_nhwc(src, weights, conv_info); + + // Configure direct convolution kernel + const ActivationLayerInfo conv2d_act_info = + (src->data_layout() == DataLayout::NHWC && is_data_type_float(src->data_type())) ? 
act_info + : ActivationLayerInfo(); + auto k = std::make_unique<kernels::ClDirectConv2dKernel>(); + k->set_target(CLScheduler::get().target()); + k->configure(compile_context, src, weights, biases, dst, conv_info, conv2d_act_info, desc); + _direct_conv_kernel = std::move(k); + + // Configure border handler + PixelValue zero_value(0.f); + if (is_data_type_quantized_asymmetric(src->data_type())) + { + zero_value = PixelValue(0, src->data_type(), src->quantization_info()); + } + auto b = std::make_unique<CLFillBorderKernel>(); + b->configure(compile_context, src, _direct_conv_kernel->border_size(), BorderMode::CONSTANT, zero_value); + _src_border_handler = std::move(b); + + // Fused activation is currently supported for NHWC and floating point types + if (act_info.enabled() && !conv2d_act_info.enabled()) + { + auto a = std::make_unique<kernels::ClActivationKernel>(); + a->configure(compile_context, dst, dst, act_info); + _activation_kernel = std::move(a); + } + + // Tune kernels + CLScheduler::get().tune_kernel_static(*_direct_conv_kernel); +} + +Status ClDirectConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) +{ + // Initialize the direct convolution descriptor + const DirectConvComputeKernelInfo desc = config_direct_convolution_nhwc(src, weights, conv_info); + + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::ClDirectConv2dKernel::validate(src, weights, biases, dst, conv_info, ActivationLayerInfo(), desc)); + if (act_info.enabled()) + { + ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClActivationKernel::validate(dst, dst, act_info)); + } + return Status{}; +} + +void ClDirectConv2d::run(ITensorPack &tensors) +{ + // Run border handler + CLScheduler::get().enqueue_op(*_src_border_handler.get(), tensors, false); + // Run direct convolution + CLScheduler::get().enqueue_op(*_direct_conv_kernel.get(), tensors, false); + // Run activation kernel + if (_activation_kernel) + { + auto act_pack = select_activation_src_dst(tensors); + CLScheduler::get().enqueue_op(*_activation_kernel.get(), act_pack, false); + } +} +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClDirectConv2d.h b/src/gpu/cl/operators/ClDirectConv2d.h new file mode 100644 index 0000000000..0f18490814 --- /dev/null +++ b/src/gpu/cl/operators/ClDirectConv2d.h @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2021, 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
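At run time the caller hands ClDirectConv2d::run() an ITensorPack; a hedged sketch follows, where `direct_conv` is a configured ClDirectConv2d and `src`, `weights`, `biases`, `dst` are hypothetical ICLTensor pointers, and the slot assignment (ACL_SRC_0/1/2 for src/weights/biases) mirrors the usual convolution convention rather than anything stated in this patch:
ITensorPack pack;
pack.add_tensor(TensorType::ACL_SRC_0, src);     // input feature map
pack.add_tensor(TensorType::ACL_SRC_1, weights); // 4D weights
pack.add_tensor(TensorType::ACL_SRC_2, biases);  // optional, may be nullptr
pack.add_tensor(TensorType::ACL_DST, dst);
direct_conv.run(pack); // border fill, then convolution, then the optional unfused activation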
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_DIRECT_CONV2D_H +#define ARM_COMPUTE_CL_DIRECT_CONV2D_H + +#include "arm_compute/function_info/ActivationLayerInfo.h" + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClKernel.h" +#include "src/gpu/cl/IClOperator.h" + +#include <memory> + +namespace arm_compute +{ +namespace opencl +{ +/** Basic function to run a direct convolution layer. This function calls the following OpenCL kernels: + * + * -# @ref CLFillBorderKernel (executed if padding size is different from zero) + * -# @ref opencl::kernels::ClDirectConv2dKernel + */ +class ClDirectConv2d : public IClOperator +{ +public: + ClDirectConv2d() = default; + /** Set the src and dst tensors. + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor. 3 lower dimensions represent a single src [width, height, IFM], + * while every optional dimension from 4 and above represents a batch of srcs. + * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32. + * @param[in] weights Weights tensor. Weights are a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p src. + * @param[in] biases Biases tensor. Shared biases supported. Biases are a 1D tensor with dimensions [OFM]. + * Data type supported: Should match @p src data type, except for src of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type. + * @param[out] dst Destination tensor. 3 lower dimensions represent a single dst [width, height, OFM], while the rest represent a batch of dsts. + * Data types supported: Same as @p src. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + * + */ + void configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to ClDirectConv2d::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited method overridden + void run(ITensorPack &tensors) override; + +private: + std::unique_ptr<IClKernel> _direct_conv_kernel{nullptr}; + std::unique_ptr<IClKernel> _src_border_handler{nullptr}; + std::unique_ptr<IClKernel> _activation_kernel{nullptr}; +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_DIRECT_CONV2D_H */ diff --git a/src/gpu/cl/operators/ClDirectConv3d.cpp b/src/gpu/cl/operators/ClDirectConv3d.cpp new file mode 100644 index 0000000000..b08347936b --- /dev/null +++ b/src/gpu/cl/operators/ClDirectConv3d.cpp @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2021 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/operators/ClDirectConv3d.h" + +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/gpu/cl/kernels/ClDirectConv3dKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +void ClDirectConv3d::configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + ITensorInfo *dst, + const Conv3dInfo &conv3d_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src0); + + // Configure direct convolution 3d kernel + auto k = std::make_unique<kernels::ClDirectConv3dKernel>(); + k->configure(compile_context, src0, src1, src2, dst, conv3d_info); + _direct_conv3d_kernel = std::move(k); +} + +Status ClDirectConv3d::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const Conv3dInfo &conv3d_info) +{ + ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClDirectConv3dKernel::validate(src0, src1, src2, dst, conv3d_info)); + return Status{}; +} + +void ClDirectConv3d::run(ITensorPack &tensors) +{ + // Run direct convolution 3d + CLScheduler::get().enqueue_op(*_direct_conv3d_kernel.get(), tensors, true); +} +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClDirectConv3d.h b/src/gpu/cl/operators/ClDirectConv3d.h new file mode 100644 index 0000000000..5fb32460e2 --- /dev/null +++ b/src/gpu/cl/operators/ClDirectConv3d.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_DIRECT_CONV3D_H +#define ARM_COMPUTE_CL_DIRECT_CONV3D_H + +#include "src/gpu/cl/IClKernel.h" +#include "src/gpu/cl/IClOperator.h" + +#include <memory> + +namespace arm_compute +{ +class CLCompileContext; +struct Conv3dInfo; +class IClKernel; + +namespace opencl +{ +/** Basic function to run a direct convolution layer with 3 spatial dimensions. This function calls the following OpenCL kernels: + * + * -# @ref opencl::kernels::ClDirectConv3dKernel + */ +class ClDirectConv3d : public IClOperator +{ +public: + ClDirectConv3d() = default; + /** Set the src and dst tensors. + * + * Valid data layouts: + * - NDHWC + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:--------------|:------|:--------------| + * |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 | + * |QASYMM8 |QASYMM8 |S32 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | + * + * @param[in] compile_context The compile context to be used. + * @param[in] src0 Source tensor. 4 lower dimensions represent a single src [IFM, width, height, depth], + * while every optional dimension from 5 and above represents a batch of srcs. + * @param[in] src1 Weights tensor. Weights are a 5D tensor with dimensions [OFM, IFM, kernel_w, kernel_h, kernel_d]. + * @param[in] src2 Biases tensor. Shared biases supported. Biases are a 1D tensor with dimensions [OFM]. + * @param[out] dst Destination tensor. 4 lower dimensions represent a single dst [OFM, width, height, depth], while the rest represent a batch of dsts. + * @param[in] conv3d_info Contains strides, padding, rounding, activation, dilation and fast math information. Activation and fast math are currently unused. + * + */ + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + ITensorInfo *dst, + const Conv3dInfo &conv3d_info); + + /** Static function to check if given info will lead to a valid configuration + * + * Similar to ClDirectConv3d::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const Conv3dInfo &conv3d_info); + + // Inherited method overridden + void run(ITensorPack &tensors) override; + +private: + std::unique_ptr<IClKernel> _direct_conv3d_kernel{nullptr}; +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_DIRECT_CONV3D_H */ diff --git a/src/gpu/cl/operators/ClElementwiseOperations.cpp b/src/gpu/cl/operators/ClElementwiseOperations.cpp new file mode 100644 index 0000000000..1325371d19 --- /dev/null +++ b/src/gpu/cl/operators/ClElementwiseOperations.cpp @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2021 Arm Limited.
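A validation sketch matching the F32 row of the table above, assuming `using namespace arm_compute;`; the shapes and the Conv3dInfo, Size3D and Padding3D constructor arguments are assumptions to be checked against FunctionDescriptors.h:
// NDHWC is ordered [C, W, H, D, N]: 8 IFM over a 4x4x4 volume, 3x3x3 kernel, 16 OFM.
TensorInfo src(TensorShape(8U, 4U, 4U, 4U, 1U), 1, DataType::F32, DataLayout::NDHWC);
TensorInfo wei(TensorShape(16U, 8U, 3U, 3U, 3U), 1, DataType::F32, DataLayout::NDHWC);
TensorInfo dst(TensorShape(16U, 4U, 4U, 4U, 1U), 1, DataType::F32, DataLayout::NDHWC);
// Assumed ctor order: stride, padding, act_info, dilation, rounding, enable_fast_math.
const Conv3dInfo conv3d_info(Size3D(1U, 1U, 1U), Padding3D(1U, 1U, 1U), ActivationLayerInfo(), Size3D(1U, 1U, 1U), DimensionRoundingType::FLOOR, false);
const Status ok = opencl::ClDirectConv3d::validate(&src, &wei, nullptr, &dst, conv3d_info);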
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/operators/ClElementwiseOperations.h" + +#include "src/common/utils/Log.h" +#include "src/gpu/cl/kernels/ClElementwiseKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +void ClElementwiseDivision::configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info); + auto k = std::make_unique<kernels::ClArithmeticKernel>(); + k->configure(compile_context, ArithmeticOperation::DIV, src1, src2, dst, act_info); + _kernel = std::move(k); +} + +Status ClElementwiseDivision::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info) +{ + return kernels::ClArithmeticKernel::validate(ArithmeticOperation::DIV, src1, src2, dst, act_info); +} + +void ClElementwiseMax::configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info); + auto k = std::make_unique<kernels::ClArithmeticKernel>(); + k->configure(compile_context, ArithmeticOperation::MAX, src1, src2, dst, act_info); + _kernel = std::move(k); +} + +Status ClElementwiseMax::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info) +{ + return kernels::ClArithmeticKernel::validate(ArithmeticOperation::MAX, src1, src2, dst, act_info); +} + +void ClElementwiseMin::configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info); + auto k = std::make_unique<kernels::ClArithmeticKernel>(); + k->configure(compile_context, ArithmeticOperation::MIN, src1, src2, dst, act_info); + _kernel = std::move(k); +} + +Status ClElementwiseMin::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info) +{ + return kernels::ClArithmeticKernel::validate(ArithmeticOperation::MIN, src1, src2, dst, act_info); +} + +void ClElementwiseSquaredDiff::configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info) +{ + 
ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info); + auto k = std::make_unique<kernels::ClArithmeticKernel>(); + k->configure(compile_context, ArithmeticOperation::SQUARED_DIFF, src1, src2, dst, act_info); + _kernel = std::move(k); +} + +Status ClElementwiseSquaredDiff::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info) +{ + return kernels::ClArithmeticKernel::validate(ArithmeticOperation::SQUARED_DIFF, src1, src2, dst, act_info); +} + +void ClElementwisePower::configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info); + auto k = std::make_unique<kernels::ClArithmeticKernel>(); + k->configure(compile_context, ArithmeticOperation::POWER, src1, src2, dst, act_info); + _kernel = std::move(k); +} + +Status ClElementwisePower::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info) +{ + return kernels::ClArithmeticKernel::validate(ArithmeticOperation::POWER, src1, src2, dst, act_info); +} +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClElementwiseOperations.h b/src/gpu/cl/operators/ClElementwiseOperations.h new file mode 100644 index 0000000000..de7c018d75 --- /dev/null +++ b/src/gpu/cl/operators/ClElementwiseOperations.h @@ -0,0 +1,202 @@ +/* + * Copyright (c) 2021, 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_ELEMENTWISE_OPERATIONS_H +#define ARM_COMPUTE_CL_ELEMENTWISE_OPERATIONS_H + +#include "arm_compute/function_info/ActivationLayerInfo.h" + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClOperator.h" + +namespace arm_compute +{ +namespace opencl +{ +/** Basic function to run @ref opencl::kernels::ClArithmeticKernel for division + * + * @note The tensor data type for the inputs must be F16/F32. + * @note The function performs an arithmetic division between two tensors. + */ +class ClElementwiseDivision : public IClOperator +{ +public: + /** Configure function for a given list of arguments. + * + * @param[in] compile_context The compile context to be used. + * @param[in] src1 First source tensor info. Data types supported: F16/F32. + * @param[in] src2 Second source tensor info. Data types supported: same as @p src1.
+ * @param[out] dst Destination tensor info. Data types supported: same as @p src1. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + */ + void configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClElementwiseDivision::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); +}; + +/** Basic function to run @ref opencl::kernels::ClArithmeticKernel for max + * + * @note The tensor data type for the inputs must be U8/QASYMM8/S16/QSYMM16/S32/U32/F16/F32. + * @note The function performs a max operation between two tensors. + */ +class ClElementwiseMax : public IClOperator +{ +public: + /** Configure function for a given list of arguments. + * + * @param[in] compile_context The compile context to be used. + * @param[in] src1 First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32. + * @param[in] src2 Second source tensor info. Data types supported: same as @p src1. + * @param[out] dst Destination tensor info. Data types supported: same as @p src1. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + */ + void configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClElementwiseMax::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); +}; + +/** Basic function to run @ref opencl::kernels::ClArithmeticKernel for min + * + * @note The tensor data type for the inputs must be U8/QASYMM8/S16/QSYMM16/S32/U32/F16/F32. + * @note The function performs a min operation between two tensors. + */ +class ClElementwiseMin : public IClOperator +{ +public: + /** Configure function for a given list of arguments. + * + * @param[in] compile_context The compile context to be used. + * @param[in] src1 First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32. + * @param[in] src2 Second source tensor info. Data types supported: same as @p src1. + * @param[out] dst Destination tensor info. Data types supported: same as @p src1. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ */ + void configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClElementwiseMin::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); +}; + +/** Basic function to run @ref opencl::kernels::ClArithmeticKernel for squared difference + * + * @note The tensor data type for the inputs must be QASYMM8/U8/S16/QSYMM16/F16/F32. + * @note The function performs a squared difference operation between two tensors (i.e., out[i] = (in1[i] - in2[i])^2) + */ +class ClElementwiseSquaredDiff : public IClOperator +{ +public: + /** Configure function for a given list of arguments. + * + * @param[in] compile_context The compile context to be used. + * @param[in] src1 First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32. + * @param[in] src2 Second source tensor info. Data types supported: same as @p src1. + * @param[out] dst Destination tensor info. Data types supported: same as @p src1. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + */ + void configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClElementwiseSquaredDiff::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); +}; + +/** Basic function to run @ref opencl::kernels::ClArithmeticKernel for power + * + * @note The tensor data type for the inputs must be F16/F32. + * @note The function performs an elementwise power of in1 to in2 (i.e., out[i] = in1[i] ^ in2[i]) + */ +class ClElementwisePower : public IClOperator +{ +public: + /** Configure function for a given list of arguments. + * + * @param[in] compile_context The compile context to be used. + * @param[in] src1 First source tensor info. Data types supported: F16/F32. + * @param[in] src2 Second source tensor info. Data types supported: F16/F32. + * @param[out] dst Destination tensor info. Data types supported: F16/F32. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ */ + void configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClElementwisePower::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_ELEMENTWISE_OPERATIONS_H */ diff --git a/src/gpu/cl/operators/ClElementwiseUnary.cpp b/src/gpu/cl/operators/ClElementwiseUnary.cpp new file mode 100644 index 0000000000..914621183e --- /dev/null +++ b/src/gpu/cl/operators/ClElementwiseUnary.cpp @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
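Each binary class above is configured the same way; a sketch for ClElementwiseMax, assuming `using namespace arm_compute;`, an initialised CL backend, and that the kernel broadcasts the size-1 dimension as hypothesised here:
TensorInfo a(TensorShape(16U, 4U), 1, DataType::F32);
TensorInfo b(TensorShape(16U, 1U), 1, DataType::F32); // broadcast dimension
TensorInfo out(TensorShape(16U, 4U), 1, DataType::F32);
if (bool(opencl::ClElementwiseMax::validate(&a, &b, &out)))
{
    opencl::ClElementwiseMax max_op;
    max_op.configure(CLKernelLibrary::get().get_compile_context(), &a, &b, &out);
}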
+ */ +#include "src/gpu/cl/operators/ClElementwiseUnary.h" + +#include "src/common/utils/Log.h" +#include "src/gpu/cl/kernels/ClElementwiseUnaryKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +void ClRsqrt::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst) +{ + ARM_COMPUTE_LOG_PARAMS(src, dst); + auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>(); + k->configure(compile_context, src, dst, ElementWiseUnary::RSQRT); + _kernel = std::move(k); +} + +Status ClRsqrt::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::RSQRT); +} + +void ClExp::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst) +{ + ARM_COMPUTE_LOG_PARAMS(src, dst); + auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>(); + k->configure(compile_context, src, dst, ElementWiseUnary::EXP); + _kernel = std::move(k); +} + +Status ClExp::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::EXP); +} + +void ClNeg::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst) +{ + ARM_COMPUTE_LOG_PARAMS(src, dst); + auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>(); + k->configure(compile_context, src, dst, ElementWiseUnary::NEG); + _kernel = std::move(k); +} + +Status ClNeg::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::NEG); +} + +void ClSin::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst) +{ + ARM_COMPUTE_LOG_PARAMS(src, dst); + auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>(); + k->configure(compile_context, src, dst, ElementWiseUnary::SIN); + _kernel = std::move(k); +} + +Status ClSin::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::SIN); +} + +void ClAbs::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst) +{ + ARM_COMPUTE_LOG_PARAMS(src, dst); + auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>(); + k->configure(compile_context, src, dst, ElementWiseUnary::ABS); + _kernel = std::move(k); +} + +Status ClAbs::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::ABS); +} + +void ClLog::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst) +{ + ARM_COMPUTE_LOG_PARAMS(src, dst); + auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>(); + k->configure(compile_context, src, dst, ElementWiseUnary::LOG); + _kernel = std::move(k); +} + +Status ClLog::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::LOG); +} + +void ClRound::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst) +{ + ARM_COMPUTE_LOG_PARAMS(src, dst); + auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>(); + k->configure(compile_context, src, dst, ElementWiseUnary::ROUND); + _kernel = std::move(k); +} + +Status ClRound::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::ROUND); +} +} // 
namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClElementwiseUnary.h b/src/gpu/cl/operators/ClElementwiseUnary.h new file mode 100644 index 0000000000..a23b789ab5 --- /dev/null +++ b/src/gpu/cl/operators/ClElementwiseUnary.h @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_ELEMENTWISE_UNARY_H +#define ARM_COMPUTE_CL_ELEMENTWISE_UNARY_H + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClOperator.h" + +namespace arm_compute +{ +namespace opencl +{ +/** Basic function to perform inverse square root on an src tensor. */ +class ClRsqrt : public IClOperator +{ +public: + /** Initialize the function + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor info. Data types supported: F16/F32. + * @param[out] dst Destination tensor info. Data types supported: same as @p src. + */ + void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClRsqrt::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); +}; + +/** Basic function to perform exponential on an src tensor. */ +class ClExp : public IClOperator +{ +public: + /** Initialize the function + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor info. Data types supported: F16/F32. + * @param[out] dst Destination tensor info. Data types supported: same as @p src. + */ + void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClExp::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); +}; + +/** Basic function to negate an src tensor. */ +class ClNeg : public IClOperator +{ +public: + /** Initialize the function + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor info. Data types supported: F16/F32. + * @param[out] dst Destination tensor info. Data types supported: same as @p src. 
+ */ + void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClNeg::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); +}; + +/** Basic function to calculate sine of an src tensor. */ +class ClSin : public IClOperator +{ +public: + /** Initialize the function + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor info. Data types supported: F16/F32. + * @param[out] dst Destination tensor info. Data types supported: same as @p src. + */ + void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClSin::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); +}; + +/** Basic function to perform elementwise log on an src tensor. */ +class ClLog : public IClOperator +{ +public: + /** Initialize the function + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor info. Data types supported: F16/F32. + * @param[out] dst Destination tensor info. Data types supported: same as @p src. + */ + void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClLog::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); +}; + +/** Basic function to get the absolute value of an src tensor. */ +class ClAbs : public IClOperator +{ +public: + /** Initialize the function + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor info. Data types supported: F16/F32. + * @param[out] dst Destination tensor info. Data types supported: same as @p src. + */ + void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClAbs::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); +}; + +/** Basic function to get the round (to the nearest even) value of an src tensor. */ +class ClRound : public IClOperator +{ +public: + /** Initialize the function + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor info. Data types supported: F16/F32. + * @param[out] dst Destination tensor info. Data types supported: same as @p src. + */ + void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClRound::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_ELEMENTWISE_UNARY_H */ diff --git a/src/gpu/cl/operators/ClFill.cpp b/src/gpu/cl/operators/ClFill.cpp new file mode 100644 index 0000000000..817b15ab20 --- /dev/null +++ b/src/gpu/cl/operators/ClFill.cpp @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2021 Arm Limited. 
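All the unary classes above share the same two-call shape; a sketch for ClRsqrt with hypothetical tensors, assuming `using namespace arm_compute;` and an initialised CL backend:
TensorInfo src(TensorShape(32U), 1, DataType::F32);
TensorInfo dst(TensorShape(32U), 1, DataType::F32);
if (bool(opencl::ClRsqrt::validate(&src, &dst)))
{
    opencl::ClRsqrt rsqrt;
    rsqrt.configure(CLKernelLibrary::get().get_compile_context(), &src, &dst);
}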
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/operators/ClFill.h" + +#include "src/common/utils/Log.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/ClFillKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +void ClFill::configure(const ClCompileContext &compile_context, + ITensorInfo *tensor, + const PixelValue &constant_value, + Window *dst_window) +{ + ARM_COMPUTE_LOG_PARAMS(tensor, constant_value, dst_window); + auto k = std::make_unique<kernels::ClFillKernel>(); + k->configure(compile_context, tensor, constant_value, dst_window); + _kernel = std::move(k); +} + +Status ClFill::validate(const ITensorInfo *tensor, const PixelValue &constant_value, Window *dst_window) +{ + return kernels::ClFillKernel::validate(tensor, constant_value, dst_window); +} +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClFill.h b/src/gpu/cl/operators/ClFill.h new file mode 100644 index 0000000000..e13862aa6b --- /dev/null +++ b/src/gpu/cl/operators/ClFill.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2021, 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ARM_COMPUTE_CL_FILL_H +#define ARM_COMPUTE_CL_FILL_H + +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/Window.h" + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClOperator.h" + +namespace arm_compute +{ +namespace opencl +{ +/** Basic function to run @ref kernels::ClFillKernel */ +class ClFill : public IClOperator +{ +public: + /** Initialise the kernel's tensor and filling value + * + * @param[in] compile_context The compile context to be used. + * @param[in,out] tensor Source tensor info. Supported data types: All. + * @param[in] constant_value The value used to fill the planes of the tensor + * @param[in] window Window to be used in case setting only part of a tensor. Default is nullptr. + */ + void configure(const CLCompileContext &compile_context, + ITensorInfo *tensor, + const PixelValue &constant_value, + Window *window = nullptr); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to ClFill::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *tensor, const PixelValue &constant_value, Window *window = nullptr); +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_FILL_H */ diff --git a/src/gpu/cl/operators/ClFlatten.cpp b/src/gpu/cl/operators/ClFlatten.cpp new file mode 100644 index 0000000000..7532532c94 --- /dev/null +++ b/src/gpu/cl/operators/ClFlatten.cpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/gpu/cl/operators/ClFlatten.h" + +#include "src/common/utils/Log.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/ClReshapeKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +void ClFlatten::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst) +{ + ARM_COMPUTE_LOG_PARAMS(src, dst); + auto k = std::make_unique<kernels::ClReshapeKernel>(); + k->configure(compile_context, src, dst); + _kernel = std::move(k); +} + +Status ClFlatten::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + return kernels::ClReshapeKernel::validate(src, dst); +} +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClFlatten.h b/src/gpu/cl/operators/ClFlatten.h new file mode 100644 index 0000000000..d2ce3b701d --- /dev/null +++ b/src/gpu/cl/operators/ClFlatten.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_FLATTEN_H +#define ARM_COMPUTE_CL_FLATTEN_H + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClOperator.h" + +namespace arm_compute +{ +namespace opencl +{ +/** Basic function to flatten a given input */ +class ClFlatten : public IClOperator +{ +public: + /** Configure operator for a given list of arguments + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |All |All | + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor to flatten with at least 3 dimensions. + * The dimensions above the third will be interpreted as batches. Data types supported: All + * @param[in] dst Destination tensor with shape [w*h*d, input_batches] where: + * w = width input tensor, h = height input tensor and d = depth input tensor. 
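 * For example (illustrative sizes, not from the official docs): a source of shape
 * [w=4, h=3, d=2, batches=5] flattens to a destination of shape [4*3*2, 5] = [24, 5].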
+ * Data type supported: same as @p src + */ + void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClFlatten::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_FLATTEN_H */ diff --git a/src/gpu/cl/operators/ClFloor.cpp b/src/gpu/cl/operators/ClFloor.cpp new file mode 100644 index 0000000000..6790160172 --- /dev/null +++ b/src/gpu/cl/operators/ClFloor.cpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/operators/ClFloor.h" + +#include "src/common/utils/Log.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/ClFloorKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +void ClFloor::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst) +{ + ARM_COMPUTE_LOG_PARAMS(src, dst); + auto k = std::make_unique<kernels::ClFloorKernel>(); + k->configure(compile_context, src, dst); + _kernel = std::move(k); +} + +Status ClFloor::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + return kernels::ClFloorKernel::validate(src, dst); +} +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClFloor.h b/src/gpu/cl/operators/ClFloor.h new file mode 100644 index 0000000000..746147335e --- /dev/null +++ b/src/gpu/cl/operators/ClFloor.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_FLOOR_H +#define ARM_COMPUTE_CL_FLOOR_H + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClOperator.h" + +namespace arm_compute +{ +namespace opencl +{ +/** Basic function to run @ref kernels::ClFloorKernel */ +class ClFloor : public IClOperator +{ +public: + /** Configure operator for a given list of arguments + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor info. Data types supported: F16/F32. + * @param[in] dst Destination tensor info. Data type supported: same as @p src + */ + void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to ClFloor::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_FLOOR_H */ diff --git a/src/gpu/cl/operators/ClFullyConnected.cpp b/src/gpu/cl/operators/ClFullyConnected.cpp new file mode 100644 index 0000000000..6969ac8ab3 --- /dev/null +++ b/src/gpu/cl/operators/ClFullyConnected.cpp @@ -0,0 +1,698 @@ +/* + * Copyright (c) 2017-2021, 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/gpu/cl/operators/ClFullyConnected.h" + +#include "arm_compute/core/Size2D.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/gpu/cl/operators/ClConvertFullyConnectedWeights.h" +#include "src/gpu/cl/operators/ClFlatten.h" +#include "src/gpu/cl/operators/ClGemm.h" +#include "src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h" +#include "src/gpu/cl/operators/ClMatMul.h" +#include "src/gpu/cl/operators/ClTranspose.h" +#include "src/gpu/cl/utils/ClAuxTensorHandler.h" +#include "src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h" +#include "src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h" +#include "support/Cast.h" + +#include <algorithm> + +namespace arm_compute +{ +namespace opencl +{ +using namespace arm_compute::experimental; +using namespace arm_compute::misc::shape_calculator; + +namespace +{ +// Function to calculate batched tensor shape in format [M, 1, B0, B1 ..] which is the format matmul expects +inline TensorShape get_reshaped_matmul_tensor(const TensorShape &src) +{ + return TensorShape(src.x(), 1, src.y(), src.collapsed_from(2).z()); // Return value optimisation +} + +Status construct_gemmlowp_output_stage(const ITensorInfo &src, + const ITensorInfo &weights, + const ITensorInfo &dst, + GEMMLowpOutputStageInfo &gemmlowp_output_stage, + ActivationLayerInfo activation_info) +{ + gemmlowp_output_stage.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; + gemmlowp_output_stage.gemmlowp_offset = 0; + gemmlowp_output_stage.gemmlowp_multiplier = 0; + gemmlowp_output_stage.gemmlowp_shift = 0; + + const auto data_type = src.data_type(); + + // Configure output stage for quantized case + if (is_data_type_quantized_asymmetric(data_type)) + { + const QuantizationInfo oq_info = dst.quantization_info(); + const UniformQuantizationInfo iq_unif = src.quantization_info().uniform(); + const UniformQuantizationInfo wq_unif = weights.quantization_info().uniform(); + const UniformQuantizationInfo oq_unif = oq_info.uniform(); + + const auto output_quant_info = (dst.total_size() == 0) ? 
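/* A worked shape example for get_reshaped_matmul_tensor() above (hypothetical sizes,
   added for illustration): an FC lhs of shape [M=64, B0=2, B1=3, B2=4] becomes
   TensorShape(64, 1, 2, 12): x is kept as M, a unit dimension is inserted, and
   collapsed_from(2) folds dimensions 2 and upwards into a single batch of 3*4 = 12,
   matching the [M, 1, B0, B1 ..] layout ClMatMul expects. */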
iq_unif : oq_unif; + + const float multiplier = (iq_unif.scale * wq_unif.scale) / output_quant_info.scale; + int output_multiplier = 0; + int output_shift = 0; + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + + PixelValue type_min{}; + PixelValue type_max{}; + std::tie(type_min, type_max) = get_min_max(data_type); + + if (activation_info.enabled()) + { + std::tie(type_min, type_max) = + get_quantized_activation_min_max(activation_info, data_type, output_quant_info); + } + + // Set the GEMMLowp output stage info + gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset; + gemmlowp_output_stage.gemmlowp_multiplier = output_multiplier; + gemmlowp_output_stage.gemmlowp_shift = output_shift; + gemmlowp_output_stage.gemmlowp_multipliers.push_back(output_multiplier); + gemmlowp_output_stage.gemmlowp_shifts.push_back(output_shift); + type_min.get(gemmlowp_output_stage.gemmlowp_min_bound); + type_max.get(gemmlowp_output_stage.gemmlowp_max_bound); + } + + return Status{}; +} + +Status validate_mm(const ITensorInfo &src, + const ITensorInfo &weights, + const ITensorInfo *bias, + const ITensorInfo &dst, + const FullyConnectedLayerInfo &fc_info, + bool use_matmul) +{ + // Note : If input is dynamic and data is not batched, use matmul, else use gemm + const bool transpose_weights = fc_info.transpose_weights ? !fc_info.are_weights_reshaped : false; + const bool use_dynamic_gemm = + !use_matmul && !weights.are_values_constant() && transpose_weights; // use dynamic gemm as fallback for matmul + const bool is_quantized = is_data_type_quantized_asymmetric(src.data_type()); + + if (use_matmul) + { + const MatMulInfo m_info = MatMulInfo().adj_rhs(transpose_weights); + + // Note: LHS is reshaped here to match ClMatMul expectations of batch index - From [M, B0, B1] to [M, 1, B0, B1] + TensorInfo lhs_to_use = src.clone()->set_tensor_shape(get_reshaped_matmul_tensor(src.tensor_shape())); + + const GPUTarget gpu_target = CLScheduler::get().target(); + std::unique_ptr<cl_matmul::IClMatMulNativeKernelConfig> t = + cl_matmul::ClMatMulNativeKernelConfigurationFactory::create(gpu_target); + const MatMulKernelInfo kernel_info = t->configure(&lhs_to_use, &weights, m_info); + + return is_quantized ? 
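/* A worked example of the requantization set up by construct_gemmlowp_output_stage()
   above (hypothetical scales, added for illustration): with input scale 0.5, weight
   scale 0.1 and output scale 0.25, the effective multiplier is (0.5 * 0.1) / 0.25 = 0.2.
   calculate_quantized_multiplier() normalises it as 0.2 = 0.8 * 2^-2, giving
   output_multiplier = round(0.8 * 2^31) = 1717986918 and output_shift = 2, so the
   kernel can requantize the int32 accumulator with an integer multiply and shift. */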
kernels::ClMatMulLowpNativeKernel::validate(&lhs_to_use, &weights, bias, &dst, + kernel_info, fc_info.activation_info) + : kernels::ClMatMulNativeKernel::validate(&lhs_to_use, &weights, bias, &dst, kernel_info, + fc_info.activation_info); + } + else + { + GEMMLowpOutputStageInfo gemmlowp_output_stage; + ARM_COMPUTE_RETURN_ON_ERROR( + construct_gemmlowp_output_stage(src, weights, dst, gemmlowp_output_stage, fc_info.activation_info)); + + const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped + false, // is_b_reshaped + !use_dynamic_gemm, // reshape_b_only_on_first_run + 0, // depth_output_gemm3d + false, // reinterpret_input_as_3d + fc_info.retain_internal_weights, // retain_internal_weights + gemmlowp_output_stage, // gemmlowp_output_stage + fc_info.fp_mixed_precision, // fp_mixed_precision + false, // fast_math + true, // broadcast_bias + ActivationLayerInfo()); // activation_info + + if (is_quantized) + { + const UniformQuantizationInfo iq_info = src.quantization_info().uniform(); + const UniformQuantizationInfo wq_info = weights.quantization_info().uniform(); + + // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() + // Extract and negate src and weights offset + const QuantizationInfo src_quantization_info(iq_info.scale, -iq_info.offset); + const QuantizationInfo weights_quantization_info(wq_info.scale, -wq_info.offset); + + // Validate gemmlowp function + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyCore::validate( + &src.clone()->set_quantization_info(src_quantization_info), + &weights.clone()->set_quantization_info(weights_quantization_info), bias, &dst, gemm_info)); + } + else + { + ARM_COMPUTE_RETURN_ON_ERROR(ClGemm::validate(&src, &weights, bias, &dst, 1.f, 1.f, gemm_info)); + } + } + + return Status{}; +} +} // namespace + +ClFullyConnected::ClFullyConnected() + : _convert_weights(nullptr), + _flatten(nullptr), + _reshape_weights(nullptr), + _mm_gemm(nullptr), + _mm_gemmlowp(nullptr), + _matmul_native_kernel(nullptr), + _matmul_lowp_native_kernel(nullptr), + _aux_mem(Count) +{ +} + +ClFullyConnected::~ClFullyConnected() = default; + +void ClFullyConnected::configure_mm(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *bias, + ITensorInfo *dst, + const FullyConnectedLayerInfo &fc_info) +{ + // If weights are dynamic and matmul is supported use matmul, else use gemm + if (_use_matmul) + { + // Specify whether transpose weights is necessary in matmul info + const MatMulInfo mat_info = MatMulInfo().adj_rhs(_transpose_weights); + + // Note: MatMul does not need offset negation unlike gemm + // 1. Change shape when calling matmul to fit batch expectations. + _lhs_to_use = src->clone()->set_tensor_shape(get_reshaped_matmul_tensor(_lhs_to_use.tensor_shape())); + + // 2. Use heuristics to get kernel info object + const GPUTarget gpu_target = CLScheduler::get().target(); + std::unique_ptr<cl_matmul::IClMatMulNativeKernelConfig> kernel_config = + cl_matmul::ClMatMulNativeKernelConfigurationFactory::create(gpu_target); + MatMulKernelInfo kernel_info = kernel_config->configure(src, weights, mat_info); + + // 3. 
Configure relevant matmul kernel + if (_is_quantized) + { + _matmul_lowp_native_kernel = std::make_unique<kernels::ClMatMulLowpNativeKernel>(); + _matmul_lowp_native_kernel->set_target(gpu_target); + _matmul_lowp_native_kernel->configure(compile_context, src, weights, bias, dst, kernel_info, + fc_info.activation_info); + } + else + { + _matmul_native_kernel = std::make_unique<kernels::ClMatMulNativeKernel>(); + _matmul_native_kernel->set_target(gpu_target); + _matmul_native_kernel->configure(compile_context, src, weights, bias, dst, kernel_info, + fc_info.activation_info); + } + } + else + { + // Configure GEMM + GEMMLowpOutputStageInfo gemmlowp_output_stage; + construct_gemmlowp_output_stage(*src, *weights, *dst, gemmlowp_output_stage, fc_info.activation_info); + + const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped + false, // is_b_reshaped + !_dynamic_gemm, // reshape_b_only_on_first_run + 0, // depth_output_gemm3d + false, // reinterpret_input_as_3d + fc_info.retain_internal_weights, // retain_internal_weights + gemmlowp_output_stage, // gemmlowp_output_stage + fc_info.fp_mixed_precision, // fp_mixed_precision + false, // fast_math + true, // broadcast_bias + fc_info.activation_info); // activation_info + + if (_is_quantized) + { + // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() + // Extract and negate input and weights offset + const QuantizationInfo src_quantization_info = src->quantization_info(); + const QuantizationInfo weights_quantization_info = weights->quantization_info(); + + TensorInfo src_info = src->clone()->set_quantization_info(src_quantization_info); + TensorInfo weights_info = weights->clone()->set_quantization_info(weights_quantization_info); + + src_info.set_quantization_info( + QuantizationInfo(src_quantization_info.uniform().scale, -src_quantization_info.uniform().offset)); + weights_info.set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, + -weights_quantization_info.uniform().offset)); + + // Configure gemmlowp function + _mm_gemmlowp = std::make_unique<ClGemmLowpMatrixMultiplyCore>(); + _mm_gemmlowp->configure(compile_context, &src_info, &weights_info, bias, dst, gemm_info); + } + else + { + // Configure matrix multiply kernel + _mm_gemm = std::make_unique<ClGemm>(); + _mm_gemm->configure(compile_context, src, weights, bias, dst, 1.f, 1.f, gemm_info); + } + } +} + +void ClFullyConnected::configure_conv_fc(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *bias, + ITensorInfo *dst, + const FullyConnectedLayerInfo &fc_info) +{ + // MatMul fuses transpose operation, so we use the first dimension for comparison where appropriate. + ARM_COMPUTE_ERROR_ON((weights->dimension((_use_matmul && _transpose_weights) ? 
0 : 1) != + (src->dimension(0) * src->dimension(1) * src->dimension(2)))); + + // If the fully connected layer is called after a convolution layer, the input tensor must be linearized + + // Initialize output tensor for flatten + _flattened_src = src->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(compute_flatten_shape(src)) + .set_data_layout(DataLayout::NCHW); + + // Configure flatten kernel + _flatten = std::make_unique<ClFlatten>(); + _flatten->configure(compile_context, src, &_flattened_src); + + // Note: if flatten has > 1 dimensions after, these dimensions are batch + // Configure matrix multiply kernel + configure_mm(compile_context, &_flattened_src, weights, bias, dst, fc_info); +} + +void ClFullyConnected::configure_fc_fc(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *bias, + ITensorInfo *dst, + const FullyConnectedLayerInfo &fc_info) +{ + // MatMul fuses transpose operation, so we use the first dimension for comparison where appropriate. + ARM_COMPUTE_ERROR_ON(src->dimension(0) != weights->dimension((_use_matmul && _transpose_weights) ? 0 : 1)); + + // Configure matrix multiply kernel + configure_mm(compile_context, src, weights, bias, dst, fc_info); +} + +void ClFullyConnected::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + FullyConnectedLayerInfo fc_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); + const GPUTarget gpu_target = get_arch_from_target(CLScheduler::get().target()); + + // Perform validate step + ARM_COMPUTE_ERROR_THROW_ON(ClFullyConnected::validate(src, weights, biases, dst, fc_info)); + ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, fc_info); + + _transpose_weights = fc_info.transpose_weights ? !fc_info.are_weights_reshaped : false; + _is_fc_after_conv = true; + _is_quantized = is_data_type_quantized_asymmetric(src->data_type()); + _is_prepared = fc_info.retain_internal_weights; + _weights_to_use = TensorInfo(*weights); + _weights_to_use_idx = ACL_SRC_1; + + // When using dynamic weights - use matmul kernels. + // Note: MatMul is not used in the following cases (Gemm is used as fallback) : + // 1. When the weights tensor is not dynamic + // 2. MatMul does not support broadcasting batch dimension, and therefore is disabled if fc is batched. + // 3. 
When FC is after convolution and src tensor data layout does not match weights trained data layout (weights conversion kernel is required) + const bool is_batched_fc_layer = dst->dimension(1) > 1; + _use_matmul = gpu_target != GPUTarget::MIDGARD && !weights->are_values_constant() && !is_batched_fc_layer && + !(src->num_dimensions() > 1 && (src->data_layout() != fc_info.weights_trained_layout)); + _dynamic_gemm = !weights->are_values_constant() && _transpose_weights && !_use_matmul; + + // With the Fully Connected layer we can have 4 different cases: + // 1) Convolution layer -> Fully Connected layer without batches + // 2) Fully Connected layer -> Fully Connected layer without batches + // 3) Convolution layer -> Fully Connected layer with batches + // 4) Fully Connected layer -> Fully Connected layer with batches + + // Check if we have a fully connected layer with batches + if (is_batched_fc_layer) + { + _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && + (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(), + dst->tensor_shape().cbegin() + 1)); + } + else + { + _is_fc_after_conv = src->num_dimensions() > 1; + } + + ITensorInfo *weights_used = weights; + + // Reshape weights if needed - Not needed when matmul is in use as matmul fuses transpose op. + if (_transpose_weights && !_use_matmul) + { + // Reshape the weights + _reshape_weights = std::make_unique<ClTranspose>(); + _reshape_weights->configure(compile_context, weights, &_reshaped_weights); + weights_used = &_reshaped_weights; + _weights_to_use_idx = offset_int_vec(TransposedWeights); + } + + // Convert weights if needed + if (_is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout)) + { + // Convert weights + _convert_weights = std::make_unique<ClConvertFullyConnectedWeights>(); + _convert_weights->configure(compile_context, weights_used, &_converted_weights, src->tensor_shape(), + fc_info.weights_trained_layout); + + weights_used = &_converted_weights; + _weights_to_use_idx = offset_int_vec(ConvertedWeights); + _run_convert_weights = true; + } + + if (_is_fc_after_conv) + { + // Fully Connected layer after a Convolution Layer without batches + configure_conv_fc(compile_context, src, weights_used, biases, dst, fc_info); + } + else + { + // Fully Connected layer after a Fully Connected Layer without batches + configure_fc_fc(compile_context, src, weights_used, biases, dst, fc_info); + } + // Update TensorInfo of final weights used (Needs to be done at the end due to padding expansion) + _weights_to_use = *weights_used; + + if (_use_matmul) + { + // Note: MatMul does not use transpose and does not need auxiliary memory, so only converted weights are added to aux_mem + _aux_mem[ConvertedWeights] = + MemoryInfo(offset_int_vec(ConvertedWeights), MemoryLifetime::Temporary, _converted_weights.total_size()); + } + else + { + // Set auxiliary memory requirements for gemm operators + auto gemm_mem_req = (_is_quantized) ? _mm_gemmlowp->workspace() : _mm_gemm->workspace(); + for (unsigned int i = 0; i < gemm_mem_req.size(); ++i) + { + _aux_mem[i] = gemm_mem_req[i]; + } + if (_aux_mem[1].size > 0 || _aux_mem[2].size > 0) // Persistent weights memory on GEMMs + { + // Release permuted weights at the end of prepare as they are further transposed by the assembly dispatch + // Keep all the auxiliary tensors in case of dynamic weights as they are recalculated every time + _aux_mem[TransposedWeights] = MemoryInfo( + offset_int_vec(TransposedWeights), _dynamic_gemm ? 
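/* A shape example for the batched-FC detection above (hypothetical sizes, added for
   illustration): a convolution output src of shape [W=7, H=7, C=512, B=8] feeding a
   dst of shape [4096, 8] lands in case 3: the trailing batch dimensions src[3..] = {8}
   equal dst[1..] = {8}, so _is_fc_after_conv is true and the input is flattened to
   [7*7*512, 8] = [25088, 8] before the matrix multiply. */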
MemoryLifetime::Temporary : MemoryLifetime::Prepare, + _reshaped_weights.total_size()); + _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights), + _dynamic_gemm ? MemoryLifetime::Temporary : MemoryLifetime::Prepare, + _converted_weights.total_size()); + } + else + { + // Release permuted weights at the end of prepare as they are further transposed by the assembly dispatch + const auto transposed_wei_lft = (_weights_to_use_idx == offset_int_vec(TransposedWeights)) + ? MemoryLifetime::Persistent + : MemoryLifetime::Prepare; + const auto converted_wei_lft = (_weights_to_use_idx == offset_int_vec(ConvertedWeights)) + ? MemoryLifetime::Persistent + : MemoryLifetime::Prepare; + + _aux_mem[TransposedWeights] = MemoryInfo(offset_int_vec(TransposedWeights), + _dynamic_gemm ? MemoryLifetime::Temporary : transposed_wei_lft, + _reshaped_weights.total_size()); + _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights), + _dynamic_gemm ? MemoryLifetime::Temporary : converted_wei_lft, + _converted_weights.total_size()); + } + } + _aux_mem[FlattenedSrc] = + MemoryInfo(offset_int_vec(FlattenedSrc), MemoryLifetime::Temporary, _flattened_src.total_size()); +} + +Status ClFullyConnected::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + FullyConnectedLayerInfo fc_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights, dst); + ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2); + ARM_COMPUTE_RETURN_ERROR_ON( + fc_info.activation_info.enabled() && is_data_type_quantized(src->data_type()) && + fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU && + fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && + fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU); + const GPUTarget gpu_target = get_arch_from_target(CLScheduler::get().target()); + + const bool transpose_weights = fc_info.transpose_weights ? !fc_info.are_weights_reshaped : false; + bool is_fc_after_conv = true; + + // When using dynamic weights - use matmul kernels. + // Note: MatMul does not support broadcasting batch dimensions, so fall back to GEMM for batched cases. + const bool is_batched_fc_layer = dst->dimension(1) > 1; + const bool use_matmul = gpu_target != GPUTarget::MIDGARD && !weights->are_values_constant() && + !is_batched_fc_layer && + !(src->num_dimensions() > 1 && (src->data_layout() != fc_info.weights_trained_layout)); + + const ITensorInfo &flatten_src = TensorInfo(src->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(compute_flatten_shape(src)) + .set_data_layout(DataLayout::NCHW)); + const ITensorInfo &reshaped_weights = TensorInfo( + weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights))); + const ITensorInfo &converted_weights = (transpose_weights && !use_matmul) + ? 
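/* Lifetime semantics behind the choices above (as defined by the experimental memory
   API): MemoryLifetime::Temporary buffers live only within a single run() (dynamic
   weights are re-reshaped on every run), MemoryLifetime::Prepare buffers can be
   released once prepare() has finished (one-off reshape/convert of constant weights),
   and MemoryLifetime::Persistent buffers are kept across runs because the selected
   GEMM keeps reading them. */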
TensorInfo(*reshaped_weights.clone()) + : TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()); + + // With the Fully Connected layer we can have 4 different cases: + // 1) Convolution layer -> Fully Connected layer without batches + // 2) Fully Connected layer -> Fully Connected layer without batches + // 3) Convolution layer -> Fully Connected layer with batches + // 4) Fully Connected layer -> Fully Connected layer with batches + + const ITensorInfo *src_to_use = src; + const ITensorInfo *weights_to_use = weights; + + if (biases != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); + if (is_data_type_quantized(src->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases); + } + } + + // Check if FC is after conv (flatten kernel is run in case where FC is after conv.) + if (is_batched_fc_layer) + { + is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && + (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(), + dst->tensor_shape().cbegin() + 1)); + } + else + { + is_fc_after_conv = src->num_dimensions() > 1; + } + + // Transpose kernel does not run when matmul is supported as matmul fuses transpose op. + if (transpose_weights && !use_matmul) + { + // Validate reshape weights kernel + ARM_COMPUTE_RETURN_ON_ERROR(ClTranspose::validate(weights, &reshaped_weights)); + weights_to_use = &reshaped_weights; + } + + if (is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout)) + { + // Validate convert weights kernel + ARM_COMPUTE_RETURN_ON_ERROR(ClConvertFullyConnectedWeights::validate( + weights_to_use, &converted_weights, src->tensor_shape(), fc_info.weights_trained_layout)); + weights_to_use = &converted_weights; + } + + if (is_fc_after_conv) + { + // Fully Connected layer after a Convolution Layer without batches + // K Index of matrix multiplication. MatMul performs transpose in kernel, so index is 0 when matmul and transpose enabled + const int weight_idx = (use_matmul && transpose_weights) ? 0 : 1; + ARM_COMPUTE_RETURN_ERROR_ON( + (weights_to_use->dimension(weight_idx) != (src->dimension(0) * src->dimension(1) * src->dimension(2)))); + + // Validate flatten kernel + ARM_COMPUTE_RETURN_ON_ERROR(ClFlatten::validate(src, &flatten_src)); + src_to_use = &flatten_src; + } + else + { + // Fully Connected layer after a Fully Connected Layer without batches + // K Index of matrix multiplication. MatMul performs transpose in kernel, so index is 0 when matmul and transpose enabled + const int weight_idx = (use_matmul && transpose_weights) ? 
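/* An example of the K-index selection above (hypothetical shapes, added for
   illustration): for a conv output of [7, 7, 512, B], K = 7*7*512 = 25088. On the
   GEMM path with transpose_weights = true, the trained [K, N] = [25088, 4096] weights
   are first transposed to [4096, 25088] and dimension(1) is checked against K; on the
   MatMul path the transpose is fused into the kernel, the weights stay [25088, 4096]
   and dimension(0) is checked instead. */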
0 : 1; + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != weights_to_use->dimension(weight_idx)); + } + + // Validate matrix multiply kernel + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(*src_to_use, *weights_to_use, biases, *dst, fc_info, use_matmul)); + + return Status{}; +} + +void ClFullyConnected::run(ITensorPack &tensors) +{ + prepare(tensors); + +#ifdef ARM_COMPUTE_ASSERTS_ENABLED + ++_asrt_run_count; + ARM_COMPUTE_ERROR_ON(_dynamic_gemm && _asrt_prepare_count != _asrt_run_count); +#endif // ARM_COMPUTE_ASSERTS_ENABLED + + auto src = tensors.get_const_tensor(ACL_SRC_0); + + CLAuxTensorHandler flattened_src(offset_int_vec(FlattenedSrc), _flattened_src, tensors, false); + CLAuxTensorHandler weights(_weights_to_use_idx, _weights_to_use, tensors, false); + + // Linearize input if it comes from a convolutional layer + if (_is_fc_after_conv) + { + ITensorPack flatten_pack{{ACL_SRC, src}, {ACL_DST, flattened_src.get()}}; + _flatten->run(flatten_pack); + } + + ITensorPack gemm_pack = tensors; + gemm_pack.add_const_tensor(ACL_SRC_0, (_is_fc_after_conv) ? flattened_src.get() : src); + if (_weights_to_use_idx != ACL_SRC_1) + { + gemm_pack.add_const_tensor(ACL_SRC_1, weights.get()); + } + + // Run MatMul Op + if (_use_matmul) + { + // Run matmul kernels for matrix multiplication + if (_is_quantized) + { + CLScheduler::get().enqueue_op(*_matmul_lowp_native_kernel, gemm_pack, true); + } + else + { + CLScheduler::get().enqueue_op(*_matmul_native_kernel, gemm_pack, true); + } + } + else + { + // Run matrix multiply + if (_is_quantized) + { + _mm_gemmlowp->run(gemm_pack); + } + else + { + _mm_gemm->run(gemm_pack); + } + } +} + +void ClFullyConnected::prepare(ITensorPack &tensors) +{ + // Note : Running prepare() each run when _use_matmul is true is unnecessary unless weights conversion is needed. + if (!_is_prepared || _dynamic_gemm) + { +#ifdef ARM_COMPUTE_ASSERTS_ENABLED + ++_asrt_prepare_count; + ARM_COMPUTE_ERROR_ON(!_dynamic_gemm && !_use_matmul && _asrt_prepare_count > 1); +#endif // ARM_COMPUTE_ASSERTS_ENABLED + + auto weights = tensors.get_const_tensor(ACL_SRC_1); + + CLAuxTensorHandler reshaped_weights(offset_int_vec(TransposedWeights), _reshaped_weights, tensors, false); + CLAuxTensorHandler converted_weights(offset_int_vec(ConvertedWeights), _converted_weights, tensors, false); + + // Pointer to current weights + const ITensor *cur_weights = weights; + + // Reshape weights if needed. Disabled when matmul kernels are enabled as matmul fuses transpose. 
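/* A minimal sketch of how a caller drives this operator (tensor names are
   hypothetical; ACL_SRC_2 as the bias slot is an assumption; auxiliary buffers are
   bound through the pack using the offsets reported by workspace(), exactly as the
   CLAuxTensorHandler objects above resolve them):

       ITensorPack pack{{ACL_SRC_0, &src}, {ACL_SRC_1, &weights},
                        {ACL_SRC_2, &bias}, {ACL_DST, &dst}};
       fc.run(pack); // run() first calls prepare(), which performs the one-off
                     // reshape/convert of constant weights shown below
*/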
+ if (_transpose_weights && !_use_matmul) + { + // Run reshape weights kernel and mark weights as unused + ITensorPack transpose_pack{{ACL_SRC, weights}, {ACL_DST, reshaped_weights.get()}}; + _reshape_weights->run(transpose_pack); + + cur_weights->mark_as_unused(); + cur_weights = reshaped_weights.get(); + } + + // Convert weights if needed + if (_run_convert_weights) + { + ITensorPack convert_pack{{ACL_SRC, cur_weights}, {ACL_DST, converted_weights.get()}}; + _convert_weights->run(convert_pack); + + cur_weights->mark_as_unused(); + cur_weights = converted_weights.get(); + } + + ITensorPack gemm_pack = tensors; + gemm_pack.add_const_tensor(ACL_SRC_1, cur_weights); + + // Prepare GEMM prepare and release unused weights + if (_dynamic_gemm || !_use_matmul) + { + if (!_is_quantized) + { + _mm_gemm->prepare(gemm_pack); + } + else + { + _mm_gemmlowp->prepare(gemm_pack); + } + } + + _is_prepared = true; + } +} + +experimental::MemoryRequirements ClFullyConnected::workspace() const +{ + return _aux_mem; +} +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClFullyConnected.h b/src/gpu/cl/operators/ClFullyConnected.h new file mode 100644 index 0000000000..72884ff7ad --- /dev/null +++ b/src/gpu/cl/operators/ClFullyConnected.h @@ -0,0 +1,179 @@ +/* + * Copyright (c) 2017-2021, 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_GPU_CL_OPERATORS_CLFULLYCONNECTED_H +#define ACL_SRC_GPU_CL_OPERATORS_CLFULLYCONNECTED_H + +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/function_info/FullyConnectedLayerInfo.h" + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClOperator.h" +#include "src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h" +#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h" + +#include <memory> + +namespace arm_compute +{ +namespace opencl +{ +// Forward declarations +class ClConvertFullyConnectedWeights; +class ClFlatten; +class ClGemm; +class ClGemmLowpMatrixMultiplyCore; +class ClTranspose; +// Kernel Forward Declarations +namespace kernels +{ +class ClMatMulNativeKernel; +class ClMatMulLowpNativeKernel; +} // namespace kernels +/** Basic function to compute a Fully Connected layer on OpenCL. 
This function calls the following OpenCL operators and kernels: + * + * -# @ref opencl::ClFlatten (called when the input comes from a convolutional layer) + * -# @ref opencl::ClTranspose (if @p are_weights_reshaped is set to false and transpose_weights is set to true) (called once) + * -# @ref opencl::ClGemm or @ref opencl::ClGemmLowpMatrixMultiplyCore (if quantized asymmetric) + * -# @ref opencl::kernels::ClMatMulNativeKernel or @ref opencl::kernels::ClMatMulLowpNativeKernel (if dynamic, non-batched weights are used) + * + * @note The fully connected layer accepts "weights" tensors only with 2 dimensions. + */ +class ClFullyConnected : public IClOperator +{ +public: + ClFullyConnected(); + ~ClFullyConnected(); + /** Set the input and output tensors. + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:------------------|:------|:--------------| + * |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 | + * |QASYMM8 |QASYMM8 |S32 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] weights Weights tensor. The weights must be 2 dimensional. + * If this function is called after a Convolution Layer, the (transposed) weights will have as many rows as the product of the input's first 3 dimensions. + * If it is called after another FullyConnected Layer, the (transposed) weights will have as many rows as the input's first dimension. + * Data type supported: Same as @p src. + * @param[in] biases Bias tensor. Can be nullptr. Data type supported: Same as @p src, or S32 when @p src is quantized. + * @param[out] dst Destination tensor. Its shape should be equal to the output of a matrix multiplication between: + * - The output of flattening the input and the (transposed) 2D weights, if the function is called after a Convolution Layer + * - The input tensor and the (transposed) 2D weights, if the function is called after another FullyConnected Layer. + * Data type supported: Same as @p src. 
+ * @param[in] fc_info (Optional) Fully connected layer additional info + */ + void configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to ClFullyConnected::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); + + // Inherited methods overridden + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; + experimental::MemoryRequirements workspace() const override; + +private: + void configure_fc_fc(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *bias, + ITensorInfo *dst, + const FullyConnectedLayerInfo &fc_info); + void configure_conv_fc(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *bias, + ITensorInfo *dst, + const FullyConnectedLayerInfo &fc_info); + void configure_mm(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *bias, + ITensorInfo *dst, + const FullyConnectedLayerInfo &fc_info); + +private: + enum AuxTensorIdx + { + TransposedWeights = 10, + ConvertedWeights = 11, + FlattenedSrc = 12, + Count = 13 + }; + + std::unique_ptr<ClConvertFullyConnectedWeights> _convert_weights; + std::unique_ptr<ClFlatten> _flatten; + std::unique_ptr<ClTranspose> _reshape_weights; + std::unique_ptr<ClGemm> _mm_gemm; + std::unique_ptr<ClGemmLowpMatrixMultiplyCore> _mm_gemmlowp; + + std::unique_ptr<kernels::ClMatMulNativeKernel> _matmul_native_kernel; + std::unique_ptr<kernels::ClMatMulLowpNativeKernel> _matmul_lowp_native_kernel; + + experimental::MemoryRequirements _aux_mem{}; + + TensorInfo _flattened_src{}; + TensorInfo _converted_weights{}; + TensorInfo _reshaped_weights{}; + TensorInfo _lhs_to_use{}; + TensorInfo _weights_to_use{}; + int _weights_to_use_idx{ACL_SRC_1}; + + bool _run_convert_weights{false}; + bool _transpose_weights{false}; + bool _dynamic_gemm{false}; + bool _use_matmul{false}; + + bool _is_fc_after_conv{true}; + bool _is_quantized{false}; + bool _is_prepared{false}; + +#ifdef ARM_COMPUTE_ASSERTS_ENABLED + int _asrt_run_count{}; + int _asrt_prepare_count{}; +#endif // ARM_COMPUTE_ASSERTS_ENABLED +}; +} // namespace opencl +} // namespace arm_compute +#endif // ACL_SRC_GPU_CL_OPERATORS_CLFULLYCONNECTED_H diff --git a/src/gpu/cl/operators/ClGemm.cpp b/src/gpu/cl/operators/ClGemm.cpp new file mode 100644 index 0000000000..815c254c69 --- /dev/null +++ b/src/gpu/cl/operators/ClGemm.cpp @@ -0,0 +1,923 @@ +/* + * Copyright (c) 2017-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/operators/ClGemm.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/KernelDescriptors.h" +#include "arm_compute/core/Log.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/runtime/ITensorAllocator.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/core/utils/helpers/float_ops.h" +#include "src/gpu/cl/IClKernel.h" +#include "src/gpu/cl/utils/ClAuxTensorHandler.h" +#include "src/runtime/CL/gemm/CLGEMMKernelSelection.h" +#include "src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h" +#include "support/Cast.h" +#include "utils/TypePrinter.h" + +namespace arm_compute +{ +namespace opencl +{ +using namespace arm_compute::misc::shape_calculator; +using namespace arm_compute::cl_gemm; +using namespace arm_compute::experimental; +using namespace arm_compute::utils::cast; +using namespace arm_compute::opencl::kernels; + +namespace +{ +inline bool validate_gemm_kernel(CLGEMMKernelType kernel_type) +{ + return kernel_type == CLGEMMKernelType::NATIVE ? 
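/* Equivalent to: return kernel_type != CLGEMMKernelType::NATIVE; in other words, a
   gemm kernel type suggested by the mlgo heuristics is only accepted when it names
   one of the reshaped variants. */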
false : true; +} +//Automatically select between mlgo (prioritized) and default heuristics for gemm kernel type +inline CLGEMMKernelType +auto_select_gemm_kernel(auto_heuristics::CommonQuery query, bool reshape_b_only_on_first_run, bool constant_weights) +{ + if (!constant_weights) + { + return CLGEMMKernelType::NATIVE; + } + + auto gemm_kernel = auto_heuristics::select_mlgo_gemm_kernel(query, reshape_b_only_on_first_run); + if (bool(gemm_kernel)) + { + if (validate_gemm_kernel(gemm_kernel.gemm_type)) + { + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from mlgo heuristics: %s.", + to_string(gemm_kernel.gemm_type).c_str()); + return gemm_kernel.gemm_type; + } + } + gemm_kernel = auto_heuristics::select_default_gemm_kernel(query, reshape_b_only_on_first_run); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from default heuristics: %s.", + to_string(gemm_kernel.gemm_type).c_str()); + return gemm_kernel.gemm_type; +} +// Validate lhs_info and rhs_info for reshaped only rhs kernel +inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + GEMMKernelInfo gemm_kernel_info) +{ + // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel + TensorInfo tmp_b_info{}; + // Validate reshape RHS kernel + auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); + if (!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) + { + return false; + } + // Validate mm kernel + gemm_kernel_info.lhs_info = lhs_info; + gemm_kernel_info.rhs_info = rhs_info; + gemm_kernel_info.has_pad_y = false; + if (!bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, + rhs_info, gemm_kernel_info))) + { + return false; + } + gemm_kernel_info.has_pad_y = true; + if (!bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, + rhs_info, gemm_kernel_info))) + { + return false; + } + return true; +} + +//Automatically select between mlgo (prioritized) and default heuristics for reshaped only rhs kernel configs +inline std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query, + GEMMKernelInfo kernel_info, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output) +{ + auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(query); + if (config) + { + if (validate_lhs_rhs_info_reshaped_only_rhs(config.lhs_info, config.rhs_info, a, b, c, output, kernel_info)) + { + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "Use reshaped_only_rhs config from mlgo heuristics: LHS info: %s ; RHS info: %s ", + to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; + } + } + config = auto_heuristics::select_default_gemm_config_reshaped_only_rhs(query); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "Use reshaped_only_rhs config from default heuristics: LHS info: %s ; RHS info: %s ", + to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; +} + +// Validate lhs_info and rhs_info for reshaped kernel +inline bool validate_lhs_rhs_info_reshaped(const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const ITensorInfo *a, 
+ const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + GEMMKernelInfo gemm_kernel_info, + bool reinterpret_input_as_3d) +{ + // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped kernel + TensorInfo tmp_a_info{}; + TensorInfo tmp_b_info{}; + + // Validate reshape LHS kernel + auto_init_if_empty(tmp_a_info, + a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, reinterpret_input_as_3d))); + if (!bool(ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, reinterpret_input_as_3d))) + { + return false; + } + + // Validate reshape RHS kernel + auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); + if (!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) + { + return false; + } + // Validate mm kernel + gemm_kernel_info.lhs_info = lhs_info; + gemm_kernel_info.rhs_info = rhs_info; + if (!bool(ClGemmMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, + rhs_info, gemm_kernel_info))) + { + return false; + } + return true; +} + +//Automatically select between mlgo (prioritized) and default heuristics for reshaped kernel configs +inline std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +auto_select_gemm_config_reshaped(auto_heuristics::CommonQuery query, + GEMMKernelInfo kernel_info, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + bool reinterpret_input_as_3d) +{ + auto config = auto_heuristics::select_mlgo_gemm_config_reshaped(query); + if (config) + { + if (validate_lhs_rhs_info_reshaped(config.lhs_info, config.rhs_info, a, b, c, output, kernel_info, + reinterpret_input_as_3d)) + { + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "Use reshaped config from mlgo heuristics: LHS info: %s ; RHS info: %s ", + to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; + } + } + config = auto_heuristics::select_default_gemm_config_reshaped(query); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "Use reshaped config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), + to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; +} +} // namespace + +ClGemm::ClGemm() + : _reshape_lhs_kernel(std::make_unique<ClGemmReshapeLhsMatrixKernel>()), + _reshape_rhs_kernel(std::make_unique<ClGemmReshapeRhsMatrixKernel>()), + _mm_native_kernel(std::make_unique<ClGemmMatrixMultiplyNativeKernel>()), + _mm_reshaped_kernel(std::make_unique<ClGemmMatrixMultiplyReshapedKernel>()), + _mm_reshaped_only_rhs_kernel(std::make_unique<ClGemmMatrixMultiplyReshapedOnlyRhsKernel>()), + _mm_reshaped_only_rhs_mmul_kernel(std::make_unique<ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel>()), + _tmp_a(), + _tmp_b(), + _reshape_b_only_on_first_run(false), + _gemm_kernel_type(CLGEMMKernelType::NATIVE), + _is_prepared(false), + _aux_mem(AuxTensorIdx::Count) +{ +} + +void ClGemm::configure_native(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) +{ + DataType data_type = a->data_type(); + bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); + const unsigned int m = reinterpret_input_as_3d ? 
(a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const GPUTarget gpu_target = CLScheduler::get().target(); + bool broadcast_bias = gemm_info.broadcast_bias(); + + GEMMKernelInfo kernel_info; + kernel_info.m = m; + kernel_info.n = n; + kernel_info.k = k; + kernel_info.depth_output_gemm3d = depth_output_gemm3d; + kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d; + kernel_info.broadcast_bias = broadcast_bias; + kernel_info.activation_info = gemm_info.activation_info(); + + // Set the target for the kernels + _mm_native_kernel->set_target(gpu_target); + + auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs( + auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size}); + + // Configure and tune matrix multiply kernel + _mm_native_kernel->configure(compile_context, a, b, c, output, alpha, beta, config.lhs_info, config.rhs_info, + kernel_info); +} + +void ClGemm::configure_reshaped(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) +{ + DataType data_type = a->data_type(); + bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const GPUTarget gpu_target = CLScheduler::get().target(); + bool broadcast_bias = gemm_info.broadcast_bias(); + + GEMMKernelInfo kernel_info; + kernel_info.m = m; + kernel_info.n = n; + kernel_info.k = k; + kernel_info.depth_output_gemm3d = depth_output_gemm3d; + kernel_info.reinterpret_input_as_3d = false; + kernel_info.broadcast_bias = broadcast_bias; + kernel_info.activation_info = gemm_info.activation_info(); + + // Set the target for the kernels + _reshape_lhs_kernel->set_target(gpu_target); + _mm_reshaped_kernel->set_target(gpu_target); + + GEMMLHSMatrixInfo lhs_info{}; + GEMMRHSMatrixInfo rhs_info{}; + + // Pick up the GEMM configuration + std::tie(lhs_info, rhs_info) = + auto_select_gemm_config_reshaped(auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size}, + kernel_info, a, b, c, output, gemm_info.reinterpret_input_as_3d()); + + _reshape_lhs_kernel->configure(compile_context, a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d()); + _reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info); + + // Configure and tune matrix multiply kernel + _mm_reshaped_kernel->configure(compile_context, &_tmp_a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, + kernel_info); + + // Request memory for LHS and RHS reshape matrix + _aux_mem[LhsReshape] = MemoryInfo(offset_int_vec(LhsReshape), MemoryLifetime::Temporary, _tmp_a.total_size()); + _aux_mem[RhsReshape] = MemoryInfo( + offset_int_vec(RhsReshape), + _reshape_b_only_on_first_run ? 
MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); +} + +void ClGemm::configure_reshaped_only_rhs(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) +{ + DataType data_type = a->data_type(); + bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const GPUTarget gpu_target = CLScheduler::get().target(); + bool broadcast_bias = gemm_info.broadcast_bias(); + + GEMMKernelInfo kernel_info; + kernel_info.m = m; + kernel_info.n = n; + kernel_info.k = k; + kernel_info.depth_output_gemm3d = depth_output_gemm3d; + kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d; + kernel_info.broadcast_bias = broadcast_bias; + kernel_info.activation_info = gemm_info.activation_info(); + + // Set the target for the kernels + _mm_reshaped_only_rhs_kernel->set_target(gpu_target); + + GEMMLHSMatrixInfo lhs_info{}; + GEMMRHSMatrixInfo rhs_info{}; + + // Pick up the GEMM configuration + std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs( + auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size}, kernel_info, a, b, c, output); + + // Transpose matrix + _reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info); + + // Configure two variants of CLGEMMMatrixMultiplyReshapedOnlyRHSKernel (has_pad_y = false/true) + // During the prepare stage we check the padding requirement for the lhs and dst tensors. If they do not have + // pad y, we dispatch CLGEMMMatrixMultiplyReshapedOnlyRHSKernel with has_pad_y = false + + // Configure matrix multiply kernel with no y padding support + kernel_info.has_pad_y = false; + _mm_reshaped_only_rhs_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, + kernel_info); + + // Request memory for RHS reshape matrix + _aux_mem[RhsReshape] = MemoryInfo( + offset_int_vec(RhsReshape), + _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); +} + +void ClGemm::configure_reshaped_only_rhs_mmul(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) +{ + DataType data_type = a->data_type(); + bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const GPUTarget gpu_target = CLScheduler::get().target(); + bool broadcast_bias = gemm_info.broadcast_bias(); + + GEMMKernelInfo kernel_info; + kernel_info.m = m; + kernel_info.n = n; + kernel_info.k = k; + kernel_info.depth_output_gemm3d = depth_output_gemm3d; + kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d; + kernel_info.broadcast_bias = broadcast_bias; + kernel_info.activation_info = gemm_info.activation_info(); + + // Set the target for the kernels + _mm_reshaped_only_rhs_mmul_kernel->set_target(gpu_target); + + GEMMLHSMatrixInfo lhs_info{}; + GEMMRHSMatrixInfo rhs_info{}; + + // Pick up the GEMM configuration + auto gemm_config = select_default_gemm_config_reshaped_only_rhs( + auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size}); + lhs_info = gemm_config.lhs_info; + rhs_info = gemm_config.rhs_info; + // Force H0 to 4 in order to use the MMUL extension + rhs_info.h0 = 4; + + // Reshape Rhs matrix + _reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info); + + // Configure matrix multiply kernel with no y padding support + kernel_info.has_pad_y = false; + _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info, + rhs_info, kernel_info); + + // Request memory for RHS reshape matrix + _aux_mem[RhsReshape] = MemoryInfo( + offset_int_vec(RhsReshape), + _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); +} + +Status ClGemm::validate_native(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) +{ + ARM_COMPUTE_UNUSED(alpha); + ARM_COMPUTE_UNUSED(output); + + // Get the GPU target + const GPUTarget gpu_target = CLScheduler::get().target(); + DataType data_type = a->data_type(); + bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const bool broadcast_bias = gemm_info.broadcast_bias(); + + GEMMKernelInfo kernel_info; + kernel_info.m = m; + kernel_info.n = n; + kernel_info.k = k; + kernel_info.depth_output_gemm3d = depth_output_gemm3d; + kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d; + kernel_info.broadcast_bias = broadcast_bias; + kernel_info.activation_info = gemm_info.activation_info(); + + auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs( + auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size}); + + // Validate matrix multiply + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyNativeKernel::validate( + a, b, c, output, alpha, beta, config.lhs_info, config.rhs_info, kernel_info)); + + return Status{}; +} + +Status ClGemm::validate_reshaped(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) +{ + ARM_COMPUTE_UNUSED(alpha); + ARM_COMPUTE_UNUSED(output); + + TensorInfo tmp_a_info{}; + TensorInfo tmp_b_info{}; + + // Get the GPU target + const GPUTarget gpu_target = CLScheduler::get().target(); + DataType data_type = a->data_type(); + bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const bool broadcast_bias = gemm_info.broadcast_bias(); + + GEMMKernelInfo kernel_info; + kernel_info.m = m; + kernel_info.n = n; + kernel_info.k = k; + kernel_info.depth_output_gemm3d = depth_output_gemm3d; + kernel_info.reinterpret_input_as_3d = false; + kernel_info.broadcast_bias = broadcast_bias; + kernel_info.activation_info = gemm_info.activation_info(); + + GEMMLHSMatrixInfo lhs_info; + GEMMRHSMatrixInfo rhs_info; + + // Pick up the GEMM configuration + // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails + const auto gemm_config = + select_default_gemm_config_reshaped(auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size}); + lhs_info = gemm_config.lhs_info; + rhs_info = gemm_config.rhs_info; + + auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape( + compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d()))); + ARM_COMPUTE_RETURN_ON_ERROR( + ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d())); + + auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)); + + // Validate matrix multiply + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, alpha, + beta, lhs_info, rhs_info, kernel_info)); + + return Status{}; +} + +Status ClGemm::validate_reshaped_only_rhs(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) +{ + ARM_COMPUTE_UNUSED(alpha); + ARM_COMPUTE_UNUSED(output); + + TensorInfo tmp_b_info{}; + + // Get the GPU target + const GPUTarget 
gpu_target = CLScheduler::get().target(); + const DataType data_type = a->data_type(); + bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const bool broadcast_bias = gemm_info.broadcast_bias(); + + GEMMKernelInfo kernel_info; + kernel_info.m = m; + kernel_info.n = n; + kernel_info.k = k; + kernel_info.depth_output_gemm3d = depth_output_gemm3d; + kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d; + kernel_info.broadcast_bias = broadcast_bias; + kernel_info.activation_info = gemm_info.activation_info(); + + GEMMLHSMatrixInfo lhs_info; + GEMMRHSMatrixInfo rhs_info; + + // Pick up the GEMM configuration + // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails + const auto gemm_config = select_default_gemm_config_reshaped_only_rhs( + auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size}); + lhs_info = gemm_config.lhs_info; + rhs_info = gemm_config.rhs_info; + + auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)); + + // Validate matrix multiply + kernel_info.has_pad_y = false; + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate( + a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info)); + + kernel_info.has_pad_y = true; + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate( + a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info)); + + return Status{}; +} + +Status ClGemm::validate_reshaped_only_rhs_mmul(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) +{ + ARM_COMPUTE_UNUSED(alpha); + ARM_COMPUTE_UNUSED(output); + TensorInfo tmp_b_info{}; + + // Get the GPU target + const GPUTarget gpu_target = CLScheduler::get().target(); + const DataType data_type = a->data_type(); + bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const bool broadcast_bias = gemm_info.broadcast_bias(); + + GEMMKernelInfo kernel_info; + kernel_info.m = m; + kernel_info.n = n; + kernel_info.k = k; + kernel_info.depth_output_gemm3d = depth_output_gemm3d; + kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d; + kernel_info.broadcast_bias = broadcast_bias; + kernel_info.activation_info = gemm_info.activation_info(); + + GEMMLHSMatrixInfo lhs_info; + GEMMRHSMatrixInfo rhs_info; + + // Pick up the GEMM configuration + // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails + const auto gemm_config = select_default_gemm_config_reshaped_only_rhs( + auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size}); + lhs_info = gemm_config.lhs_info; + rhs_info = gemm_config.rhs_info; + // Force H0 to 4 in order to use the MMUL extension + rhs_info.h0 = 4; + + auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)); + + // Validate matrix multiply + kernel_info.has_pad_y = false; + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::validate( + a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info)); + + return Status{}; +} + +void ClGemm::configure(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(validate(a, b, c, output, alpha, beta, gemm_info)); + ARM_COMPUTE_LOG_PARAMS(a, b, c, output, alpha, beta, gemm_info); + + // Check if we need to reshape the matrix B only on the first run + _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run(); + _is_prepared = gemm_info.retain_internal_weights(); + + bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); + + // Select GEMMType + _gemm_kernel_type = auto_select_gemm_kernel( + auto_heuristics::CommonQuery{CLScheduler::get().target(), a->data_type(), m, n, k, batch_size}, + _reshape_b_only_on_first_run, b->are_values_constant()); + + const bool fuse_add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr); + + ITensorInfo *c_to_use = fuse_add_c ? 
c : nullptr; + + switch (_gemm_kernel_type) + { + case CLGEMMKernelType::NATIVE: + { + configure_native(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info); + break; + } + case CLGEMMKernelType::RESHAPED: + { + configure_reshaped(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info); + break; + } + case CLGEMMKernelType::RESHAPED_ONLY_RHS: + { + configure_reshaped_only_rhs(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info); + break; + } + case CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL: + { + configure_reshaped_only_rhs_mmul(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info); + break; + } + default: + { + ARM_COMPUTE_ERROR("GEMMType not supported"); + } + } +} + +Status ClGemm::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) +{ + // Get the GPU target + bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); + + // Check data type early because the auto_select_gemm_kernel has assertions on supported data types + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::F16); + + // Select GEMMType + CLGEMMKernelType gemm_kernel_type = auto_select_gemm_kernel( + auto_heuristics::CommonQuery{ + CLScheduler::get().target(), + a->data_type(), + m, + n, + k, + batch_size, + }, + gemm_info.reshape_b_only_on_first_run(), b->are_values_constant()); + + const bool fuse_add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr); + + const ITensorInfo *c_to_use = fuse_add_c ? 
c : nullptr; + + switch (gemm_kernel_type) + { + case CLGEMMKernelType::NATIVE: + { + ARM_COMPUTE_RETURN_ON_ERROR(validate_native(a, b, c_to_use, output, alpha, beta, gemm_info)); + break; + } + case CLGEMMKernelType::RESHAPED: + { + ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped(a, b, c_to_use, output, alpha, beta, gemm_info)); + break; + } + case CLGEMMKernelType::RESHAPED_ONLY_RHS: + { + ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_only_rhs(a, b, c_to_use, output, alpha, beta, gemm_info)); + break; + } + case CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL: + { + ARM_COMPUTE_RETURN_ON_ERROR( + validate_reshaped_only_rhs_mmul(a, b, c_to_use, output, alpha, beta, gemm_info)); + break; + } + default: + { + ARM_COMPUTE_RETURN_ERROR_MSG("GEMMType not supported"); + } + } + + return Status{}; +} + +void ClGemm::run(ITensorPack &tensors) +{ + const ITensor *lhs = tensors.get_const_tensor(ACL_SRC_0); + const ITensor *rhs = tensors.get_const_tensor(ACL_SRC_1); + ITensor *dst = tensors.get_tensor(ACL_DST); + + ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, dst); + + CLAuxTensorHandler lhs_reshaped(offset_int_vec(LhsReshape), _tmp_a, tensors, true); + CLAuxTensorHandler rhs_reshaped(offset_int_vec(RhsReshape), _tmp_b, tensors, true); + + // Prepare the consts if needed + prepare(tensors); + + // Run matrix multiply kernel + switch (_gemm_kernel_type) + { + case CLGEMMKernelType::NATIVE: + { + CLScheduler::get().enqueue_op(*_mm_native_kernel, tensors, true); + break; + } + case CLGEMMKernelType::RESHAPED: + { + // Run interleave kernel + ITensorPack reshape_lhs_pack{{ACL_SRC, lhs}, {ACL_DST, lhs_reshaped.get()}}; + CLScheduler::get().enqueue_op(*_reshape_lhs_kernel, reshape_lhs_pack, false); + + if (!_reshape_b_only_on_first_run) + { + // Run transpose kernel + ITensorPack reshape_rhs_pack{{ACL_SRC, rhs}, {ACL_DST, rhs_reshaped.get()}}; + CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, false); + } + // Copy original tensor pack and overwrite lhs and rhs with reshaped counterparts + ITensorPack gemm_reshaped_pack(tensors); + gemm_reshaped_pack.add_const_tensor(ACL_SRC_0, lhs_reshaped.get()); + gemm_reshaped_pack.add_const_tensor(ACL_SRC_1, rhs_reshaped.get()); + + if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED) + { + CLScheduler::get().enqueue_op(*_mm_reshaped_kernel, gemm_reshaped_pack, true); + } + break; + } + case CLGEMMKernelType::RESHAPED_ONLY_RHS: + { + if (!_reshape_b_only_on_first_run) + { + // Run transpose kernel + ITensorPack reshape_rhs_pack{{ACL_SRC, rhs}, {ACL_DST, rhs_reshaped.get()}}; + CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, false); + } + // In case of RESHAPED_ONLY_RHS, we need to check the padding requirement + // Check if the lhs or dst tensors have padding + const unsigned int cross_plane_pad_lhs = lhs->info()->padding().top + lhs->info()->padding().bottom; + const unsigned int cross_plane_pad_dst = dst->info()->padding().top + dst->info()->padding().bottom; + bool has_pad_y = (cross_plane_pad_lhs != 0) || (cross_plane_pad_dst != 0); + + // Copy original tensor pack and overwrite rhs with reshaped counterpart + ITensorPack gemm_reshaped_onlyrhs_pack(tensors); + gemm_reshaped_onlyrhs_pack.add_const_tensor(ACL_SRC_1, rhs_reshaped.get()); + + if (has_pad_y) + { + ARM_COMPUTE_ERROR_ON(has_pad_y); + } + else + { + CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_kernel, gemm_reshaped_onlyrhs_pack, true); + } + break; + } + case CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL: + { + if (!_reshape_b_only_on_first_run) + { + // Run transpose kernel + 
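+                // Note: for the MMUL variant the RHS reshape reuses the same ClGemmReshapeRhsMatrixKernel, here configured with the H0 = 4 layout that configure_reshaped_only_rhs_mmul() forces for the MMUL extension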
ITensorPack reshape_rhs_pack{{ACL_SRC, rhs}, {ACL_DST, rhs_reshaped.get()}};
+                CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, false);
+            }
+            // In case of RESHAPED_ONLY_RHS_MMUL, we need to check the padding requirement
+            // Check if the lhs or dst tensors have padding
+            const unsigned int cross_plane_pad_lhs = lhs->info()->padding().top + lhs->info()->padding().bottom;
+            const unsigned int cross_plane_pad_dst = dst->info()->padding().top + dst->info()->padding().bottom;
+            bool               has_pad_y           = (cross_plane_pad_lhs != 0) || (cross_plane_pad_dst != 0);
+
+            // Copy original tensor pack and overwrite rhs with reshaped counterpart
+            ITensorPack gemm_reshaped_onlyrhs_pack(tensors);
+            gemm_reshaped_onlyrhs_pack.add_const_tensor(ACL_SRC_1, rhs_reshaped.get());
+
+            if (has_pad_y)
+            {
+                ARM_COMPUTE_ERROR_ON(has_pad_y);
+            }
+            else
+            {
+                CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_mmul_kernel, gemm_reshaped_onlyrhs_pack, true);
+            }
+            break;
+        }
+        default:
+        {
+            ARM_COMPUTE_ERROR("GEMMType not supported");
+        }
+    }
+}
+
+void ClGemm::prepare(ITensorPack &constants)
+{
+    if (!_is_prepared)
+    {
+        const ITensor *src1 = constants.get_const_tensor(ACL_SRC_1);
+        ICLTensor     *rhs_aux =
+            utils::cast::polymorphic_downcast<ICLTensor *>(constants.get_tensor(offset_int_vec(RhsReshape)));
+
+        // If memory for RHS is persistent and src1 is provided, re-transform; else assume that RHS is already transformed
+        if ((_aux_mem[AuxTensorIdx::RhsReshape].lifetime == MemoryLifetime::Persistent) &&
+            (src1 != nullptr && rhs_aux != nullptr))
+        {
+            ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Transforming RHS Matrix!");
+
+            CLAuxTensorHandler rhs_reshaped(_tmp_b, *rhs_aux);
+            ARM_COMPUTE_ERROR_ON(rhs_reshaped.get()->cl_buffer().get() == nullptr);
+
+            ITensorPack reshape_rhs_pack{{ACL_SRC, src1}, {ACL_DST, rhs_reshaped.get()}};
+            CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, true);
+        }
+        _is_prepared = true;
+    }
+}
+
+experimental::MemoryRequirements ClGemm::workspace() const
+{
+    return _aux_mem;
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClGemm.h b/src/gpu/cl/operators/ClGemm.h
new file mode 100644
index 0000000000..85dc1d6c8f
--- /dev/null
+++ b/src/gpu/cl/operators/ClGemm.h
@@ -0,0 +1,207 @@
+/*
+ * Copyright (c) 2016-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMM_H
+#define ARM_COMPUTE_CL_GEMM_H
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/function_info/GEMMInfo.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTypes.h"
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+#include "src/gpu/cl/IClOperator.h"
+#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h"
+#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h"
+#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h"
+#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.h"
+#include "src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h"
+#include "src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to execute GEMM on OpenCL. This function calls the following OpenCL kernels:
+ *
+ * -# @ref kernels::ClGemmReshapeLhsMatrixKernel (only if RESHAPED is selected by the heuristic model)
+ * -# @ref kernels::ClGemmReshapeRhsMatrixKernel (only if either RESHAPED or RESHAPED_ONLY_RHS is selected by the select_gemm_kernel() method)
+ * -# @ref kernels::ClGemmMatrixMultiplyNativeKernel (only if NATIVE is selected by the select_gemm_kernel() method)
+ * -# @ref kernels::ClGemmMatrixMultiplyReshapedKernel (only if RESHAPED is selected by the select_gemm_kernel() method)
+ * -# @ref kernels::ClGemmMatrixMultiplyReshapedOnlyRhsKernel (only if RESHAPED_ONLY_RHS is selected by the select_gemm_kernel() method)
+ * -# @ref kernels::ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel (only if RESHAPED_ONLY_RHS_MMUL is selected by the select_gemm_kernel() method)
+ */
+class ClGemm : public IClOperator
+{
+public:
+    /** Constructor */
+    ClGemm();
+    /** Initialise the kernel's inputs and output
+     *
+     * Valid data layouts:
+     * - All
+     *
+     * Valid data type configurations:
+     * |src0         |src1        |src2      |dst            |
+     * |:------------|:-----------|:---------|:--------------|
+     * |F32          |F32         |F32       |F32            |
+     * |F16          |F16         |F16       |F16            |
+     *
+     * @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C].
+     *
+     * @note All tensors must have the same data type.
+     *
+     * @note Whilst the first input tensor can be a vector, the second input tensor must be at least a matrix
+     *
+     * @note Batched GEMM only allows RHS tensor's rank to be <= 3
+     * @note Batched GEMM only supports broadcasting cases where RHS rank < LHS rank but not the other way around
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  a               First input tensor (Matrix or Vector A). Data types supported: F16/F32
+     * @param[in]  b               Second input tensor (Matrix B). Data type supported: same as @p a.
+     * @param[in]  c               Third input tensor (Matrix C). It can be a nullptr if just the multiplication between @p a and @p b is needed. Data type supported: same as @p a.
+     * @param[out] output          Output tensor. Data type supported: same as @p a
+     * @param[in]  alpha           Weight of the matrix product
+     * @param[in]  beta            Weight of matrix C
+     * @param[in]  gemm_info       (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
+     *                             if the reshape of matrix B should happen only for the first run. GEMMInfo also contains information about the reshaping
+     *                             in case matrix A and matrix B have been already transformed.
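+     *
+     * A minimal usage sketch (illustrative only; the shapes and scalar values below are assumptions,
+     * not part of the original documentation):
+     * @code
+     * // dst = alpha * a * b + beta * c with F32 tensors; shapes are [x, y] = [cols, rows]
+     * TensorInfo a_info(TensorShape(64U, 32U), 1, DataType::F32);  // K = 64, M = 32
+     * TensorInfo b_info(TensorShape(16U, 64U), 1, DataType::F32);  // N = 16, K = 64
+     * TensorInfo dst_info(TensorShape(16U, 32U), 1, DataType::F32);
+     * ClGemm gemm;
+     * if (bool(ClGemm::validate(&a_info, &b_info, nullptr, &dst_info, 1.f, 0.f, GEMMInfo())))
+     * {
+     *     gemm.configure(CLKernelLibrary::get().get_compile_context(), &a_info, &b_info, nullptr, &dst_info, 1.f, 0.f, GEMMInfo());
+     * }
+     * @endcode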
+     */
+    void configure(const CLCompileContext &compile_context,
+                   ITensorInfo            *a,
+                   ITensorInfo            *b,
+                   ITensorInfo            *c,
+                   ITensorInfo            *output,
+                   float                   alpha,
+                   float                   beta,
+                   const GEMMInfo         &gemm_info);
+    /** Static function to check if the given info will lead to a valid configuration
+     *
+     * Similar to ClGemm::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *a,
+                           const ITensorInfo *b,
+                           const ITensorInfo *c,
+                           const ITensorInfo *output,
+                           float              alpha,
+                           float              beta,
+                           const GEMMInfo    &gemm_info);
+
+    // Inherited methods overridden:
+    void                             run(ITensorPack &tensors) override;
+    void                             prepare(ITensorPack &constants) override;
+    experimental::MemoryRequirements workspace() const override;
+
+private:
+    void configure_native(const CLCompileContext &compile_context,
+                          ITensorInfo            *a,
+                          ITensorInfo            *b,
+                          ITensorInfo            *c,
+                          ITensorInfo            *output,
+                          float                   alpha,
+                          float                   beta,
+                          const GEMMInfo         &gemm_info);
+    void configure_reshaped(const CLCompileContext &compile_context,
+                            ITensorInfo            *a,
+                            ITensorInfo            *b,
+                            ITensorInfo            *c,
+                            ITensorInfo            *output,
+                            float                   alpha,
+                            float                   beta,
+                            const GEMMInfo         &gemm_info);
+    void configure_reshaped_only_rhs(const CLCompileContext &compile_context,
+                                     ITensorInfo            *a,
+                                     ITensorInfo            *b,
+                                     ITensorInfo            *c,
+                                     ITensorInfo            *output,
+                                     float                   alpha,
+                                     float                   beta,
+                                     const GEMMInfo         &gemm_info);
+    void configure_reshaped_only_rhs_mmul(const CLCompileContext &compile_context,
+                                          ITensorInfo            *a,
+                                          ITensorInfo            *b,
+                                          ITensorInfo            *c,
+                                          ITensorInfo            *output,
+                                          float                   alpha,
+                                          float                   beta,
+                                          const GEMMInfo         &gemm_info);
+
+    static Status validate_native(const ITensorInfo *a,
+                                  const ITensorInfo *b,
+                                  const ITensorInfo *c,
+                                  const ITensorInfo *output,
+                                  float              alpha,
+                                  float              beta,
+                                  const GEMMInfo    &gemm_info);
+    static Status validate_reshaped(const ITensorInfo *a,
+                                    const ITensorInfo *b,
+                                    const ITensorInfo *c,
+                                    const ITensorInfo *output,
+                                    float              alpha,
+                                    float              beta,
+                                    const GEMMInfo    &gemm_info);
+    static Status validate_reshaped_only_rhs(const ITensorInfo *a,
+                                             const ITensorInfo *b,
+                                             const ITensorInfo *c,
+                                             const ITensorInfo *output,
+                                             float              alpha,
+                                             float              beta,
+                                             const GEMMInfo    &gemm_info);
+    static Status validate_reshaped_only_rhs_mmul(const ITensorInfo *a,
+                                                  const ITensorInfo *b,
+                                                  const ITensorInfo *c,
+                                                  const ITensorInfo *output,
+                                                  float              alpha,
+                                                  float              beta,
+                                                  const GEMMInfo    &gemm_info);
+
+private:
+    enum AuxTensorIdx
+    {
+        LhsReshape = 0,
+        RhsReshape,
+        Count
+    };
+
+private:
+    std::unique_ptr<kernels::ClGemmReshapeLhsMatrixKernel>                  _reshape_lhs_kernel;
+    std::unique_ptr<kernels::ClGemmReshapeRhsMatrixKernel>                  _reshape_rhs_kernel;
+    std::unique_ptr<kernels::ClGemmMatrixMultiplyNativeKernel>              _mm_native_kernel;
+    std::unique_ptr<kernels::ClGemmMatrixMultiplyReshapedKernel>            _mm_reshaped_kernel;
+    std::unique_ptr<kernels::ClGemmMatrixMultiplyReshapedOnlyRhsKernel>     _mm_reshaped_only_rhs_kernel;
+    std::unique_ptr<kernels::ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel> _mm_reshaped_only_rhs_mmul_kernel;
+    TensorInfo                                                              _tmp_a;
+    TensorInfo                                                              _tmp_b;
+    bool                                                                    _reshape_b_only_on_first_run;
+    CLGEMMKernelType                                                        _gemm_kernel_type;
+    bool                                                                    _is_prepared;
+    experimental::MemoryRequirements                                        _aux_mem{};
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMM_H */
diff --git a/src/gpu/cl/operators/ClGemmConv2d.cpp b/src/gpu/cl/operators/ClGemmConv2d.cpp
new file mode 100644
index 0000000000..55d815a1ef
--- /dev/null
+++ b/src/gpu/cl/operators/ClGemmConv2d.cpp
@@ -0,0 +1,668 @@
+/*
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/operators/ClGemmConv2d.h" + +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/Size2D.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/gpu/cl/kernels/ClActivationKernel.h" +#include "src/gpu/cl/kernels/ClCol2ImKernel.h" +#include "src/gpu/cl/kernels/ClIm2ColKernel.h" +#include "src/gpu/cl/kernels/ClWeightsReshapeKernel.h" +#include "src/gpu/cl/operators/ClGemm.h" +#include "src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h" +#include "src/gpu/cl/utils/ClAuxTensorHandler.h" +#include "support/Cast.h" + +namespace arm_compute +{ +using namespace experimental; +using namespace misc::shape_calculator; +using namespace utils::cast; +namespace opencl +{ +ClGemmConv2d::ClGemmConv2d() + : _weights_reshape_kernel(nullptr), + _im2col_kernel(nullptr), + _mm_gemm(nullptr), + _mm_gemmlowp(nullptr), + _col2im_kernel(nullptr), + _activation_kernel(nullptr), + _im2col_output(), + _weights_reshaped(), + _gemm_output(), + _skip_im2col(false), + _skip_col2im(false), + _is_quantized(false), + _fuse_activation(true), + _append_bias(false), + _is_prepared(false), + _aux_mem(AuxTensorIdx::Count) +{ +} +ClGemmConv2d::~ClGemmConv2d() = default; + +void ClGemmConv2d::configure_mm(const ClCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const GEMMLowpOutputStageInfo &gemmlowp_output_stage, + int gemm_3d_depth, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights); + ARM_COMPUTE_ERROR_THROW_ON( + validate_mm(src, weights, biases, dst, gemmlowp_output_stage, gemm_3d_depth, _skip_im2col, act_info)); + + const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped + false, // is_b_reshaped + true, // reshape_b_only_on_first_run + gemm_3d_depth, // depth_output_gemm3d + _skip_im2col, // reinterpret_input_as_3d + false, // retain_internal_weights + gemmlowp_output_stage, // gemmlowp_output_stage + false, // fast_math + false, // 
fp_mixed_precision
+                                         true,                  // broadcast_bias
+                                         act_info               // activation_info
+    );
+
+    TensorInfo tmp_src{*src};
+    if (_is_quantized)
+    {
+        // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
+        // Extract and negate input and weights offset
+        const QuantizationInfo input_quantization_info   = src->quantization_info();
+        const QuantizationInfo weights_quantization_info = weights->quantization_info();
+
+        tmp_src.set_quantization_info(
+            QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset));
+        weights->set_quantization_info(
+            QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
+
+        _mm_gemmlowp = std::make_unique<ClGemmLowpMatrixMultiplyCore>();
+        _mm_gemmlowp->configure(compile_context, &tmp_src, weights, biases, dst, gemm_info);
+
+        // Revert back QuantizationInfo as weights could be used in other convolution layers
+        weights->set_quantization_info(weights_quantization_info);
+
+        auto mm_mem_req = _mm_gemmlowp->workspace();
+        for (unsigned int cont = 0; cont < mm_mem_req.size(); ++cont)
+        {
+            _aux_mem[cont] = mm_mem_req[cont];
+        }
+    }
+    else
+    {
+        // Configure matrix multiply function
+        _mm_gemm = std::make_unique<ClGemm>();
+        _mm_gemm->configure(compile_context, &tmp_src, weights, biases, dst, 1.0f, 1.0f, gemm_info);
+        auto mm_mem_req = _mm_gemm->workspace();
+        for (unsigned int cont = 0; cont < mm_mem_req.size(); ++cont)
+        {
+            _aux_mem[cont] = mm_mem_req[cont];
+        }
+    }
+}
+
+Status ClGemmConv2d::validate_mm(const ITensorInfo             *src,
+                                 const ITensorInfo             *weights,
+                                 const ITensorInfo             *biases,
+                                 const ITensorInfo             *dst,
+                                 const GEMMLowpOutputStageInfo &gemmlowp_output_stage,
+                                 int                            gemm_3d_depth,
+                                 bool                           skip_im2col,
+                                 const ActivationLayerInfo     &act_info)
+{
+    const bool is_quantized = is_data_type_quantized_asymmetric(src->data_type());
+
+    const GEMMInfo &gemm_info = GEMMInfo(false,                 // is_a_reshaped
+                                         false,                 // is_b_reshaped
+                                         true,                  // reshape_b_only_on_first_run
+                                         gemm_3d_depth,         // depth_output_gemm3d
+                                         skip_im2col,           // reinterpret_input_as_3d
+                                         false,                 // retain_internal_weights
+                                         gemmlowp_output_stage, // gemmlowp_output_stage
+                                         false,                 // fast_math
+                                         false,                 // fp_mixed_precision
+                                         true,                  // broadcast_bias
+                                         act_info               // activation_info
+    );
+
+    if (is_quantized)
+    {
+        // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
+        // Extract and negate input and weights offset
+        const QuantizationInfo input_quantization_info   = src->quantization_info();
+        const QuantizationInfo weights_quantization_info = weights->quantization_info();
+
+        std::unique_ptr<ITensorInfo> src_qa     = src->clone();
+        std::unique_ptr<ITensorInfo> weights_qa = weights->clone();
+        src_qa->set_quantization_info(
+            QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset));
+        weights_qa->set_quantization_info(
+            QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
+
+        // Perform validation step on GEMMLowp
+        return ClGemmLowpMatrixMultiplyCore::validate(src_qa.get(), weights_qa.get(), biases, dst, gemm_info);
+    }
+    else
+    {
+        // Perform validation step on Matrix multiply function
+        return ClGemm::validate(src, weights, biases, dst, 1.0f, 1.0f, gemm_info);
+    }
+}
+
+void ClGemmConv2d::configure(const CLCompileContext &compile_context,
+                             ITensorInfo            *src,
+                             ITensorInfo            *weights,
+                             ITensorInfo            *biases,
+                             ITensorInfo            *dst,
+                             const Conv2dInfo       &conv2d_info,
+                             const WeightsInfo      &weights_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
+
+    ARM_COMPUTE_ERROR_THROW_ON(ClGemmConv2d::validate(src, weights, biases, dst, conv2d_info, weights_info));
+    ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv2d_info, weights_info);
+
+    const DataType   data_type   = src->data_type();
+    const DataLayout data_layout = src->data_layout();
+    const int        idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int        idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const int        idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+
+    const unsigned int kernel_width  = weights->dimension(idx_width);
+    const unsigned int kernel_height = weights->dimension(idx_height);
+    const unsigned int num_kernels   = weights->dimension(idx_kernels);
+
+    const UniformQuantizationInfo iq_info = src->quantization_info().uniform();
+    const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
+
+    _is_prepared  = weights_info.retain_internal_weights();
+    _is_quantized = is_data_type_quantized_asymmetric(src->data_type());
+    _skip_im2col  = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 &&
+                     conv2d_info.conv_info.stride().first == 1 && conv2d_info.conv_info.stride().second == 1);
+    _skip_col2im  = data_layout == DataLayout::NHWC;
+
+    // Only in the quantized case are there a few configurations where we cannot fuse the activation function in GEMM
+    _fuse_activation = true;
+
+    const ITensorInfo *gemm_input_to_use  = src;
+    ITensorInfo       *gemm_output_to_use = dst;
+
+    // Get parameters from conv_info
+    unsigned int stride_x = 0;
+    unsigned int stride_y = 0;
+    std::tie(stride_x, stride_y) = conv2d_info.conv_info.stride();
+
+    // Get convolved dimensions
+    unsigned int conv_w = 0;
+    unsigned int conv_h = 0;
+    std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width,
+                                                 kernel_height, conv2d_info.conv_info, conv2d_info.dilation);
+
+    unsigned int mat_weights_cols = num_kernels / conv2d_info.num_groups;
+
+    ITensorInfo *biases_to_use = biases;
+    _append_bias               = false;
+
+    _weights_reshape_kernel = std::make_unique<kernels::ClWeightsReshapeKernel>();
+    if (conv2d_info.num_groups != 1 && biases != nullptr)
+    {
+        // num_groups != 1 is only supported for NCHW
+        // Since a utility function to reshape the biases is missing, we append the biases to the weights tensor
+        biases_to_use = nullptr;
+        _append_bias  = true;
+        _weights_reshape_kernel->configure(compile_context, weights, biases, &_weights_reshaped,
+                                           conv2d_info.num_groups);
+    }
+    else
+    {
+        _weights_reshape_kernel->configure(compile_context, weights, nullptr, &_weights_reshaped,
+                                           conv2d_info.num_groups);
+    }
+
+    // Create tensor to store im2col reshaped inputs
+    if (!_skip_im2col)
+    {
+        // Configure and tune im2col.
im2col output shape is auto-initialized + _im2col_kernel = std::make_unique<opencl::kernels::ClIm2ColKernel>(); + + // Set the GPU target for im2col + _im2col_kernel->set_target(CLScheduler::get().target()); + _im2col_kernel->configure(compile_context, src, &_im2col_output, Size2D(kernel_width, kernel_height), + conv2d_info.conv_info, _append_bias, conv2d_info.dilation, conv2d_info.num_groups); + + // Set quantization info + _im2col_output.set_quantization_info(src->quantization_info()); + CLScheduler::get().tune_kernel_static(*_im2col_kernel); + + // Update GEMM input + gemm_input_to_use = &_im2col_output; + } + + // Create GEMM output tensor + if (!_skip_col2im) + { + TensorShape shape_gemm; + + // If we cannot skip col2im it means we run im2col as well + shape_gemm = _im2col_output.tensor_shape(); + shape_gemm.set(0, mat_weights_cols); + shape_gemm.set(1, conv_w * conv_h); + + _gemm_output = TensorInfo(shape_gemm, 1, data_type); + _gemm_output.set_quantization_info(dst->quantization_info()).set_data_layout(src->data_layout()); + + // Update GEMM output + gemm_output_to_use = &_gemm_output; + } + + GEMMLowpOutputStageInfo gemmlowp_output_stage; + gemmlowp_output_stage.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; + gemmlowp_output_stage.gemmlowp_offset = 0; + + // Configure output stage for quantized case + if (_is_quantized) + { + const auto output_quant_info = (dst->total_size() == 0) ? iq_info : oq_info; + const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->data_type()); + const unsigned int num_filters = (is_quantized_per_channel) ? num_kernels : 1; + + gemmlowp_output_stage.is_quantized_per_channel = is_quantized_per_channel; + + gemmlowp_output_stage.gemmlowp_multipliers.resize(num_filters); + gemmlowp_output_stage.gemmlowp_shifts.resize(num_filters); + quantization::compute_quantized_multipliers_and_shifts(src, weights, dst, + gemmlowp_output_stage.gemmlowp_multipliers.data(), + gemmlowp_output_stage.gemmlowp_shifts.data()); + gemmlowp_output_stage.gemmlowp_multiplier = gemmlowp_output_stage.gemmlowp_multipliers[0]; + gemmlowp_output_stage.gemmlowp_shift = gemmlowp_output_stage.gemmlowp_shifts[0]; + + PixelValue min_val{}; + PixelValue max_val{}; + std::tie(min_val, max_val) = get_min_max(dst->data_type()); + + auto min_activation = min_val.get<int32_t>(); + auto max_activation = max_val.get<int32_t>(); + + const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { + ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, + ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU}; + + if (conv2d_info.act_info.enabled()) + { + if (supported_acts.count(conv2d_info.act_info.activation()) != 0) + { + std::tie(min_activation, max_activation) = + get_quantized_activation_min_max(conv2d_info.act_info, data_type, output_quant_info); + } + else + { + _fuse_activation = false; + } + } + + // Set the GEMMLowp output stage info + gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset; + gemmlowp_output_stage.gemmlowp_min_bound = min_activation; + gemmlowp_output_stage.gemmlowp_max_bound = max_activation; + } + + // Configure and tune GEMM + // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix + const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? 
conv_h : 0; + + configure_mm(compile_context, gemm_input_to_use, &_weights_reshaped, biases_to_use, gemm_output_to_use, + gemmlowp_output_stage, gemm_3d_depth, conv2d_info.act_info); + + if (!_skip_col2im) + { + // Set the GPU target for col2im + _col2im_kernel = std::make_unique<opencl::kernels::ClCol2ImKernel>(); + _col2im_kernel->set_target(CLScheduler::get().target()); + // Configure and tune Col2Im + _col2im_kernel->configure(compile_context, gemm_output_to_use, dst, Size2D(conv_w, conv_h), + conv2d_info.num_groups); + CLScheduler::get().tune_kernel_static(*_col2im_kernel.get()); + } + + ARM_COMPUTE_ERROR_ON_MSG((dst->dimension(idx_width) != conv_w) || (dst->dimension(idx_height) != conv_h), + "Output shape does not match the expected one"); + + if (!_fuse_activation) + { + _activation_kernel = std::make_unique<opencl::kernels::ClActivationKernel>(); + _activation_kernel->configure(compile_context, dst, nullptr, conv2d_info.act_info); + } + + _aux_mem[Im2ColOutput] = + MemoryInfo(offset_int_vec(Im2ColOutput), MemoryLifetime::Temporary, _im2col_output.total_size()); + _aux_mem[WeightsReshaped] = + MemoryInfo(offset_int_vec(WeightsReshaped), MemoryLifetime::Persistent, _weights_reshaped.total_size()); + _aux_mem[GemmOutput] = MemoryInfo(offset_int_vec(GemmOutput), MemoryLifetime::Temporary, _gemm_output.total_size()); +} + +Status ClGemmConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const Conv2dInfo &conv2d_info, + const WeightsInfo &weights_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!"); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); + const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->data_type()); + + if (!is_quantized_per_channel) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); + } + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_layout() != DataLayout::NCHW), + "Grouping (num_groups != 1) with NHWC data layout is not supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_type() == DataType::QASYMM8), + "Grouping (num_groups != 1) is not supported with QASYMM8"); + ARM_COMPUTE_RETURN_ERROR_ON(((src->dimension(2) / weights->dimension(2)) != conv2d_info.num_groups) && + (src->data_layout() == DataLayout::NCHW)); + + const DataLayout data_layout = src->data_layout(); + const DataType data_type = src->data_type(); + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + const int idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES); + + const unsigned int kernel_width = weights->dimension(idx_width); + const unsigned int kernel_height = weights->dimension(idx_height); + const unsigned int num_kernels = weights->dimension(idx_kernels); + + TensorInfo im2col_reshaped_info{}; + TensorInfo info_gemm{}; + TensorInfo weights_reshaped_info{}; + const ITensorInfo *gemm_input_to_use = src; + const ITensorInfo *gemm_output_to_use = dst; 
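+    // These aliases are redirected below to the im2col / reshaped-weights / GEMM intermediate infos whenever the corresponding stage is not skipped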
+ const ITensorInfo *weights_to_use = weights; + const bool is_quantized = is_data_type_quantized_asymmetric(data_type); + const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && + conv2d_info.conv_info.stride().first == 1 && conv2d_info.conv_info.stride().second == 1); + const bool skip_col2im = data_layout == DataLayout::NHWC; + bool fuse_activation = true; + + ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(idx_channel) * conv2d_info.num_groups) != + src->dimension(idx_channel)); + ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); + + // Validate biases + if (biases != nullptr) + { + if (is_quantized) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases); + } + ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(idx_kernels)); + ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); + } + + if (conv2d_info.act_info.enabled()) + { + ARM_COMPUTE_ERROR_ON(conv2d_info.act_info.b() > conv2d_info.act_info.a()); + } + + // Get convolved dimensions + unsigned int conv_w = 0; + unsigned int conv_h = 0; + + std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width, + kernel_height, conv2d_info.conv_info, conv2d_info.dilation); + + unsigned int mat_weights_cols = num_kernels / conv2d_info.num_groups; + + const ITensorInfo *biases_to_use = biases; + bool append_bias = false; + + if (conv2d_info.num_groups != 1 && biases != nullptr) + { + // num_groups != 1 can only be for NCHW + // Since it is missing an utility function to reshape the biases, we append the biases into the weights tensor + biases_to_use = nullptr; + append_bias = true; + weights_reshaped_info = + TensorInfo(compute_weights_reshaped_shape(*weights, true, conv2d_info.num_groups), 1, data_type); + } + else + { + weights_reshaped_info = + TensorInfo(compute_weights_reshaped_shape(*weights, false, conv2d_info.num_groups), 1, data_type); + } + + weights_to_use = &weights_reshaped_info; + + if (!skip_im2col) + { + const Size2D kernel_dims(kernel_width, kernel_height); + + // Output tensor auto initialization if not yet initialized + TensorShape expected_output_shape = + compute_im2col_conv_shape(src, kernel_dims, conv2d_info.conv_info, append_bias, conv2d_info.dilation, + conv2d_info.num_groups == 1, conv2d_info.num_groups); + + auto_init_if_empty(im2col_reshaped_info, src->clone()->set_tensor_shape(expected_output_shape)); + + ARM_COMPUTE_RETURN_ON_ERROR( + opencl::kernels::ClIm2ColKernel::validate(src, &im2col_reshaped_info, kernel_dims, conv2d_info.conv_info, + append_bias, conv2d_info.dilation, conv2d_info.num_groups)); + gemm_input_to_use = &im2col_reshaped_info; + } + + // Create GEMM output tensor + if (!skip_col2im) + { + TensorShape shape_gemm; + + shape_gemm = gemm_input_to_use->tensor_shape(); + shape_gemm.set(0, mat_weights_cols); + shape_gemm.set(1, conv_w * conv_h); + + info_gemm = TensorInfo(shape_gemm, 1, data_type); + info_gemm.set_quantization_info(dst->quantization_info()).set_data_layout(src->data_layout()); + gemm_output_to_use = &info_gemm; + } + + GEMMLowpOutputStageInfo gemmlowp_output_stage; + gemmlowp_output_stage.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; + gemmlowp_output_stage.gemmlowp_offset = 0; + gemmlowp_output_stage.is_quantized_per_channel = is_quantized_per_channel; + + if (is_quantized) + { + const UniformQuantizationInfo iq_info = 
src->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); + const auto output_quant_info = (dst->total_size() == 0) ? iq_info : oq_info; + const unsigned int num_filters = (is_quantized_per_channel) ? num_kernels : 1; + + gemmlowp_output_stage.gemmlowp_multipliers.resize(num_filters); + gemmlowp_output_stage.gemmlowp_shifts.resize(num_filters); + quantization::compute_quantized_multipliers_and_shifts(src, weights, dst, + gemmlowp_output_stage.gemmlowp_multipliers.data(), + gemmlowp_output_stage.gemmlowp_shifts.data()); + gemmlowp_output_stage.gemmlowp_multiplier = gemmlowp_output_stage.gemmlowp_multipliers[0]; + gemmlowp_output_stage.gemmlowp_shift = gemmlowp_output_stage.gemmlowp_shifts[0]; + + int min_activation = 0; + int max_activation = 0; + + const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { + ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, + ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU}; + + if (conv2d_info.act_info.enabled()) + { + if (supported_acts.count(conv2d_info.act_info.activation()) != 0) + { + std::tie(min_activation, max_activation) = + get_quantized_activation_min_max(conv2d_info.act_info, data_type, output_quant_info); + } + else + { + fuse_activation = false; + } + } + + // Set the GEMMLowp output stage info + gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset; + gemmlowp_output_stage.gemmlowp_min_bound = min_activation; + gemmlowp_output_stage.gemmlowp_max_bound = max_activation; + } + + // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix + const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0; + + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases_to_use, gemm_output_to_use, + gemmlowp_output_stage, gemm_3d_depth, skip_im2col, conv2d_info.act_info)); + + // Validate Col2Im + if (!skip_col2im) + { + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::ClCol2ImKernel::validate(gemm_output_to_use, dst, Size2D(conv_w, conv_h), conv2d_info.num_groups)); + } + + // Validate Activation Layer + if (!fuse_activation) + { + ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClActivationKernel::validate(dst, nullptr, conv2d_info.act_info)); + } + + return Status{}; +} + +void ClGemmConv2d::run(ITensorPack &tensors) +{ + prepare(tensors); + + auto src = tensors.get_const_tensor(ACL_SRC_0); + auto biases = tensors.get_const_tensor(ACL_SRC_2); + auto dst = tensors.get_tensor(ACL_DST); + auto gemm_input_to_use = src; + auto gemm_output_to_use = dst; + + CLAuxTensorHandler im2col_output(offset_int_vec(Im2ColOutput), _im2col_output, tensors, false); + CLAuxTensorHandler gemm_output(offset_int_vec(GemmOutput), _gemm_output, tensors, false); + CLAuxTensorHandler weights_reshaped(offset_int_vec(WeightsReshaped), _weights_reshaped, tensors, false); + + // Run im2col + if (!_skip_im2col) + { + ITensorPack pack = {{TensorType::ACL_SRC, src}, {TensorType::ACL_DST, im2col_output.get()}}; + CLScheduler::get().enqueue_op(*_im2col_kernel, pack, false); + gemm_input_to_use = im2col_output.get(); + } + if (!_skip_col2im) + { + gemm_output_to_use = gemm_output.get(); + } + ITensorPack pack_mm = tensors; + pack_mm.add_const_tensor(TensorType::ACL_SRC_0, gemm_input_to_use); + pack_mm.add_const_tensor(TensorType::ACL_SRC_1, weights_reshaped.get()); + if (!_append_bias) + { + pack_mm.add_const_tensor(TensorType::ACL_SRC_2, biases); + } + 
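+    // When the bias was appended during weights reshaping, it already lives in the reshaped weights and must not be re-added through ACL_SRC_2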
pack_mm.add_tensor(TensorType::ACL_DST, gemm_output_to_use); + // Runs ClGemm or ClGemmLowpMatrixMultiplyCore functions + if (_is_quantized) + { + // Run gemmlowp + _mm_gemmlowp->run(pack_mm); + } + else + { + // Run gemm + _mm_gemm->run(pack_mm); + } + + // Reshape output matrix + if (!_skip_col2im) + { + ITensorPack pack = {{TensorType::ACL_SRC, gemm_output_to_use}, {TensorType::ACL_DST, dst}}; + CLScheduler::get().enqueue_op(*_col2im_kernel.get(), pack, false); + } + + //Run Activation Layer if we cannot fuse in GEMM + if (!_fuse_activation) + { + ITensorPack pack = {{TensorType::ACL_SRC, dst}, {TensorType::ACL_DST, dst}}; + CLScheduler::get().enqueue_op(*_activation_kernel.get(), pack, false); + } +} + +void ClGemmConv2d::prepare(ITensorPack &tensors) +{ + if (!_is_prepared) + { + // Run weights reshaping and mark original weights tensor as unused + ICLTensor *weights_reshaped_p = + utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(offset_int_vec(WeightsReshaped))); + CLAuxTensorHandler weights_reshaped(_weights_reshaped, *weights_reshaped_p); + auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); + ITensorPack pack = {{TensorType::ACL_SRC, weights}, {TensorType::ACL_DST, weights_reshaped.get()}}; + + if (_append_bias) + { + const auto biases = tensors.get_const_tensor(TensorType::ACL_SRC_2); + pack.add_const_tensor(TensorType::ACL_BIAS, biases); + } + CLScheduler::get().enqueue_op(*_weights_reshape_kernel.get(), pack, true); + tensors.add_const_tensor(TensorType::ACL_SRC_1, weights_reshaped.get()); + + // Prepare GEMM + _is_quantized ? _mm_gemmlowp->prepare(tensors) : _mm_gemm->prepare(tensors); + _is_prepared = true; + } +} +experimental::MemoryRequirements ClGemmConv2d::workspace() const +{ + return _aux_mem; +} +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClGemmConv2d.h b/src/gpu/cl/operators/ClGemmConv2d.h new file mode 100644 index 0000000000..e8f3147ac3 --- /dev/null +++ b/src/gpu/cl/operators/ClGemmConv2d.h @@ -0,0 +1,206 @@ +/* + * Copyright (c) 2021, 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ACL_SRC_GPU_CL_OPERATORS_CLGEMMCONV2D_H +#define ACL_SRC_GPU_CL_OPERATORS_CLGEMMCONV2D_H + +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/FunctionDescriptors.h" + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClOperator.h" + +#include <memory> + +namespace arm_compute +{ +namespace opencl +{ +class ClGemm; +class ClGemmLowpMatrixMultiplyCore; +namespace kernels +{ +class ClIm2ColKernel; +class ClCol2ImKernel; +class ClWeightsReshapeKernel; +class ClActivationKernel; +} // namespace kernels + +/** Basic function to compute the convolution layer. This function calls the following OpenCL kernels/functions: + * + * -# @ref opencl::kernels::ClIm2ColKernel + * -# @ref ClGemm (if the data type is FP32 or FP16) + * -# @ref CLGEMMLowpMatrixMultiplyCore (if the data type is QASYMM8/QASYMM8_SIGNED) + * -# @ref ClGemmLowpOutputStage with QUANTIZE_DOWN_FIXEDPOINT type of quantization (if the data type is QASYMM8/QASYMM8_SIGNED) + * -# @ref opencl::kernels::ClCol2ImKernel (if NCHW data layout) + * -# @ref opencl::kernels::ClActivationKernel + */ +class ClGemmConv2d : public IClOperator +{ +public: + /** Constructor */ + ClGemmConv2d(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + ClGemmConv2d(const ClGemmConv2d &) = delete; + /** Default move constructor */ + ClGemmConv2d(ClGemmConv2d &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + ClGemmConv2d &operator=(const ClGemmConv2d &) = delete; + /** Default move assignment operator */ + ClGemmConv2d &operator=(ClGemmConv2d &&) = default; + /**Default destructor */ + ~ClGemmConv2d(); + /** Set the input and output tensors. + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:------------------|:--------|:--------------| + * |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 | + * |QASYMM8 |QASYMM8 |S32 |QASYMM8 | + * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | + * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED | + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor info. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. + * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. + * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8 or QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8_SIGNED. + * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. + * Data type supported: Should match @p input data type, except for input of quantized type where biases should be of S32 type. + * @param[out] dst Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. + * Data types supported: Same as @p input. + * @param[in] conv2d_info Contains convolution 2d info described in @ref Conv2dInfo. + * @param[in] weights_info Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel. 
If this is not part of the fully connected layer, the weights
+     *                            tensor has also been transposed with ClGemmReshapeRhsMatrixKernel. Data type supported: Same as @p input.
+     */
+    void configure(const ClCompileContext &compile_context,
+                   ITensorInfo *src,
+                   ITensorInfo *weights,
+                   ITensorInfo *biases,
+                   ITensorInfo *dst,
+                   const Conv2dInfo &conv2d_info,
+                   const WeightsInfo &weights_info = WeightsInfo());
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to ClGemmConv2d::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input,
+                           const ITensorInfo *weights,
+                           const ITensorInfo *biases,
+                           const ITensorInfo *output,
+                           const Conv2dInfo &conv2d_info,
+                           const WeightsInfo &weights_info = WeightsInfo());
+
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
+    void prepare(ITensorPack &constants) override;
+    experimental::MemoryRequirements workspace() const override;
+
+private:
+    /** Configures the appropriate matrix multiply routine
+     *
+     * @param[in]      compile_context       The compile context to be used.
+     * @param[in]      src                   Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in]      weights               Weights tensor info. Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8 or
+     *                                       QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8_SIGNED.
+     * @param[in]      biases                Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+     *                                       Data type supported: Should match @p input data type, except for input of quantized type where biases should be of S32 type.
+     * @param[in, out] dst                   Output tensor info. Data types supported: same as @p input.
+     * @param[in]      gemmlowp_output_stage GEMMLowp output stage info
+     * @param[in]      gemm_3d_depth         Depth of GEMM 3D
+     * @param[in]      act_info              Activation to apply after the matrix multiplication
+     */
+    void configure_mm(const CLCompileContext &compile_context,
+                      const ITensorInfo *src,
+                      ITensorInfo *weights,
+                      ITensorInfo *biases,
+                      ITensorInfo *dst,
+                      const GEMMLowpOutputStageInfo &gemmlowp_output_stage,
+                      int gemm_3d_depth,
+                      const ActivationLayerInfo &act_info);
+    /** Static function to check if given info will lead to a valid configuration of @ref ClGemmConv2d matrix multiply routines
+     *
+     * @param[in] src                   Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in] weights               Weights tensor info. Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8 or
+     *                                  QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8_SIGNED.
+     * @param[in] biases                Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+     *                                  Data type supported: Should match @p input data type, except for input of quantized type where biases should be of S32 type.
+     * @param[in] dst                   Output tensor info. Data types supported: same as @p input.
+     * @param[in] gemmlowp_output_stage GEMMLowp output stage info
+     * @param[in] gemm_3d_depth         Depth of GEMM 3D
+     * @param[in] skip_im2col           Flag which specifies if im2col has to be skipped (e.g. 1x1 convolution with NHWC data layout). 
+ * @param[in] act_info Activation to apply after the matrix multiplication + * + * @return a status + */ + static Status validate_mm(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo &gemmlowp_output_stage, + int gemm_3d_depth, + bool skip_im2col, + const ActivationLayerInfo &act_info); + + enum AuxTensorIdx + { + // ClGemmLowpMatrixMultiplyCore has up to 7 internal tensors + Im2ColOutput = 8, + WeightsReshaped, + GemmOutput, + Count + }; + + std::unique_ptr<kernels::ClWeightsReshapeKernel> _weights_reshape_kernel; + std::unique_ptr<kernels::ClIm2ColKernel> _im2col_kernel; + std::unique_ptr<ClGemm> _mm_gemm; + std::unique_ptr<ClGemmLowpMatrixMultiplyCore> _mm_gemmlowp; + std::unique_ptr<opencl::kernels::ClCol2ImKernel> _col2im_kernel; + std::unique_ptr<kernels::ClActivationKernel> _activation_kernel; + + TensorInfo _im2col_output; + TensorInfo _weights_reshaped; + TensorInfo _gemm_output; + + bool _skip_im2col; + bool _skip_col2im; + bool _is_quantized; + bool _fuse_activation; + bool _append_bias; + bool _is_prepared; + + experimental::MemoryRequirements _aux_mem; +}; +} // namespace opencl +} // namespace arm_compute +#endif // ACL_SRC_GPU_CL_OPERATORS_CLGEMMCONV2D_H diff --git a/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp new file mode 100644 index 0000000000..71c247de79 --- /dev/null +++ b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp @@ -0,0 +1,950 @@ +/* + * Copyright (c) 2017-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h" + +#include "arm_compute/core/Log.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" + +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/gpu/cl/kernels/ClCastKernel.h" +#include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h" +#include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h" +#include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h" +#include "src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h" +#include "src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h" +#include "src/gpu/cl/kernels/ClGemmLowpReductionKernel.h" +#include "src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h" +#include "src/gpu/cl/utils/ClAuxTensorHandler.h" +#include "src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h" + +namespace arm_compute +{ +namespace opencl +{ +using namespace arm_compute::misc::shape_calculator; +using namespace arm_compute::cl_gemm; +using namespace arm_compute::opencl::kernels; +using namespace arm_compute::experimental; + +namespace +{ +inline bool validate_gemm_kernel(CLGEMMKernelType kernel_type) +{ + switch (kernel_type) + { + case CLGEMMKernelType::NATIVE: + case CLGEMMKernelType::RESHAPED_ONLY_RHS: + case CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL: + { + return true; + } + default: + { + return false; + } + } +} + +//Automatically select between mlgo (prioritized) and default heuristics for gemm kernel type +inline CLGEMMKernelType auto_select_gemm_kernel(auto_heuristics::CommonQuery query, bool reshape_b_only_on_first_run) +{ + auto gemm_kernel = auto_heuristics::select_mlgo_gemm_kernel(query, reshape_b_only_on_first_run); + if (bool(gemm_kernel)) + { + if (validate_gemm_kernel(gemm_kernel.gemm_type)) + { + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from mlgo heuristics: %s.", + to_string(gemm_kernel.gemm_type).c_str()); + return gemm_kernel.gemm_type; + } + } + gemm_kernel = auto_heuristics::select_default_gemm_kernel(query, reshape_b_only_on_first_run); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from default heuristics: %s.", + to_string(gemm_kernel.gemm_type).c_str()); + return gemm_kernel.gemm_type; +} + +// Validate lhs_info and rhs_info for native kernel +inline bool validate_lhs_rhs_info_native(const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const ITensorInfo *a, + const ITensorInfo *b, + const GEMMReshapeInfo &reshape_info) +{ + // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel + TensorInfo mm_result_s32_info{}; + // Output tensor auto initialization if not yet initialized + auto_init_if_empty( + mm_result_s32_info, + a->clone()->set_tensor_shape(compute_mm_shape(*a, *b, false, reshape_info)).set_data_type(DataType::S32)); + // Validate mm kernel + // NOTE: Ignore all other parameters (eg. output stage etc.) and only validate lhs and rhs info + // NOTE: This assumes: + // 1. lhs and rhs info's validity does not depend on these other parameters and vice versa(in CLGEMMLowpMatrixMultiplyNativeKernel.cpp validate_arguments). + // 2. lhs and rhs info does not cause window and padding issues through side effects (in CLGEMMLowpMatrixMultiplyNativeKernel.cpp validate_and_configure_window). 
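// The dry-run validation below can run against an S32 destination because every
// GEMMLowp kernel multiplies 8-bit operands and accumulates the products in 32-bit
// integers before any output stage executes. A hedged plain-C++ illustration of that
// accumulator arithmetic (dot_qasymm8 is an illustrative name, not the OpenCL kernel):
#include <cstddef>
#include <cstdint>
#include <vector>

int32_t dot_qasymm8(const std::vector<uint8_t> &a, const std::vector<uint8_t> &b)
{
    int32_t acc = 0; // 255 * 255 * k stays well inside int32 for practical depths k
    for (std::size_t i = 0; i < a.size() && i < b.size(); ++i)
        acc += static_cast<int32_t>(a[i]) * static_cast<int32_t>(b[i]);
    return acc; // requantized to QASYMM8/QASYMM8_SIGNED later by the output stage
}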
+ if (!bool(ClGemmLowpMatrixMultiplyNativeKernel::validate(a, b, &mm_result_s32_info, lhs_info, rhs_info, + reshape_info))) + { + return false; + } + return true; +} + +// Automatically select between mlgo (prioritized) and default heuristics for native kernel configs +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_native(auto_heuristics::CommonQuery query, + const ITensorInfo *a, + const ITensorInfo *b, + const GEMMReshapeInfo &reshape_info) +{ + auto config = auto_heuristics::select_mlgo_gemm_config_native(query); + if (config) + { + if (validate_lhs_rhs_info_native(config.lhs_info, config.rhs_info, a, b, reshape_info)) + { + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "Use native config from mlgo heuristics: LHS info: %s ; RHS info: %s ", + to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; + } + } + config = auto_heuristics::select_default_gemm_config_native(query); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use native config from default heuristics: LHS info: %s ; RHS info: %s ", + to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; +} + +// Validate lhs_info and rhs_info for reshaped only rhs kernel +inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *output, + unsigned int m, + unsigned int n, + unsigned int k, + bool reinterpret_input_as_3d, + int depth_output_gemm3d) +{ + // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel + TensorInfo tmp_b_info{}; + // Validate reshape RHS kernel + auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); + if (!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) + { + return false; + } + // Validate mm kernel + // NOTE: Ignore all other parameters (eg. depth_output_gemm3d, output stage etc.) and only validate lhs and rhs info + // NOTE: This assumes: + // 1. lhs and rhs info's validity does not depend on these other parameters and vice versa(in ClGemmLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp validate_arguments). + // 2. lhs and rhs info does not cause window and padding issues through side effects (in ClGemmLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp validate_and_configure_window). 
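// The a_offset/b_offset fields carried through GEMMKernelInfo below exist because
// quantized GEMM runs on the raw integer values and is corrected afterwards:
// (A - a_off)(B - b_off) = A*B - a_off*colsum(B) - b_off*rowsum(A) + k*a_off*b_off,
// which is what the vector_sum_col/vector_sum_row reductions and the offset
// contribution kernels later in this file implement. A self-contained check of the
// identity (a plain C++ sketch, not the ClGemmLowpOffsetContribution* kernels):
#include <cassert>
#include <cstdint>

int main()
{
    const int32_t a_off = 3, b_off = -2; // quantization zero points
    const int32_t A[2][2] = {{10, 20}, {30, 40}};
    const int32_t B[2][2] = {{1, 2}, {3, 4}};
    const int32_t k = 2; // inner (accumulation) dimension
    for (int i = 0; i < 2; ++i)
    {
        for (int j = 0; j < 2; ++j)
        {
            int32_t real = 0, raw = 0, rowsum_a = 0, colsum_b = 0;
            for (int t = 0; t < k; ++t)
            {
                real += (A[i][t] - a_off) * (B[t][j] - b_off);
                raw += A[i][t] * B[t][j];
                rowsum_a += A[i][t];
                colsum_b += B[t][j];
            }
            // The raw product plus the two reduction corrections matches the
            // offset-free product exactly
            assert(real == raw - a_off * colsum_b - b_off * rowsum_a + k * a_off * b_off);
        }
    }
    return 0;
}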
+ GEMMKernelInfo gemm_kernel_info; + gemm_kernel_info.m = m; + gemm_kernel_info.n = n; + gemm_kernel_info.k = k; + gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d; + gemm_kernel_info.depth_output_gemm3d = depth_output_gemm3d; + gemm_kernel_info.lhs_info = lhs_info; + gemm_kernel_info.rhs_info = rhs_info; + // Since we ignore the output stage, output data type has to be S32 to pass the validation + TensorInfo output_info_copy(*output); + output_info_copy.set_data_type(DataType::S32); + if (!bool(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, &output_info_copy, + gemm_kernel_info))) + { + return false; + } + return true; +} + +// Validate lhs_info and rhs_info for reshaped only rhs kernel +inline bool validate_lhs_rhs_info_reshaped_only_rhs_mmul(const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *output, + unsigned int m, + unsigned int n, + unsigned int k, + bool reinterpret_input_as_3d, + int depth_output_gemm3d) +{ + // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel + TensorInfo tmp_b_info{}; + // Validate reshape RHS kernel + auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); + if (!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) + { + return false; + } + // Validate mm kernel + // NOTE: Ignore all other parameters (eg. depth_output_gemm3d, output stage etc.) and only validate lhs and rhs info + // NOTE: This assumes: + // 1. lhs and rhs info's validity does not depend on these other parameters and vice versa(in ClGemmLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp validate_arguments). + // 2. lhs and rhs info does not cause window and padding issues through side effects (in ClGemmLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp validate_and_configure_window). 
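// The output stage that these dry-runs deliberately ignore is the fixed-point
// requantization applied to the corrected S32 result: the scale M = s_a * s_b / s_dst
// is pre-decomposed into a Q31 integer multiplier and a shift, which is what the
// gemmlowp_multipliers/gemmlowp_shifts vectors hold. A sketch of that decomposition
// for the usual case M < 1, mirroring the public gemmlowp scheme; names are
// illustrative and saturation corner cases are omitted:
#include <cmath>
#include <cstdint>

void decompose_scale(double m, int32_t &multiplier, int32_t &right_shift)
{
    int exponent = 0;
    const double q = std::frexp(m, &exponent); // m = q * 2^exponent, q in [0.5, 1)
    int64_t q_fixed = std::llround(q * (1ll << 31));
    if (q_fixed == (1ll << 31)) // q rounded up to 1.0: renormalize
    {
        q_fixed /= 2;
        ++exponent;
    }
    multiplier = static_cast<int32_t>(q_fixed);
    right_shift = -exponent; // positive when m < 1
}

int32_t requantize(int32_t acc, int32_t multiplier, int32_t right_shift, int32_t dst_offset)
{
    // Rounding doubling high multiply: (acc * multiplier + 2^30) >> 31
    const int64_t prod = static_cast<int64_t>(acc) * multiplier;
    int32_t res = static_cast<int32_t>((prod + (1ll << 30)) >> 31);
    if (right_shift > 0)
        res = (res + (1 << (right_shift - 1))) >> right_shift; // round to nearest
    return res + dst_offset; // caller clamps to the quantized type's range
}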
+ GEMMKernelInfo gemm_kernel_info; + gemm_kernel_info.m = m; + gemm_kernel_info.n = n; + gemm_kernel_info.k = k; + gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d; + gemm_kernel_info.depth_output_gemm3d = depth_output_gemm3d; + gemm_kernel_info.lhs_info = lhs_info; + gemm_kernel_info.rhs_info = rhs_info; + // Since we ignore the output stage, output data type has to be S32 to pass the validation + TensorInfo output_info_copy(*output); + output_info_copy.set_data_type(DataType::S32); + if (!bool(ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(a, &tmp_b_info, &output_info_copy, + gemm_kernel_info))) + { + return false; + } + return true; +} + +// Automatically select between mlgo (prioritized) and default heuristics for reshaped only rhs kernel configs +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query, + bool reinterpret_input_as_3d, + int depth_output_gemm3d, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *output) +{ + auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(query); + if (config) + { + if (validate_lhs_rhs_info_reshaped_only_rhs(config.lhs_info, config.rhs_info, a, b, output, query.m, query.n, + query.k, reinterpret_input_as_3d, depth_output_gemm3d)) + { + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "Use reshaped_only_rhs config from mlgo heuristics: LHS info: %s ; RHS info: %s ", + to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; + } + } + config = auto_heuristics::select_default_gemm_config_reshaped_only_rhs(query); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "Use reshaped_only_rhs config from default heuristics: LHS info: %s ; RHS info: %s ", + to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; +} + +// Automatically select between mlgo (prioritized) and default heuristics for reshaped only rhs kernel configs +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +auto_select_gemm_config_reshaped_only_rhs_mmul(auto_heuristics::CommonQuery query, + bool reinterpret_input_as_3d, + int depth_output_gemm3d, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *output) +{ + ARM_COMPUTE_UNUSED(a, b, output, reinterpret_input_as_3d, depth_output_gemm3d); + auto config = auto_heuristics::select_default_gemm_config_reshaped_only_rhs(query); + validate_lhs_rhs_info_reshaped_only_rhs_mmul(config.lhs_info, config.rhs_info, a, b, output, query.m, query.n, + query.k, reinterpret_input_as_3d, depth_output_gemm3d); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "Use reshaped_only_rhs_mmul config from default heuristics: LHS info: %s ; RHS info: %s ", + to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; +} + +inline bool is_gemm_reshaped(CLGEMMKernelType kernel_type) +{ + switch (kernel_type) + { + case CLGEMMKernelType::NATIVE: + return false; + case CLGEMMKernelType::RESHAPED_ONLY_RHS: + case CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL: + return true; + default: + ARM_COMPUTE_ERROR("Not supported gemmlowp kernel!"); + } +} +} // namespace + +ClGemmLowpMatrixMultiplyCore::ClGemmLowpMatrixMultiplyCore() + : _weights_to_qasymm8(std::make_unique<ClCastKernel>()), + _mm_native_kernel(std::make_unique<ClGemmLowpMatrixMultiplyNativeKernel>()), + _mm_reshaped_only_rhs_kernel(std::make_unique<ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel>()), 
+ _mm_reshaped_only_rhs_mmul_kernel(std::make_unique<ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel>()), + _mtx_b_reshape_kernel(std::make_unique<ClGemmReshapeRhsMatrixKernel>()), + _mtx_a_reduction_kernel(std::make_unique<ClGemmLowpMatrixAReductionKernel>()), + _mtx_b_reduction_kernel(std::make_unique<ClGemmLowpMatrixBReductionKernel>()), + _offset_contribution_kernel(std::make_unique<ClGemmLowpOffsetContributionKernel>()), + _offset_contribution_output_stage_kernel(std::make_unique<ClGemmLowpOffsetContributionOutputStageKernel>()), + _aux_mem(AuxTensorIdx::Count) +{ +} + +ClGemmLowpMatrixMultiplyCore::~ClGemmLowpMatrixMultiplyCore() = default; + +void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + const GEMMInfo &gemm_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); + ARM_COMPUTE_ERROR_THROW_ON(ClGemmLowpMatrixMultiplyCore::validate(a, b, c, output, gemm_info)); + ARM_COMPUTE_LOG_PARAMS(a, b, c, output, gemm_info); + + _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run(); + _a_offset = a->quantization_info().uniform().offset; + _convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) && + is_data_type_quantized_symmetric(b->data_type()) && a->data_type() == DataType::QASYMM8; + _b_offset = _convert_to_qasymm8 ? -128 : b->quantization_info().uniform().offset; + _gemm_info = gemm_info; + + // Get the GPU target + const GPUTarget gpu_target = CLScheduler::get().target(); + + // Set the target for the kernels + _mm_native_kernel->set_target(gpu_target); + _mm_reshaped_only_rhs_kernel->set_target(gpu_target); + _mm_reshaped_only_rhs_mmul_kernel->set_target(gpu_target); + + GEMMRHSMatrixInfo rhs_info; + GEMMLHSMatrixInfo lhs_info; + + // Arguments used by GEMMReshapeInfo + // in order to know how the matrices have been reshaped + bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + + const auto reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d); + + _gemm_kernel_type = auto_select_gemm_kernel( + auto_heuristics::CommonQuery{gpu_target, a->data_type(), m, n, k, batch_size}, _reshape_b_only_on_first_run); + + if (_convert_to_qasymm8) + { + // Set data type for converted weights + _qasymm8_weights = *b; + _qasymm8_weights.set_data_type(DataType::QASYMM8); + _weights_to_qasymm8->configure(compile_context, b, &_qasymm8_weights, ConvertPolicy::WRAP); + } + + ITensorInfo *matrix_b = _convert_to_qasymm8 ? &_qasymm8_weights : b; + if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS) + { + matrix_b = &_tmp_b; + + // Pick up the GEMM configuration + // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration + std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs( + auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}, reinterpret_input_as_3d, + depth_output_gemm3d, a, _convert_to_qasymm8 ? 
&_qasymm8_weights : b, output); + + // Configure reshape RHS kernel + _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, + rhs_info); + } + if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL) + { + matrix_b = &_tmp_b; + + // Pick up the GEMM configuration + // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration + std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs_mmul( + auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}, reinterpret_input_as_3d, + depth_output_gemm3d, a, _convert_to_qasymm8 ? &_qasymm8_weights : b, output); + + // Configure reshape RHS kernel + _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, + rhs_info); + } + + // Using default reduction info + const GEMMLowpReductionKernelInfo reduction_info{}; + + // Initialize matrix B reduction kernel only if _a_offset is not equal to 0 + if (_a_offset != 0) + { + _vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32); + + // Configure Matrix B reduction kernel + _mtx_b_reduction_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, + &_vector_sum_col, reduction_info); + } + + // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0 + if (_b_offset != 0) + { + _vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32); + + // Configure matrix A reduction kernel + _mtx_a_reduction_kernel->configure(compile_context, a, &_vector_sum_row, reduction_info); + } + + GEMMKernelInfo gemm_kernel_info; + gemm_kernel_info.m = m; + gemm_kernel_info.n = n; + gemm_kernel_info.k = k; + gemm_kernel_info.depth_output_gemm3d = depth_output_gemm3d; + gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d; + gemm_kernel_info.lhs_info = lhs_info; + gemm_kernel_info.rhs_info = rhs_info; + gemm_kernel_info.a_offset = _a_offset; + gemm_kernel_info.b_offset = _b_offset; + // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage + if (gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) + { + // Configure offset contribution kernel + const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) + ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() + : 1; + + _gemm_output_stage_multipliers = TensorInfo(TensorShape(num_filters), 1, DataType::S32); + _gemm_output_stage_shifts = TensorInfo(TensorShape(num_filters), 1, DataType::S32); + + GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage(); + gemmlowp_output_stage.output_data_type = a->data_type(); + if (num_filters == 1) + { + // Per-channel quantization with OFM == 1 is equivalent to uniform quantization. + // Setting this flag to false prevents the kernel from adding useless padding to the output multipliers and shifts + gemmlowp_output_stage.is_quantized_per_channel = false; + } + + gemm_kernel_info.output_stage = gemmlowp_output_stage; + + if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS && + gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + { + // Configure and tune matrix multiply kernel with fused output stage + _mm_reshaped_only_rhs_kernel->configure( + compile_context, a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col, + _b_offset == 0 ? 
nullptr : &_vector_sum_row, c,
+                &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
+        }
+        else if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL &&
+                 gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+        {
+            // Configure and tune matrix multiply kernel with fused output stage
+            _mm_reshaped_only_rhs_mmul_kernel->configure(
+                compile_context, a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col,
+                _b_offset == 0 ? nullptr : &_vector_sum_row, c,
+                &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
+        }
+        else
+        {
+            _run_output_stage = true;
+
+            if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS)
+            {
+                _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32,
+                                                        gemm_kernel_info);
+            }
+            else if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL)
+            {
+                _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32,
+                                                             gemm_kernel_info);
+            }
+            else
+            {
+                // Pick up the GEMM configuration
+                // It doesn't matter whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration
+                std::tie(lhs_info, rhs_info) = auto_select_gemm_config_native(
+                    auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}, a,
+                    _convert_to_qasymm8 ? &_qasymm8_weights : matrix_b, reshape_info);
+
+                // Configure matrix multiply kernel
+                _mm_native_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, lhs_info, rhs_info,
+                                             reshape_info);
+            }
+
+            // Configure the offset contribution/output stage kernel; it consumes the S32 result
+            // produced by whichever unfused matrix multiply kernel was configured above
+            _offset_contribution_output_stage_kernel->configure(
+                compile_context, &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col,
+                _b_offset == 0 ? nullptr : &_vector_sum_row, c, output, a->dimension(0),
+                _a_offset, _b_offset, gemmlowp_output_stage, &_gemm_output_stage_multipliers,
+                &_gemm_output_stage_shifts);
+        }
+    }
+    else
+    {
+        _run_offset_contribution = true;
+        if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS)
+        {
+            // Configure and tune matrix multiply kernel
+            _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info);
+        }
+        else if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL)
+        {
+            // Configure and tune matrix multiply kernel
+            _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info);
+        }
+        else
+        {
+            // Pick up the GEMM configuration
+            // It doesn't matter whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration
+            std::tie(lhs_info, rhs_info) = auto_select_gemm_config_native(
+                auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}, a,
+                _convert_to_qasymm8 ? &_qasymm8_weights : b, reshape_info);
+
+            // Configure matrix multiply kernel
+            _mm_native_kernel->configure(compile_context, a, matrix_b, output, lhs_info, rhs_info, reshape_info);
+        }
+
+        // Configure offset contribution kernel
+        _offset_contribution_kernel->configure(compile_context, output, _a_offset == 0 ? nullptr : &_vector_sum_col,
+                                               _b_offset == 0 ? nullptr : &_vector_sum_row, c,
+                                               a->dimension(0), _a_offset, _b_offset);
+    }
+
+    // Request memory
+    _aux_mem[RhsQAsymm8] =
+        MemoryInfo(offset_int_vec(RhsQAsymm8),
+                   _reshape_b_only_on_first_run ? 
MemoryLifetime::Persistent : MemoryLifetime::Temporary, + _qasymm8_weights.total_size()); + if (is_gemm_reshaped(_gemm_kernel_type)) + { + // Overwrite Rhs as prepare if gemm is reshaped as there will be a two-step transformation + _aux_mem[RhsQAsymm8] = + MemoryInfo(offset_int_vec(RhsQAsymm8), + _reshape_b_only_on_first_run ? MemoryLifetime::Prepare : MemoryLifetime::Temporary, + _qasymm8_weights.total_size()); + _aux_mem[RhsReshape] = MemoryInfo( + offset_int_vec(RhsReshape), + _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); + } + if (_a_offset != 0) + { + _aux_mem[VecSumCol] = + MemoryInfo(offset_int_vec(VecSumCol), + _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, + _vector_sum_col.total_size()); + } + if (_b_offset != 0) + { + _aux_mem[VecSumRow] = + MemoryInfo(offset_int_vec(VecSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size()); + } + _aux_mem[ResultS32] = MemoryInfo(offset_int_vec(ResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size()); + _aux_mem[Multipliers] = MemoryInfo(offset_int_vec(Multipliers), MemoryLifetime::Persistent, + _gemm_output_stage_multipliers.total_size()); + _aux_mem[Shifts] = + MemoryInfo(offset_int_vec(Shifts), MemoryLifetime::Persistent, _gemm_output_stage_shifts.total_size()); +} + +Status ClGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + const GEMMInfo &gemm_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON(a->data_type() == DataType::QASYMM8 && b->data_type() == DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON(a->data_type() == DataType::QASYMM8_SIGNED && b->data_type() == DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported"); + + int32_t a_offset = a->quantization_info().uniform().offset; + int32_t b_offset = b->quantization_info().uniform().offset; + + const ITensorInfo *matrix_a_info = a; + + TensorInfo tmp_b_info{}; + GEMMRHSMatrixInfo rhs_info; + GEMMLHSMatrixInfo lhs_info; + + // Get the GPU target + const GPUTarget gpu_target = CLScheduler::get().target(); + + bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + + bool reshape_matrix_b = is_gemm_reshaped( + auto_select_gemm_kernel(auto_heuristics::CommonQuery{gpu_target, a->data_type(), m, n, k, batch_size}, + gemm_info.reshape_b_only_on_first_run())); + + const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d); + + bool convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) && + is_data_type_quantized_symmetric(b->data_type()) && + is_data_type_quantized_asymmetric(a->data_type()); + TensorInfo weights_info(*b); + if (convert_to_qasymm8) + { + b_offset = -128; + weights_info.set_data_type(DataType::QASYMM8); + ARM_COMPUTE_RETURN_ON_ERROR(ClCastKernel::validate(b, &weights_info, ConvertPolicy::WRAP)); + } + const ITensorInfo *matrix_b_info = &weights_info; + if (reshape_matrix_b) + { + matrix_b_info = &tmp_b_info; + + // Pick up the GEMM configuration + // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails + // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration + const auto res = select_default_gemm_config_reshaped_only_rhs( + auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}); + lhs_info = res.lhs_info; + rhs_info = res.rhs_info; + + // Validate reshape RHS kernel + auto_init_if_empty(tmp_b_info, + weights_info.clone()->set_tensor_shape(compute_rhs_reshaped_shape(weights_info, rhs_info))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(&weights_info, &tmp_b_info, rhs_info)); + } + + TensorInfo info_vector_sum_col{}; + TensorInfo info_vector_sum_row{}; + + const GEMMLowpReductionKernelInfo reduction_info; + // Validate matrix B reduction kernel only if _a_offset is not equal to 0 + if (a_offset != 0) + { + info_vector_sum_col = TensorInfo(compute_reductionA_shape(weights_info), 1, DataType::S32); + + // Configure Matrix B reduction kernel + ARM_COMPUTE_RETURN_ON_ERROR( + ClGemmLowpMatrixBReductionKernel::validate(&weights_info, &info_vector_sum_col, reduction_info)); + } + + // Validate Matrix A reduction kernel only if _b_offset is not equal to 0 + if (b_offset != 0) + { + info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32); + + // Configure matrix A reduction kernel + ARM_COMPUTE_RETURN_ON_ERROR( + ClGemmLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, reduction_info)); + } + + GEMMKernelInfo gemm_kernel_info; + gemm_kernel_info.m = m; + gemm_kernel_info.n = n; + gemm_kernel_info.k = k; + gemm_kernel_info.depth_output_gemm3d = depth_output_gemm3d; + gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d; + gemm_kernel_info.lhs_info = lhs_info; + gemm_kernel_info.rhs_info = rhs_info; + gemm_kernel_info.a_offset = a_offset; + gemm_kernel_info.b_offset = b_offset; + if (gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) + { + const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) + ? 
gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size()
+                                       : 1;
+
+        const TensorInfo gemm_output_stage_multipliers_shifts_info(
+            TensorInfo(TensorShape(num_filters), 1, DataType::S32));
+
+        GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage();
+        gemmlowp_output_stage.output_data_type = a->data_type();
+
+        gemm_kernel_info.output_stage = gemmlowp_output_stage;
+        if (reshape_matrix_b &&
+            gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(
+                matrix_a_info, matrix_b_info, output, gemm_kernel_info, a_offset == 0 ? nullptr : &info_vector_sum_col,
+                b_offset == 0 ? nullptr : &info_vector_sum_row, c, &gemm_output_stage_multipliers_shifts_info,
+                &gemm_output_stage_multipliers_shifts_info));
+        }
+        else
+        {
+            TensorInfo mm_result_s32_info{};
+
+            if (reshape_matrix_b)
+            {
+                // Output tensor auto initialization if not yet initialized
+                auto_init_if_empty(mm_result_s32_info, a->clone()
+                                                           ->set_tensor_shape(compute_mm_shape(
+                                                               *matrix_a_info, *matrix_b_info, reshape_info))
+                                                           .set_data_type(DataType::S32));
+
+                // Validate matrix multiply
+                ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(
+                    matrix_a_info, matrix_b_info, &mm_result_s32_info, gemm_kernel_info));
+            }
+            else
+            {
+                // Output tensor auto initialization if not yet initialized
+                auto_init_if_empty(mm_result_s32_info, a->clone()
+                                                           ->set_tensor_shape(compute_mm_shape(
+                                                               *matrix_a_info, *matrix_b_info, false, reshape_info))
+                                                           .set_data_type(DataType::S32));
+
+                // Pick up the GEMM configuration
+                // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails
+                // It doesn't matter whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration
+                const auto res = select_default_gemm_config_native(
+                    auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size});
+                lhs_info = res.lhs_info;
+                rhs_info = res.rhs_info;
+
+                // Validate matrix multiply
+                ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate(
+                    matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info));
+            }
+
+            // Validate offset contribution kernel
+            ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionOutputStageKernel::validate(
+                &mm_result_s32_info, a_offset == 0 ? nullptr : &info_vector_sum_col,
+                b_offset == 0 ? 
nullptr : &info_vector_sum_row, c, output, a_offset, b_offset, gemmlowp_output_stage, + &gemm_output_stage_multipliers_shifts_info, &gemm_output_stage_multipliers_shifts_info)); + } + } + else + { + if (reshape_matrix_b) + { + // Validate matrix multiply + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate( + matrix_a_info, matrix_b_info, output, gemm_kernel_info)); + } + else + { + // Pick up the GEMM configuration + // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration + const auto res = select_default_gemm_config_native( + auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}); + lhs_info = res.lhs_info; + rhs_info = res.rhs_info; + + // Validate matrix multiply + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate( + matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info)); + } + + if (output->total_size() != 0) + { + // Validate offset contribution kernel + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionKernel::validate( + output, a_offset == 0 ? nullptr : &info_vector_sum_col, b_offset == 0 ? nullptr : &info_vector_sum_row, + c, a_offset, b_offset)); + } + } + + return Status{}; +} + +void ClGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors) +{ + const ITensor *a = tensors.get_const_tensor(ACL_SRC_0); + const ITensor *b = tensors.get_const_tensor(ACL_SRC_1); + const ITensor *c = tensors.get_const_tensor(ACL_SRC_2); + ITensor *dst = tensors.get_tensor(ACL_DST); + + ARM_COMPUTE_ERROR_ON_NULLPTR(a, dst); + + CLAuxTensorHandler vec_sum_col(offset_int_vec(VecSumCol), _vector_sum_col, tensors, true); + CLAuxTensorHandler vec_sum_row(offset_int_vec(VecSumRow), _vector_sum_row, tensors, true); + CLAuxTensorHandler rhs_qasymm8(offset_int_vec(RhsQAsymm8), _qasymm8_weights, tensors, true); + CLAuxTensorHandler tmp_b(offset_int_vec(RhsReshape), _tmp_b, tensors, true); + CLAuxTensorHandler res32(offset_int_vec(ResultS32), _mm_result_s32, tensors, true); + CLAuxTensorHandler shifts(offset_int_vec(Shifts), _gemm_output_stage_shifts, tensors, true); + CLAuxTensorHandler multipliers(offset_int_vec(Multipliers), _gemm_output_stage_multipliers, tensors, true); + + // Prepare the consts if needed + prepare(tensors); + + const ITensor *matrix_a = a; + const ITensor *matrix_b = _convert_to_qasymm8 ? rhs_qasymm8.get() : b; + + if (is_gemm_reshaped(_gemm_kernel_type)) + { + matrix_b = tmp_b.get(); + if (!_reshape_b_only_on_first_run) + { + // Run reshape matrix B + ITensorPack mtx_b_reshape_pack = {{TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b}, + {TensorType::ACL_DST, tmp_b.get()}}; + CLScheduler::get().enqueue_op(*_mtx_b_reshape_kernel, mtx_b_reshape_pack, false); + } + } + + // Run matrix B reduction kernel only if _a_offset is not equal to 0 + if (_a_offset != 0 && !_reshape_b_only_on_first_run) + { + ITensorPack mtx_b_red_pack = {{TensorType::ACL_SRC, _convert_to_qasymm8 ? 
rhs_qasymm8.get() : b}, + {TensorType::ACL_DST, vec_sum_col.get()}}; + CLScheduler::get().enqueue_op(*_mtx_b_reduction_kernel, mtx_b_red_pack, false); + } + + // Run matrix A reduction kernel only if _b_offset is not equal to 0 + if (_b_offset != 0) + { + ITensorPack mtx_a_red_pack = {{TensorType::ACL_SRC, matrix_a}, {TensorType::ACL_DST, vec_sum_row.get()}}; + CLScheduler::get().enqueue_op(*_mtx_a_reduction_kernel, mtx_a_red_pack, false); + } + + // Run matrix multiply + if (is_gemm_reshaped(_gemm_kernel_type)) + { + ITensorPack gemm_reshaped_pack; + if (_run_offset_contribution) + { + gemm_reshaped_pack = ITensorPack({{TensorType::ACL_SRC_0, matrix_a}, + {TensorType::ACL_SRC_1, matrix_b}, + {TensorType::ACL_DST, _run_output_stage ? res32.get() : dst}}); + } + else + { + gemm_reshaped_pack = ITensorPack({ + {TensorType::ACL_SRC, matrix_a}, + {TensorType::ACL_SRC_1, matrix_b}, + {TensorType::ACL_BIAS, c}, + {TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get()}, + {TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get()}, + {TensorType::ACL_SHIFTS, shifts.get()}, + {TensorType::ACL_MULTIPLIERS, multipliers.get()}, + {TensorType::ACL_DST, dst}, + }); + } + if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS) + { + CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_kernel, gemm_reshaped_pack, false); + } + else if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL) + { + CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_mmul_kernel, gemm_reshaped_pack, false); + } + else + { + ARM_COMPUTE_ERROR("Invalid reshaped kernel"); + } + } + else + { + ITensorPack gemm_native_pack = {{TensorType::ACL_SRC_0, matrix_a}, + {TensorType::ACL_SRC_1, matrix_b}, + {TensorType::ACL_DST, _run_offset_contribution ? dst : res32.get()}}; + CLScheduler::get().enqueue_op(*_mm_native_kernel, gemm_native_pack, false); + } + if (_run_output_stage) + { + // Run offset contribution/output stage kernel + ITensorPack output_stage_pack = { + {TensorType::ACL_SRC, res32.get()}, + {TensorType::ACL_BIAS, c}, + {TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get()}, + {TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get()}, + {TensorType::ACL_SHIFTS, shifts.get()}, + {TensorType::ACL_MULTIPLIERS, multipliers.get()}, + {TensorType::ACL_DST, dst}, + }; + CLScheduler::get().enqueue_op(*_offset_contribution_output_stage_kernel, output_stage_pack, true); + } + if (_run_offset_contribution) + { + // Run offset contribution kernel + ITensorPack offset_contrib_pack = {{TensorType::ACL_SRC_DST, dst}, + {TensorType::ACL_BIAS, c}, + {TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get()}, + {TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? 
nullptr : vec_sum_col.get()}}; + CLScheduler::get().enqueue_op(*_offset_contribution_kernel, offset_contrib_pack, true); + } +} + +void ClGemmLowpMatrixMultiplyCore::prepare(ITensorPack &tensors) +{ + if (!_is_prepared) + { + auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1); + CLAuxTensorHandler tmp_b(offset_int_vec(RhsReshape), _tmp_b, tensors, true); + CLAuxTensorHandler vec_sum_col(offset_int_vec(VecSumCol), _vector_sum_col, tensors, true); + CLAuxTensorHandler rhs_qasymm8(offset_int_vec(RhsQAsymm8), _qasymm8_weights, tensors, false); + + ARM_COMPUTE_ERROR_ON_NULLPTR(b); + + if (_convert_to_qasymm8) + { + ITensorPack convert_to_qs8_pack = {{ACL_SRC, b}, {ACL_DST, rhs_qasymm8.get()}}; + CLScheduler::get().enqueue_op(*_weights_to_qasymm8, convert_to_qs8_pack, false); + b->mark_as_unused(); + } + + if (is_gemm_reshaped(_gemm_kernel_type) && _reshape_b_only_on_first_run) + { + // Run reshape kernel and mark original weights tensor as unused + ITensorPack mtx_b_pack = {{TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b}, + {TensorType::ACL_DST, tmp_b.get()}}; + CLScheduler::get().enqueue_op(*_mtx_b_reshape_kernel, mtx_b_pack, false); + b->mark_as_unused(); + } + + // Run matrix B reduction kernel only if _a_offset is not equal to 0 + if (_a_offset != 0 && _reshape_b_only_on_first_run) + { + ITensorPack mtx_b_red_pack = {{TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b}, + {TensorType::ACL_DST, vec_sum_col.get()}}; + CLScheduler::get().enqueue_op(*_mtx_b_reduction_kernel, mtx_b_red_pack, false); + } + + // Compute GEMM output multipliers and shifts for output stage + { + const size_t num_filters = (_gemm_info.gemmlowp_output_stage().is_quantized_per_channel) + ? _gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() + : 1; + + CLAuxTensorHandler multipliers(offset_int_vec(Multipliers), _gemm_output_stage_multipliers, tensors, false); + CLAuxTensorHandler shifts(offset_int_vec(Shifts), _gemm_output_stage_shifts, tensors, false); + + ICLTensor *multiplier_tensor = multipliers.get(); + if (multiplier_tensor != nullptr && multiplier_tensor->info()->total_size() > 0) + { + multiplier_tensor->map(CLScheduler::get().queue(), true); + std::memcpy(multiplier_tensor->ptr_to_element(Coordinates(0)), + _gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.data(), + num_filters * sizeof(int32_t)); + multiplier_tensor->unmap(CLScheduler::get().queue()); + } + + ICLTensor *shifts_tensor = shifts.get(); + if (shifts.get() != nullptr && shifts_tensor->info()->total_size() > 0) + { + shifts_tensor->map(CLScheduler::get().queue(), true); + std::memcpy(shifts_tensor->ptr_to_element(Coordinates(0)), + _gemm_info.gemmlowp_output_stage().gemmlowp_shifts.data(), num_filters * sizeof(int32_t)); + shifts_tensor->unmap(CLScheduler::get().queue()); + } + } + CLScheduler::get().queue().finish(); + _is_prepared = true; + } +} + +experimental::MemoryRequirements ClGemmLowpMatrixMultiplyCore::workspace() const +{ + return _aux_mem; +} +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h new file mode 100644 index 0000000000..c80dc3a182 --- /dev/null +++ b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2017-2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_CORE_H +#define ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_CORE_H + +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/function_info/GEMMInfo.h" +#include "arm_compute/runtime/CL/CLTypes.h" + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClOperator.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +// Forward declarations +class ClCastKernel; +class ClGemmLowpMatrixMultiplyNativeKernel; +class ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel; +class ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel; +class ClGemmReshapeRhsMatrixKernel; +class ClGemmLowpMatrixAReductionKernel; +class ClGemmLowpMatrixBReductionKernel; +class ClGemmLowpOffsetContributionKernel; +class ClGemmLowpOffsetContributionOutputStageKernel; +} // namespace kernels + +/** Basic function to execute GEMMLowpMatrixMultiplyCore on OpenCL. */ +class ClGemmLowpMatrixMultiplyCore : public IClOperator +{ +public: + ClGemmLowpMatrixMultiplyCore(); + ~ClGemmLowpMatrixMultiplyCore(); + /** Initialise the kernel's inputs, output + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:------------------|:--------|:--------------| + * |QASYMM8 |QASYMM8 |S32 |QASYMM8 | + * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 | + * |QASYMM8 |QSYMM8 |S32 |QASYMM8 | + * |QASYMM8 |QASYMM8 |S32 |S32 | + * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |S32 | + * |QASYMM8 |QSYMM8 |S32 |S32 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | + * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED | + * |QASYMM8_SIGNED |QSYMM8 |S32 |QASYMM8_SIGNED | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |S32 | + * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |S32 | + * |QASYMM8_SIGNED |QSYMM8 |S32 |S32 | + * + * @note GEMMLowp: low precision GEMM kernel. [A * B + C] + * This kernel performs the following computations: + * + * -# Convert a values from 8-bit quantized to int32 and add a_offset to each of them. + * -# Convert b values from 8-bit quantized to int32 and add b_offset to each of them. + * -# Compute the matrix product of the resulting a * b in int32. + * -# Quantize to uint8 if gemm_info.gemmlowp_output_stage != NONE + * + * @param[in] compile_context The compile context to be used. + * @param[in] a First input tensor (Matrix A). 
Data type supported: QASYMM8/QASYMM8_SIGNED. + * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a + * @param[in] c Third input tensor (Matrix C). It can be a nullptr. Data type supported: S32 + * @param[out] output Output tensor. Data type supported: S32 or QASYMM8/QASYMM8_SIGNED if gemm_info.gemmlowp_output_stage != NONE + * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and + * if the reshape of matrix B should be executed only for the first run + */ + void configure(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + const GEMMInfo &gemm_info = GEMMInfo()); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to ClGemmLowpMatrixMultiplyCore::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + const GEMMInfo &gemm_info = GEMMInfo()); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &constants) override; + experimental::MemoryRequirements workspace() const override; + +private: + enum AuxTensorIdx + { + ResultS32 = 0, + RhsQAsymm8, + RhsReshape, + VecSumCol, + VecSumRow, + Multipliers, + Shifts, + Count + }; + +private: + // Kernels used + std::unique_ptr<kernels::ClCastKernel> _weights_to_qasymm8; + std::unique_ptr<kernels::ClGemmLowpMatrixMultiplyNativeKernel> _mm_native_kernel; + std::unique_ptr<kernels::ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel> _mm_reshaped_only_rhs_kernel; + std::unique_ptr<kernels::ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel> _mm_reshaped_only_rhs_mmul_kernel; + std::unique_ptr<kernels::ClGemmReshapeRhsMatrixKernel> _mtx_b_reshape_kernel; + std::unique_ptr<kernels::ClGemmLowpMatrixAReductionKernel> _mtx_a_reduction_kernel; + std::unique_ptr<kernels::ClGemmLowpMatrixBReductionKernel> _mtx_b_reduction_kernel; + std::unique_ptr<kernels::ClGemmLowpOffsetContributionKernel> _offset_contribution_kernel; + std::unique_ptr<kernels::ClGemmLowpOffsetContributionOutputStageKernel> _offset_contribution_output_stage_kernel; + + // Temporary tensors + TensorInfo _qasymm8_weights{}; + TensorInfo _vector_sum_col{}; + TensorInfo _vector_sum_row{}; + TensorInfo _tmp_b{}; + TensorInfo _mm_result_s32{}; + TensorInfo _gemm_output_stage_multipliers{}; + TensorInfo _gemm_output_stage_shifts{}; + + int32_t _a_offset{0}; + int32_t _b_offset{0}; + bool _reshape_b_only_on_first_run{false}; + bool _run_output_stage{false}; + bool _convert_to_qasymm8{false}; + bool _run_offset_contribution{false}; + bool _is_prepared{false}; + GEMMInfo _gemm_info{}; + CLGEMMKernelType _gemm_kernel_type{}; + + experimental::MemoryRequirements _aux_mem{}; +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_CORE_H */ diff --git a/src/gpu/cl/operators/ClGemmLowpOutputStage.cpp b/src/gpu/cl/operators/ClGemmLowpOutputStage.cpp new file mode 100644 index 0000000000..e3363e3685 --- /dev/null +++ b/src/gpu/cl/operators/ClGemmLowpOutputStage.cpp @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/operators/ClGemmLowpOutputStage.h" + +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h" +#include "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h" +#include "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +void ClGemmLowpOutputStage::configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + const ITensorInfo *bias, + ITensorInfo *dst, + const GEMMLowpOutputStageInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_LOG_PARAMS(src, bias, dst, info); + + switch (info.type) + { + case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT: + { + auto k = std::make_unique<opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel>(); + k->configure(compile_context, src, bias, dst, &info); + _kernel = std::move(k); + break; + } + case GEMMLowpOutputStageType::QUANTIZE_DOWN: + { + auto k = std::make_unique<opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleKernel>(); + k->configure(compile_context, src, bias, dst, &info); + _kernel = std::move(k); + break; + } + case GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT: + { + auto k = std::make_unique<opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFloatKernel>(); + k->configure(compile_context, src, bias, dst, &info); + _kernel = std::move(k); + break; + } + default: + ARM_COMPUTE_ERROR("Unsupported GEMMLowpOutputStage type."); + } +} + +Status ClGemmLowpOutputStage::validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo &info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM16); + + switch (info.type) + { + case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT: + return opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(src, bias, dst, &info); + case GEMMLowpOutputStageType::QUANTIZE_DOWN: + return opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleKernel::validate(src, bias, dst, &info); + case GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT: + return 
opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::validate(src, bias, dst, &info); + default: + return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported GEMMLowpOutputStage type."); + } +} + +void ClGemmLowpOutputStage::run(ITensorPack &tensors) +{ + const ITensor *src = tensors.get_const_tensor(ACL_SRC); + const ITensor *bias = tensors.get_const_tensor(ACL_BIAS); + ITensor *dst = tensors.get_tensor(ACL_DST); + + ITensorPack pack{{ACL_SRC, src}, {ACL_BIAS, bias}, {ACL_DST, dst}}; + CLScheduler::get().enqueue_op(*_kernel, pack, true); +} +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClGemmLowpOutputStage.h b/src/gpu/cl/operators/ClGemmLowpOutputStage.h new file mode 100644 index 0000000000..6357e0200b --- /dev/null +++ b/src/gpu/cl/operators/ClGemmLowpOutputStage.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_GEMMLOWP_OUTPUT_STAGE_H +#define ARM_COMPUTE_CL_GEMMLOWP_OUTPUT_STAGE_H + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClOperator.h" + +/** This file contains all available output stages for GEMMLowp on OpenCL. + * + * In gemmlowp, the "output stage" is the process that takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyCore), + * and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value. + * + * More information about the GEMMLowp output stage can be found at https://github.com/google/gemmlowp/blob/master/doc/output.md + */ + +namespace arm_compute +{ +namespace opencl +{ +/** Basic function to execute GEMMLowpQuantizeDown kernels on CL. + * + * This function calls the following CL kernels: + * + * -# @ref opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleKernel + * -# @ref opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFloatKernel + * -# @ref opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel +*/ +class ClGemmLowpOutputStage : public IClOperator +{ +public: + /** Constructor */ + ClGemmLowpOutputStage() = default; + /** Initialise the kernel's inputs, output + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |dst | + * |:--------------|:-------------|:-------------| + * |S32 |S32 |QASYMM8 | + * |S32 |S32 |QASYMM8_SIGNED| + * |S32 |S32 |QSYMM16 | + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor. 
Data type supported: S32 + * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p src. + * @param[out] dst Destination tensor. Data type supported: QASYMM8/QASYMM8_SIGNED + * @param[in] info GEMMLowp output stage metadata. + */ + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + const ITensorInfo *bias, + ITensorInfo *dst, + const GEMMLowpOutputStageInfo &info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to ClGemmLowpOutputStage::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo &info); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_GEMMLOWP_OUTPUT_STAGE_H */ diff --git a/src/gpu/cl/operators/ClIndirectConv2d.cpp b/src/gpu/cl/operators/ClIndirectConv2d.cpp new file mode 100644 index 0000000000..777fc9e5e1 --- /dev/null +++ b/src/gpu/cl/operators/ClIndirectConv2d.cpp @@ -0,0 +1,159 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/gpu/cl/operators/ClIndirectConv2d.h" + +#include "arm_compute/core/KernelDescriptors.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.h" +#include "src/gpu/cl/kernels/ClIndirectConv2dKernel.h" +#include "src/gpu/cl/utils/ClAuxTensorHandler.h" +#include "src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h" +#include "src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h" + +using namespace arm_compute::cl_indirect_conv; + +namespace arm_compute +{ +namespace opencl +{ +using namespace arm_compute::experimental; + +namespace +{ +DirectConvComputeKernelInfo +config_indirect_convolution_nhwc(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info) +{ + // Get GPU target + GPUTarget gpu_target = CLScheduler::get().target(); + + std::unique_ptr<IClIndirectConvKernelConfig> t = ClIndirectConvKernelConfigurationFactory::create(gpu_target); + + return t->configure(src, weights, conv_info); +} + +} // namespace + +void ClIndirectConv2d::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src); + ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, act_info); + + // Reuse the direct convolution descriptor + const DirectConvComputeKernelInfo desc = config_indirect_convolution_nhwc(src, weights, conv_info); + + // Configure indirect convolution kernels + auto k0 = std::make_unique<kernels::ClIndirectConv2dAddressPrecalculationKernel>(); + auto k1 = std::make_unique<kernels::ClIndirectConv2dKernel>(); + + k0->set_target(CLScheduler::get().target()); + k1->set_target(CLScheduler::get().target()); + + k0->configure(compile_context, src, weights, &_indirect_buffer, conv_info, desc); + k1->configure(compile_context, src, weights, biases, &_indirect_buffer, dst, conv_info, act_info, desc); + + _addr_precalculation_kernel = std::move(k0); + _indirect_conv_kernel = std::move(k1); + _is_prepared = false; + + // Tune kernels + CLScheduler::get().tune_kernel_static(*_indirect_conv_kernel); + + // Request memory for the indirect buffer + _aux_mem[IndirectBuffer] = + MemoryInfo(offset_int_vec(IndirectBuffer), MemoryLifetime::Persistent, _indirect_buffer.total_size()); +} + +Status ClIndirectConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) +{ + // Initialize the direct convolution descriptor + const DirectConvComputeKernelInfo desc = config_indirect_convolution_nhwc(src, weights, conv_info); + + TensorShape ind_buffer_shape = misc::shape_calculator::compute_indirect_buffer_shape( + src->tensor_shape(), src->data_layout(), weights->tensor_shape(), conv_info, desc); + + TensorInfo indirect_buffer(ind_buffer_shape, 1, DataType::S32); + + ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClIndirectConv2dAddressPrecalculationKernel::validate( + src, weights, &indirect_buffer, conv_info, desc)); + ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClIndirectConv2dKernel::validate(src, weights, biases, &indirect_buffer, dst, + conv_info, act_info, desc)); + + return Status{}; +} + 
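Worth noting from the two functions above: the address-precalculation kernel writes the indirect buffer once in prepare(), while run() only enqueues the convolution kernel against that persistent buffer. A hypothetical NHWC/F32 sketch of the calling side follows; the shapes, literals, and helper name are assumptions rather than part of the patch, and allocation plus ITensorPack wiring are elided.

// Illustrative includes; paths assumed from this patch's layout.
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/TensorInfo.h"
#include "src/gpu/cl/operators/ClIndirectConv2d.h"

void example_indirect_conv2d() // hypothetical helper
{
    using namespace arm_compute;
    // NHWC TensorShape ordering is (C, W, H, N).
    TensorInfo src(TensorShape(16U, 32U, 32U, 1U), 1, DataType::F32);
    TensorInfo weights(TensorShape(16U, 3U, 3U, 8U), 1, DataType::F32); // IFM=16, 3x3, OFM=8
    TensorInfo biases(TensorShape(8U), 1, DataType::F32);
    TensorInfo dst(TensorShape(8U, 30U, 30U, 1U), 1, DataType::F32); // stride 1, no padding
    src.set_data_layout(DataLayout::NHWC);
    weights.set_data_layout(DataLayout::NHWC);
    dst.set_data_layout(DataLayout::NHWC);
    const PadStrideInfo conv_info(1, 1, 0, 0);

    if (bool(opencl::ClIndirectConv2d::validate(&src, &weights, &biases, &dst, conv_info)))
    {
        opencl::ClIndirectConv2d conv;
        conv.configure(CLKernelLibrary::get().get_compile_context(), &src, &weights, &biases, &dst, conv_info);
        // conv.workspace() reports the persistent indirect buffer; prepare() fills it
        // once, and subsequent run() calls reuse it.
    }
}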
+void ClIndirectConv2d::run(ITensorPack &tensors) +{ + CLAuxTensorHandler indirect_buffer(offset_int_vec(IndirectBuffer), _indirect_buffer, tensors, true); + + prepare(tensors); + + ITensorPack indirect_conv2d_pack(tensors); + indirect_conv2d_pack.add_const_tensor(ACL_SRC_3, indirect_buffer.get()); + + // Run indirect convolution + CLScheduler::get().enqueue_op(*_indirect_conv_kernel, indirect_conv2d_pack, true); +} + +void ClIndirectConv2d::prepare(ITensorPack &constants) +{ + if (!_is_prepared) + { + ICLTensor *indirect_buffer_aux = + utils::cast::polymorphic_downcast<ICLTensor *>(constants.get_tensor(offset_int_vec(IndirectBuffer))); + ARM_COMPUTE_ERROR_ON(indirect_buffer_aux == nullptr); + + ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Preparing indirect buffer"); + + CLAuxTensorHandler indirect_buffer(_indirect_buffer, *indirect_buffer_aux); + ARM_COMPUTE_ERROR_ON(indirect_buffer.get()->cl_buffer().get() == nullptr); + + ITensorPack indirect_buffer_pack{{ACL_DST, indirect_buffer.get()}}; + CLScheduler::get().enqueue_op(*_addr_precalculation_kernel, indirect_buffer_pack, true); + + _is_prepared = true; + } +} + +experimental::MemoryRequirements ClIndirectConv2d::workspace() const +{ + return _aux_mem; +} +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClIndirectConv2d.h b/src/gpu/cl/operators/ClIndirectConv2d.h new file mode 100644 index 0000000000..29e796efd9 --- /dev/null +++ b/src/gpu/cl/operators/ClIndirectConv2d.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_INDIRECT_CONV2D_H +#define ARM_COMPUTE_CL_INDIRECT_CONV2D_H + +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/function_info/ActivationLayerInfo.h" +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/CL/CLTypes.h" + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClKernel.h" +#include "src/gpu/cl/IClOperator.h" + +#include <memory> + +namespace arm_compute +{ +// Forward declaration +struct DirectConvComputeKernelInfo; + +namespace opencl +{ +/** Basic function to execute indirect convolution on OpenCL. 
This function calls the following OpenCL kernels: + * + * -# @ref kernels::ClIndirectConv2dAddressPrecalculationKernel + * -# @ref kernels::ClIndirectConv2dKernel + */ +class ClIndirectConv2d : public IClOperator +{ +public: + ClIndirectConv2d() = default; + /** Initialise the kernel's inputs and output + * + * Valid data layouts: + * - NHWC + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:------------|:-----------|:---------|:--------------| + * |F32 |F32 |F32 |F32 | + * |F16 |F16 |F16 |F16 | + * + * @note All tensors must have the same data type. + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor. 3 lower dimensions represent a single src, + * while every optional dimension from 4 and above represents a batch of sources. + * Data types supported: F16/F32. + * @param[in] weights Weights tensor. Weights are a 4D tensor. Data type supported: Same as @p src. + * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. + * Data type supported: Should match @p src data type. + * @param[out] dst Destination tensor. 3 lower dimensions represent a single dst, while the rest represent a batch of destinations. + * Data types supported: Same as @p src. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + * + */ + void configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to ClIndirectConv2d::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &constants) override; + experimental::MemoryRequirements workspace() const override; + +private: + enum AuxTensorIdx + { + IndirectBuffer = 0, + Count + }; + + std::unique_ptr<IClKernel> _indirect_conv_kernel{nullptr}; + std::unique_ptr<IClKernel> _addr_precalculation_kernel{nullptr}; + TensorInfo _indirect_buffer{}; + bool _is_prepared{false}; + experimental::MemoryRequirements _aux_mem{Count}; +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_INDIRECT_CONV2D_H */ diff --git a/src/gpu/cl/operators/ClLogicalNot.cpp b/src/gpu/cl/operators/ClLogicalNot.cpp new file mode 100644 index 0000000000..d8d4186d00 --- /dev/null +++ b/src/gpu/cl/operators/ClLogicalNot.cpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2017-2021 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/operators/ClLogicalNot.h" + +#include "src/common/utils/Log.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/ClElementwiseUnaryKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +void ClLogicalNot::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst) +{ + ARM_COMPUTE_LOG_PARAMS(src, dst); + auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>(); + k->configure(compile_context, src, dst, ElementWiseUnary::LOGICAL_NOT); + _kernel = std::move(k); +} + +Status ClLogicalNot::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::LOGICAL_NOT); +} +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClLogicalNot.h b/src/gpu/cl/operators/ClLogicalNot.h new file mode 100644 index 0000000000..31d4a99be6 --- /dev/null +++ b/src/gpu/cl/operators/ClLogicalNot.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ARM_COMPUTE_CL_LOGICAL_NOT_H +#define ARM_COMPUTE_CL_LOGICAL_NOT_H + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClOperator.h" + +namespace arm_compute +{ +namespace opencl +{ +/** Basic function to run @ref kernels::ClElementWiseUnaryKernel for NOT operation */ +class ClLogicalNot : public IClOperator +{ +public: + /** Configure operator for a given list of arguments + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor info. Data types supported: U8. + * @param[out] dst Destination tensor info. Data types supported: same as @p src. + */ + void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to ClLogicalNot::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_LOGICAL_NOT_H */ diff --git a/src/gpu/cl/operators/ClMatMul.cpp b/src/gpu/cl/operators/ClMatMul.cpp new file mode 100644 index 0000000000..28a2aa2540 --- /dev/null +++ b/src/gpu/cl/operators/ClMatMul.cpp @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
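ClLogicalNot above is a thin wrapper that routes LOGICAL_NOT through the shared element-wise unary kernel. A minimal sketch of its use, with the helper name and shape as illustrative assumptions:

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/TensorInfo.h"
#include "src/gpu/cl/operators/ClLogicalNot.h"

void example_logical_not() // hypothetical helper; U8 is the only documented type
{
    using namespace arm_compute;
    TensorInfo src(TensorShape(32U), 1, DataType::U8);
    TensorInfo dst(TensorShape(32U), 1, DataType::U8);

    if (bool(opencl::ClLogicalNot::validate(&src, &dst)))
    {
        opencl::ClLogicalNot lnot;
        lnot.configure(CLKernelLibrary::get().get_compile_context(), &src, &dst);
    }
}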
+ */ +#include "src/gpu/cl/operators/ClMatMul.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h" +#include "src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.h" +#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h" +#include "src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h" +#include "src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h" +#include "src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h" +#include "src/runtime/heuristics/matmul_native/ClMatMulNativeKernelVariant.h" +#include "src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h" + +using namespace arm_compute::cl_matmul; + +namespace arm_compute +{ +namespace opencl +{ +using namespace arm_compute::opencl::kernels; + +ClMatMul::ClMatMul() +{ +} + +Status ClMatMul::validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + const MatMulInfo &matmul_info, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); + + const GPUTarget gpu_target = CLScheduler::get().target(); + + std::unique_ptr<IClMatMulNativeKernelConfig> t = ClMatMulNativeKernelConfigurationFactory::create(gpu_target); + + const MatMulKernelInfo kernel_info = t->configure(lhs, rhs, matmul_info); + + const auto kernel_selector = ClMatMulNativeKernelVariantFactory::create(gpu_target); + const MatMulKernelType kernel_type = kernel_selector->select_kernel(lhs, rhs, matmul_info, act_info); + + switch (kernel_type) + { + case MatMulKernelType::NATIVE_FP: + return ClMatMulNativeKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info); + case MatMulKernelType::NATIVE_MMUL_FP: + return ClMatMulNativeMMULKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info); + case MatMulKernelType::NATIVE_QUANTIZED: + return ClMatMulLowpNativeKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info); + case MatMulKernelType::NATIVE_MMUL_QUANTIZED: + return ClMatMulLowpNativeMMULKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info); + default: + ARM_COMPUTE_ERROR("Unsupported MatMul Kernel!"); + } +} + +void ClMatMul::configure(const CLCompileContext &compile_context, + ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *dst, + const MatMulInfo &matmul_info, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); + ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst, matmul_info); + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, dst, matmul_info)); + + const GPUTarget gpu_target = CLScheduler::get().target(); + const auto kernel_config = ClMatMulNativeKernelConfigurationFactory::create(gpu_target); + const MatMulKernelInfo kernel_info = kernel_config->configure(lhs, rhs, matmul_info); + + const auto kernel_selector = ClMatMulNativeKernelVariantFactory::create(gpu_target); + const MatMulKernelType kernel_type = kernel_selector->select_kernel(lhs, rhs, matmul_info, act_info); + + switch (kernel_type) + { + case MatMulKernelType::NATIVE_FP: + { + auto kernel = std::make_unique<ClMatMulNativeKernel>(); + 
kernel->set_target(gpu_target); + + kernel->configure(compile_context, lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info); + _matmul_kernel = std::move(kernel); + } + break; + case MatMulKernelType::NATIVE_MMUL_FP: + { + auto kernel = std::make_unique<ClMatMulNativeMMULKernel>(); + kernel->set_target(gpu_target); + + kernel->configure(compile_context, lhs, rhs, nullptr /* bias */, dst, kernel_info); + _matmul_kernel = std::move(kernel); + } + break; + case MatMulKernelType::NATIVE_QUANTIZED: + { + auto kernel = std::make_unique<ClMatMulLowpNativeKernel>(); + kernel->set_target(gpu_target); + + kernel->configure(compile_context, lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info); + _matmul_kernel = std::move(kernel); + } + break; + case MatMulKernelType::NATIVE_MMUL_QUANTIZED: + { + auto kernel = std::make_unique<ClMatMulLowpNativeMMULKernel>(); + kernel->set_target(gpu_target); + + kernel->configure(compile_context, lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info); + _matmul_kernel = std::move(kernel); + } + break; + default: + ARM_COMPUTE_ERROR("Unsupported MatMul Kernel!"); + } +} + +void ClMatMul::run(ITensorPack &tensors) +{ + CLScheduler::get().enqueue_op(*_matmul_kernel, tensors, /* flush */ true); +} + +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClMatMul.h b/src/gpu/cl/operators/ClMatMul.h new file mode 100644 index 0000000000..1733def21c --- /dev/null +++ b/src/gpu/cl/operators/ClMatMul.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_GPU_CL_OPERATORS_CLMATMUL_H +#define ACL_SRC_GPU_CL_OPERATORS_CLMATMUL_H + +#include "arm_compute/function_info/ActivationLayerInfo.h" +#include "arm_compute/function_info/MatMulInfo.h" + +#include "src/gpu/cl/IClKernel.h" +#include "src/gpu/cl/IClOperator.h" + +#include <memory> + +namespace arm_compute +{ +namespace opencl +{ +/** Basic operator to execute BatchMatMul on OpenCL. 
This operator calls the following OpenCL kernels: + * + * -# @ref kernels::ClMatMulNativeKernel + */ +class ClMatMul : public IClOperator +{ +public: + /** Constructor */ + ClMatMul(); + /** Default destructor */ + ~ClMatMul() = default; + /** Initialise the kernel's inputs and output + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |lhs |rhs |dst | + * |:--------------|:--------------|:--------------| + * |F32 |F32 |F32 | + * |F16 |F16 |F16 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED | + * |QASYMM8 |QASYMM8 |QASYMM8 | + * + * @note BatchMatMul: Batched Matrix Multiply - [A * B]. Multiplies all slices (a slice is an element of a batch) of tensors A and B + * and stores the result in the dst tensor of the same batch size. + * Batch here is the number of slices from A and B multiplied at a time; do not confuse it with the batch dimension 'N' of NHWC/NCHW. + * For NHWC, for example, the batch is the higher dimensions H * N and, in general, H * all higher dimensions. + * @note All tensors must have the same data type. + * + * @param[in] compile_context The compile context to be used. + * @param[in] lhs Left-hand side tensor info. Data types supported: F16/F32/QASYMM8_SIGNED/QASYMM8. + * @param[in] rhs Right-hand side tensor info. Data types supported: same as @p lhs. + * @param[out] dst Output tensor to store the result of the batched matrix multiplication. Data types supported: same as @p lhs. + * @param[in] matmul_info Contains MatMul operation information described in @ref MatMulInfo. + * @param[in] act_info Class containing information about fused activation function. + */ + void configure(const CLCompileContext &compile_context, + ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *dst, + const MatMulInfo &matmul_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClMatMul::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + const MatMulInfo &matmul_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + +private: + std::unique_ptr<opencl::IClKernel> _matmul_kernel{nullptr}; +}; +} // namespace opencl +} // namespace arm_compute +#endif // ACL_SRC_GPU_CL_OPERATORS_CLMATMUL_H diff --git a/src/gpu/cl/operators/ClMul.cpp b/src/gpu/cl/operators/ClMul.cpp new file mode 100644 index 0000000000..10cf8a6a38 --- /dev/null +++ b/src/gpu/cl/operators/ClMul.cpp @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
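To ground the BatchMatMul note above, here is a hypothetical F16 batched sketch; the shapes, the batch of 2, and the helper name are illustrative assumptions, and kernel selection (native vs MMUL, floating-point vs lowp) stays internal to the operator:

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/TensorInfo.h"
#include "src/gpu/cl/operators/ClMatMul.h"

void example_matmul() // hypothetical helper
{
    using namespace arm_compute;
    // Width-first shapes: lhs [K, M, batch], rhs [N, K, batch], dst [N, M, batch].
    TensorInfo lhs(TensorShape(64U, 32U, 2U), 1, DataType::F16);
    TensorInfo rhs(TensorShape(16U, 64U, 2U), 1, DataType::F16);
    TensorInfo dst(TensorShape(16U, 32U, 2U), 1, DataType::F16);
    const MatMulInfo info{}; // defaults: no adjoint on either operand

    if (bool(opencl::ClMatMul::validate(&lhs, &rhs, &dst, info)))
    {
        opencl::ClMatMul mm;
        mm.configure(CLKernelLibrary::get().get_compile_context(), &lhs, &rhs, &dst, info);
        // run() then enqueues whichever kernel the GPU-target heuristics selected.
    }
}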
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/operators/ClMul.h" + +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/ClMulKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +void ClMul::configure(const CLCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info); + auto k = std::make_unique<kernels::ClMulKernel>(); + k->configure(compile_context, src1, src2, dst, scale, overflow_policy, rounding_policy, act_info); + _kernel = std::move(k); +} + +Status ClMul::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info) +{ + return kernels::ClMulKernel::validate(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info); +} + +void ClComplexMul::configure(const CLCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info) +{ + auto k = std::make_unique<kernels::ClComplexMulKernel>(); + k->configure(compile_context, src1, src2, dst, act_info); + _kernel = std::move(k); +} + +Status ClComplexMul::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info) +{ + return kernels::ClComplexMulKernel::validate(src1, src2, dst, act_info); +} +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClMul.h b/src/gpu/cl/operators/ClMul.h new file mode 100644 index 0000000000..1cf4d68d4c --- /dev/null +++ b/src/gpu/cl/operators/ClMul.h @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2021, 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ARM_COMPUTE_CL_MUL_H +#define ARM_COMPUTE_CL_MUL_H + +#include "arm_compute/core/Types.h" +#include "arm_compute/function_info/ActivationLayerInfo.h" + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClOperator.h" + +namespace arm_compute +{ +namespace opencl +{ +/** Basic function to run @ref opencl::kernels::ClMulKernel */ +class ClMul : public IClOperator +{ +public: + /** Initialise the kernel's sources, dst and conversion policy. + * + * Valid configurations (src1,src2) -> Output: + * + * - (U8,U8) -> U8 + * - (U8,U8) -> S16 + * - (U8,S16) -> S16 + * - (S16,U8) -> S16 + * - (S16,S16) -> S16 + * - (F16,F16) -> F16 + * - (F32,F32) -> F32 + * - (QASYMM8,QASYMM8) -> QASYMM8 + * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED + * - (QSYMM16,QSYMM16) -> QSYMM16 + * - (QSYMM16,QSYMM16) -> S32 + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] src1 An src tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32. + * The src tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[in, out] src2 An src tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32. + * The src tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[out] dst The dst tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32. + * @param[in] scale Scale to apply after multiplication. + * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. + * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate + * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + */ + void configure(const CLCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClMul::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); +}; + +/** Basic function to run @ref opencl::kernels::ClComplexMulKernel */ +class ClComplexMul : public IClOperator +{ +public: + /** Initialise the kernel's sources, dst. + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] src1 An src tensor info. Data types supported: F16/F32. Number of channels supported: 2. + * The src tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[in, out] src2 An src tensor info. Data types supported: same as @p src1. Number of channels supported: same as @p src1. + * The src tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[out] dst The dst tensor info. Data types supported: same as @p src1. Number of channels supported: same as @p src1.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + */ + void configure(const CLCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClComplexMul::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_MUL_H */ diff --git a/src/gpu/cl/operators/ClPRelu.cpp b/src/gpu/cl/operators/ClPRelu.cpp new file mode 100644 index 0000000000..f3efd00bba --- /dev/null +++ b/src/gpu/cl/operators/ClPRelu.cpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/operators/ClPRelu.h" + +#include "src/common/utils/Log.h" +#include "src/gpu/cl/kernels/ClElementwiseKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +using KernelType = kernels::ClArithmeticKernel; +void ClPRelu::configure(const CLCompileContext &compile_context, + ITensorInfo *input, + ITensorInfo *alpha, + ITensorInfo *output) +{ + ARM_COMPUTE_LOG_PARAMS(input, alpha, output); + auto k = std::make_unique<KernelType>(); + k->configure(compile_context, ArithmeticOperation::PRELU, input, alpha, (output == nullptr ? input : output)); + _kernel = std::move(k); +} + +Status ClPRelu::validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output) +{ + return KernelType::validate(ArithmeticOperation::PRELU, input, alpha, (output == nullptr ? input : output)); +} + +void ClPRelu::run(ITensorPack &tensors) +{ + // Output tensor can be given as nullptr for in-place computation. + // In this case, get the input tensor and use it as the output tensor. 
+ if (tensors.get_tensor(TensorType::ACL_DST) == nullptr) + { + auto src_tensor = const_cast<ITensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + ARM_COMPUTE_ERROR_ON_MSG(src_tensor == nullptr, "invalid source tensor is given for in-place computation"); + tensors.add_tensor(TensorType::ACL_DST, src_tensor); + } + IClOperator::run(tensors); +} +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClPRelu.h b/src/gpu/cl/operators/ClPRelu.h new file mode 100644 index 0000000000..45ce858fb0 --- /dev/null +++ b/src/gpu/cl/operators/ClPRelu.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_PRELU_H +#define ARM_COMPUTE_CL_PRELU_H + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClOperator.h" + +namespace arm_compute +{ +namespace opencl +{ +/** Basic operator to run @ref arm_compute::opencl::kernels::ClArithmeticKernel for PRELU + * + * @note The operator implements an activation layer with the PRELU activation function. + */ +class ClPRelu : public IClOperator +{ +public: + /** Set the input and output tensor. + * + * @note If the output tensor is a nullptr or is equal to the input, the activation function will be performed in-place + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] alpha PRelu layer parameters. Data types supported: same as @p input. + * @param[out] output Destination tensor. Data type supported: same as @p input. + */ + void + configure(const CLCompileContext &compile_context, ITensorInfo *input, ITensorInfo *alpha, ITensorInfo *output); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to ClPRelu::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_PRELU_H */ diff --git a/src/gpu/cl/operators/ClPermute.cpp b/src/gpu/cl/operators/ClPermute.cpp new file mode 100644 index 0000000000..3851e22b6a --- /dev/null +++ b/src/gpu/cl/operators/ClPermute.cpp @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2021 Arm Limited.
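The in-place branch in ClPRelu::run() above only triggers when no destination is bound. A short sketch of that path (the shapes, the per-channel alpha broadcast, and the helper name are illustrative assumptions):

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/TensorInfo.h"
#include "src/gpu/cl/operators/ClPRelu.h"

void example_prelu_in_place() // hypothetical helper
{
    using namespace arm_compute;
    TensorInfo input(TensorShape(8U, 8U, 3U), 1, DataType::F32);
    TensorInfo alpha(TensorShape(1U, 1U, 3U), 1, DataType::F32); // one slope per channel

    // A null output selects the in-place path at both validate and configure time.
    if (bool(opencl::ClPRelu::validate(&input, &alpha, nullptr)))
    {
        opencl::ClPRelu prelu;
        prelu.configure(CLKernelLibrary::get().get_compile_context(), &input, &alpha, nullptr);
        // At run time, omitting ACL_DST from the ITensorPack makes run() register
        // the ACL_SRC_0 tensor as the destination before dispatching.
    }
}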
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/operators/ClPermute.h" + +#include "src/common/utils/Log.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/ClPermuteKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +void ClPermute::configure(const ClCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const PermutationVector &perm) +{ + ARM_COMPUTE_LOG_PARAMS(src, dst, perm); + auto k = std::make_unique<kernels::ClPermuteKernel>(); + k->configure(compile_context, src, dst, perm); + _kernel = std::move(k); +} + +Status ClPermute::validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm) +{ + return kernels::ClPermuteKernel::validate(src, dst, perm); +} +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClPermute.h b/src/gpu/cl/operators/ClPermute.h new file mode 100644 index 0000000000..6349358a18 --- /dev/null +++ b/src/gpu/cl/operators/ClPermute.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ARM_COMPUTE_CL_PERMUTE_H +#define ARM_COMPUTE_CL_PERMUTE_H + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClOperator.h" + +namespace arm_compute +{ +namespace opencl +{ +/** Basic function to run @ref kernels::ClPermuteKernel */ +class ClPermute : public IClOperator +{ +public: + /** Initialise the kernel's inputs and outputs and permute vector + * + * @note Arbitrary permutation vectors are supported with rank not greater than 4 + * + * @param[in] compile_context The compile context to be used. + * @param[in] src The src tensor info. Data types supported: All. + * @param[in] dst The dst tensor info. Data types supported: Same as @p src + * @param[in] perm Permutation vector + */ + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const PermutationVector &perm); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to ClPermute::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm); +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_PERMUTE_H */ diff --git a/src/gpu/cl/operators/ClPool2d.cpp b/src/gpu/cl/operators/ClPool2d.cpp new file mode 100644 index 0000000000..e4507dc1a1 --- /dev/null +++ b/src/gpu/cl/operators/ClPool2d.cpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
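For the ClPermute operator above, a sketch of an NCHW to NHWC rearrangement; the (2, 0, 1) vector is the convention used elsewhere in the library for this conversion and is assumed here, as are the shapes and the helper name:

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/TensorInfo.h"
#include "src/gpu/cl/operators/ClPermute.h"

void example_permute_nchw_to_nhwc() // hypothetical helper
{
    using namespace arm_compute;
    TensorInfo src(TensorShape(4U, 3U, 2U), 1, DataType::F32); // (W, H, C) under NCHW
    TensorInfo dst(TensorShape(2U, 4U, 3U), 1, DataType::F32); // (C, W, H) under NHWC
    const PermutationVector perm(2U, 0U, 1U);

    if (bool(opencl::ClPermute::validate(&src, &dst, perm)))
    {
        opencl::ClPermute permute;
        permute.configure(CLKernelLibrary::get().get_compile_context(), &src, &dst, perm);
    }
}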
+ */ +#include "src/gpu/cl/operators/ClPool2d.h" + +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/ClPool2dKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +void ClPool2d::configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const PoolingLayerInfo &info, + ITensorInfo *indices) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src); + ARM_COMPUTE_LOG_PARAMS(src, dst, info, indices); + + // Configure pooling kernel + auto k = std::make_unique<kernels::ClPool2dKernel>(); + k->set_target(CLScheduler::get().target()); + k->configure(compile_context, src, dst, info, indices); + _kernel = std::move(k); + + // Tune kernels + CLScheduler::get().tune_kernel_static(*_kernel); +} + +Status ClPool2d::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const PoolingLayerInfo &info, + const ITensorInfo *indices) +{ + return kernels::ClPool2dKernel::validate(src, dst, info, indices); +} +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClPool2d.h b/src/gpu/cl/operators/ClPool2d.h new file mode 100644 index 0000000000..9c2fd1c3f2 --- /dev/null +++ b/src/gpu/cl/operators/ClPool2d.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_POOL2D_H +#define ARM_COMPUTE_CL_POOL2D_H + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClOperator.h" + +#include <memory> + +namespace arm_compute +{ +namespace opencl +{ +/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following OpenCL kernels: + * + * -# @ref opencl::kernels::ClPool2dKernel + */ +class ClPool2d : public IClOperator +{ +public: + /** Constructor */ + ClPool2d() = default; + /** Configure operator for a given list of arguments + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[out] dst Destination tensor info. Data type supported: same as @p src. + * @param[in] info Pooling layer parameters. + * @param[out] indices (Optional) The indices info of the maximal values. Data type supported: U32.
+ */ + void configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const PoolingLayerInfo &info, + ITensorInfo *indices = nullptr); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to ClPool2d::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + const PoolingLayerInfo &info, + const ITensorInfo *indices = nullptr); +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_POOL2D_H */ diff --git a/src/gpu/cl/operators/ClPool3d.cpp b/src/gpu/cl/operators/ClPool3d.cpp new file mode 100644 index 0000000000..d230413659 --- /dev/null +++ b/src/gpu/cl/operators/ClPool3d.cpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/operators/ClPool3d.h" + +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/ClPool3dKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +void ClPool3d::configure(const ClCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const Pooling3dLayerInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src); + ARM_COMPUTE_LOG_PARAMS(src, dst, info); + + // Configure pooling kernel + auto k = std::make_unique<kernels::ClPool3dKernel>(); + k->set_target(CLScheduler::get().target()); + k->configure(compile_context, src, dst, info); + _kernel = std::move(k); + + // Tune kernels + CLScheduler::get().tune_kernel_static(*_kernel); +} + +Status ClPool3d::validate(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &info) +{ + return kernels::ClPool3dKernel::validate(src, dst, info); +} +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClPool3d.h b/src/gpu/cl/operators/ClPool3d.h new file mode 100644 index 0000000000..9fd78bfd69 --- /dev/null +++ b/src/gpu/cl/operators/ClPool3d.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_POOL3D_H +#define ARM_COMPUTE_CL_POOL3D_H + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClOperator.h" + +#include <memory> + +namespace arm_compute +{ +namespace opencl +{ +/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following OpenCL kernels: + * + * -# @ref kernels::ClPool3dKernel + */ +class ClPool3d : public IClOperator +{ +public: + /** Constructor */ + ClPool3d() = default; + /** Configure operator for a given list of arguments + * + * @note Asymmetric padding is not supported when dimension rounding type == CEIL. + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor info. + * @param[out] dst Destination tensor info. + * @param[in] info 3D pooling layer parameters. + */ + void configure(const ClCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const Pooling3dLayerInfo &info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to ClPool3d::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &info); +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_POOL3D_H */ diff --git a/src/gpu/cl/operators/ClQuantize.cpp b/src/gpu/cl/operators/ClQuantize.cpp new file mode 100644 index 0000000000..8560b5553e --- /dev/null +++ b/src/gpu/cl/operators/ClQuantize.cpp @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/operators/ClQuantize.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/ClQuantizeKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +void ClQuantize::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst) +{ + ARM_COMPUTE_LOG_PARAMS(src, dst); + auto k = std::make_unique<kernels::ClQuantizeKernel>(); + k->configure(compile_context, src, dst); + _kernel = std::move(k); +} + +Status ClQuantize::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + return kernels::ClQuantizeKernel::validate(src, dst); +} + +void ClQuantize::run(ITensorPack &tensors) +{ + ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); + CLScheduler::get().enqueue_op(*_kernel.get(), tensors); +} +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClQuantize.h b/src/gpu/cl/operators/ClQuantize.h new file mode 100644 index 0000000000..3e50fcefb3 --- /dev/null +++ b/src/gpu/cl/operators/ClQuantize.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_QUANTIZE_H +#define ARM_COMPUTE_CL_QUANTIZE_H + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClOperator.h" + +namespace arm_compute +{ +namespace opencl +{ +/** Basic function to run @ref kernels::ClQuantizeKernel that quantizes an input tensor */ +class ClQuantize : public IClOperator +{ +public: + /** Set the input and output tensors. + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor. The dimensions over the third will be interpreted as batches. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[out] dst Destination tensor with the same dimensions as the input.
Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16. + * + * @note Output auto initialization is not supported by this function + */ + void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClQuantize::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); + + // Inherited method overridden + void run(ITensorPack &tensors) override; +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_QUANTIZE_H */ diff --git a/src/gpu/cl/operators/ClReshape.cpp b/src/gpu/cl/operators/ClReshape.cpp new file mode 100644 index 0000000000..1dd5b760cb --- /dev/null +++ b/src/gpu/cl/operators/ClReshape.cpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/operators/ClReshape.h" + +#include "src/common/utils/Log.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/ClReshapeKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +void ClReshape::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst) +{ + ARM_COMPUTE_LOG_PARAMS(src, dst); + auto k = std::make_unique<kernels::ClReshapeKernel>(); + k->configure(compile_context, src, dst); + _kernel = std::move(k); +} + +Status ClReshape::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + return kernels::ClReshapeKernel::validate(src, dst); +} +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClReshape.h b/src/gpu/cl/operators/ClReshape.h new file mode 100644 index 0000000000..fee69a1c24 --- /dev/null +++ b/src/gpu/cl/operators/ClReshape.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_RESHAPE_H +#define ARM_COMPUTE_CL_RESHAPE_H + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClOperator.h" + +namespace arm_compute +{ +namespace opencl +{ +/** Basic function to run @ref kernels::ClReshapeKernel */ +class ClReshape : public IClOperator +{ +public: + /** Initialise the kernel's inputs and outputs + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Input tensor info. Data type supported: All + * @param[out] output Output info. Data type supported: Same as @p input + */ + void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to ClReshape::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output); +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_RESHAPE_H */
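The operators above share one calling pattern: validate() and configure() work on ITensorInfo descriptors only, and concrete tensors are bound at run time through an ITensorPack. A minimal end-to-end sketch using ClReshape (editor's illustration, not part of this patch; the generic ACL_SRC/ACL_DST pack keys, the CLTensor plumbing and an already-initialised CL runtime are assumptions based on how the library is used elsewhere):

    // Describe a 4x4 F32 tensor reshaped to a flat 16-element tensor.
    TensorInfo src_info(TensorShape(4U, 4U), 1, DataType::F32);
    TensorInfo dst_info(TensorShape(16U), 1, DataType::F32);

    ClReshape reshape;
    if (bool(ClReshape::validate(&src_info, &dst_info)))
    {
        reshape.configure(CLKernelLibrary::get().get_compile_context(), &src_info, &dst_info);
    }

    // Bind the backing CL tensors only when running.
    // (CL runtime initialisation, e.g. CLScheduler::get().default_init(), is elided.)
    CLTensor src, dst;
    src.allocator()->init(src_info);
    dst.allocator()->init(dst_info);
    src.allocator()->allocate();
    dst.allocator()->allocate();
    ITensorPack pack{{TensorType::ACL_SRC, &src}, {TensorType::ACL_DST, &dst}};
    reshape.run(pack);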
\ No newline at end of file diff --git a/src/gpu/cl/operators/ClScale.cpp b/src/gpu/cl/operators/ClScale.cpp new file mode 100644 index 0000000000..184e2aa006 --- /dev/null +++ b/src/gpu/cl/operators/ClScale.cpp @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/operators/ClScale.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/ClScaleKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +void ClScale::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const ScaleKernelInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src); + ARM_COMPUTE_LOG_PARAMS(src, dst, info); + + // Configure Scale kernel + auto k = std::make_unique<kernels::ClScaleKernel>(); + k->set_target(CLScheduler::get().target()); + k->configure(compile_context, src, dst, info); + _kernel = std::move(k); + + // Tune kernel + CLScheduler::get().tune_kernel_static(*_kernel); +} + +Status ClScale::validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info) +{ + return kernels::ClScaleKernel::validate(src, dst, info); +} + +void ClScale::run(ITensorPack &tensors) +{ + ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); + CLScheduler::get().enqueue_op(*_kernel.get(), tensors); +} +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClScale.h b/src/gpu/cl/operators/ClScale.h new file mode 100644 index 0000000000..1427bb4fdc --- /dev/null +++ b/src/gpu/cl/operators/ClScale.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_SCALE_H +#define ARM_COMPUTE_CL_SCALE_H + +#include "arm_compute/core/KernelDescriptors.h" + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClOperator.h" + +namespace arm_compute +{ +namespace opencl +{ +/** Basic function to simulate a scale layer. This function calls the following OpenCL kernels: + * + * -# @ref kernels::ClScaleKernel + */ +class ClScale : public IClOperator +{ +public: + /** Constructor */ + ClScale() = default; + /** Initialize the function's source, destination, interpolation type and border mode. + * + * @param[in] compile_context The compile context to be used. + * @param[in,out] src Source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32. (Written to only when the border mode in @p info is not UNDEFINED) + * @param[out] dst Destination tensor info. Data types supported: Same as @p src + * All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane. + * @param[in] info @ref ScaleKernelInfo descriptor to be used to configure this function + */ + void + configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to ClScale::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info); + + // Inherited method overridden + void run(ITensorPack &tensors) override; +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_SCALE_H */ diff --git a/src/gpu/cl/operators/ClScatter.cpp b/src/gpu/cl/operators/ClScatter.cpp new file mode 100644 index 0000000000..a11ecd7e6a --- /dev/null +++ b/src/gpu/cl/operators/ClScatter.cpp @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ */ +#include "src/gpu/cl/operators/ClScatter.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/gpu/cl/kernels/ClCopyKernel.h" +#include "src/gpu/cl/kernels/ClFillKernel.h" +#include "src/gpu/cl/kernels/ClScatterKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +using namespace arm_compute::opencl::kernels; + +ClScatter::ClScatter() +{ +} + +Status ClScatter::validate(const ITensorInfo *src, + const ITensorInfo *updates, + const ITensorInfo *indices, + const ITensorInfo *dst, + const ScatterInfo &info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(updates, indices, dst); + if (src != nullptr) + { + // Check dst/src are same shape and datatype. + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(src->tensor_shape(), dst->tensor_shape()); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, updates, dst); + ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClCopyKernel::validate(src, dst)); // Validate Copy kernel + } + if (src != dst) + { + ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClFillKernel::validate(dst, PixelValue(0.0f))); // Validate Fill kernel. + } + + return kernels::ClScatterKernel::validate(updates, indices, dst, info); +} + +void ClScatter::configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + const ITensorInfo *updates, + const ITensorInfo *indices, + ITensorInfo *dst, + const ScatterInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(updates, indices, dst); + ARM_COMPUTE_LOG_PARAMS(src, indices, dst, info); + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(validate(src, updates, indices, dst, info)); + _fill_zero = info.zero_initialization; + + // If necessary, create fill kernel to fill dst tensor. + if (_fill_zero) + { + auto f = std::make_unique<kernels::ClFillKernel>(); + f->configure(compile_context, dst, PixelValue(0.0f)); + _fill_kernel = std::move(f); + } + else if (src != dst) // Check whether copying is necessary + { + // Fill dst with src copy here. + auto j = std::make_unique<kernels::ClCopyKernel>(); + j->configure(compile_context, src, dst); + _copy_kernel = std::move(j); + _run_copy = true; + } + + // Configure ClScatterKernel + auto k = std::make_unique<kernels::ClScatterKernel>(); + k->set_target(CLScheduler::get().target()); + k->configure(compile_context, updates, indices, dst, info); + _scatter_kernel = std::move(k); +} + +void ClScatter::run(ITensorPack &tensors) +{ + // Get tensors. + auto src = tensors.get_const_tensor(ACL_SRC_0); + auto updates = tensors.get_const_tensor(ACL_SRC_1); + auto indices = tensors.get_const_tensor(ACL_SRC_2); + auto dst = tensors.get_tensor(ACL_DST); + + if (_fill_zero) + { + // Fill destination tensor with 0 values if zero init. + ITensorPack fill_pack{{ACL_SRC, dst}}; + CLScheduler::get().enqueue_op(*_fill_kernel, fill_pack, false); + } + + if (_run_copy) + { + // copy src to dst before scatter op. + ITensorPack copy_pack{{ACL_SRC, src}, {ACL_DST, dst}}; + CLScheduler::get().enqueue_op(*_copy_kernel, copy_pack, false); + } + + ITensorPack scatter_pack{{ACL_SRC_0, updates}, {ACL_SRC_1, indices}, {ACL_DST, dst}}; + CLScheduler::get().enqueue_op(*_scatter_kernel, scatter_pack, false); +} + +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClScatter.h b/src/gpu/cl/operators/ClScatter.h new file mode 100644 index 0000000000..a1b32fed45 --- /dev/null +++ b/src/gpu/cl/operators/ClScatter.h @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2024 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef ACL_SRC_GPU_CL_OPERATORS_CLSCATTER_H +#define ACL_SRC_GPU_CL_OPERATORS_CLSCATTER_H + +#include "arm_compute/function_info/ScatterInfo.h" + +#include "src/gpu/cl/IClKernel.h" +#include "src/gpu/cl/IClOperator.h" + +#include <memory> + +namespace arm_compute +{ +namespace opencl +{ +// Forward declaration +class ClFillKernel; +class ClScatterKernel; +class ClCopyKernel; + +/** Basic operator to execute Scatter on OpenCL. This operator calls the following OpenCL kernels: + * + * -# @ref kernels::ClScatterKernel + */ +class ClScatter : public IClOperator +{ +public: + /** Constructor */ + ClScatter(); + /** Default destructor */ + ~ClScatter() = default; + /** Initialise the kernel's inputs and output + * + * Valid data layouts: + * - All + * + * @note indices must always be S32. + * @note Negative indices are treated as out of bounds. + * @note src, updates and dst tensors must be same datatype. + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source input tensor info. Can be nullptr when using "Add" Scatter Function with zero initialization. + * @param[in] updates Tensor info for tensor storing update values to use for scatter function. Data types supported: same as @p src. + * @param[in] indices Tensor info for tensor storing indices to use for scatter function. Data types supported: S32 only. + * @param[out] dst Output tensor to store the result of the Scatter Function. Data types supported: same as @p src and @p updates. + * @param[in] Scatter_info Contains Scatter operation information described in @ref ScatterInfo. 
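+ *
+ * A minimal usage sketch (editor's illustration, not part of this patch; the shapes are hypothetical).
+ * The run-time pack keys in the comment mirror the ones read back by ClScatter::run():
+ * @code
+ * TensorInfo src_info(TensorShape(6U), 1, DataType::F32);
+ * TensorInfo updates_info(TensorShape(2U), 1, DataType::F32);
+ * TensorInfo indices_info(TensorShape(1U, 2U), 1, DataType::S32);
+ * TensorInfo dst_info(TensorShape(6U), 1, DataType::F32);
+ * ScatterInfo scatter_info(ScatterFunction::Add, false);
+ *
+ * ClScatter scatter;
+ * scatter.configure(CLKernelLibrary::get().get_compile_context(), &src_info, &updates_info,
+ *                   &indices_info, &dst_info, scatter_info);
+ * // ITensorPack pack{{ACL_SRC_0, &src}, {ACL_SRC_1, &updates}, {ACL_SRC_2, &indices}, {ACL_DST, &dst}};
+ * // scatter.run(pack);
+ * @endcode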
+ */ + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + const ITensorInfo *updates, + const ITensorInfo *indices, + ITensorInfo *dst, + const ScatterInfo &Scatter_info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClScatter::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, + const ITensorInfo *updates, + const ITensorInfo *indices, + const ITensorInfo *dst, + const ScatterInfo &Scatter_info); + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + +private: + std::unique_ptr<opencl::IClKernel> _scatter_kernel{nullptr}; + std::unique_ptr<opencl::IClKernel> _fill_kernel{nullptr}; + std::unique_ptr<opencl::IClKernel> _copy_kernel{nullptr}; + bool _fill_zero{false}; + bool _run_copy{false}; +}; +} // namespace opencl +} // namespace arm_compute +#endif // ACL_SRC_GPU_CL_OPERATORS_CLSCATTER_H diff --git a/src/gpu/cl/operators/ClSoftmax.cpp b/src/gpu/cl/operators/ClSoftmax.cpp new file mode 100644 index 0000000000..427f6b4f92 --- /dev/null +++ b/src/gpu/cl/operators/ClSoftmax.cpp @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2021, 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/gpu/cl/operators/ClSoftmax.h" + +#include "arm_compute/core/experimental/Types.h" +#include "arm_compute/core/ITensorPack.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/gpu/cl/kernels/ClSoftmaxKernel.h" +#include "src/gpu/cl/utils/ClAuxTensorHandler.h" + +using namespace arm_compute::experimental; + +namespace arm_compute +{ +namespace opencl +{ + +ClSoftmax::ClSoftmax() : _aux_mem(InternalTensorIdx::COUNT) +{ +} + +void ClSoftmax::configure(const CLCompileContext &compile_context, + const ITensorInfo &src, + ITensorInfo &dst, + const SoftmaxKernelInfo &info) +{ + ARM_COMPUTE_LOG_PARAMS(src, dst, info); + + auto k = std::make_unique<kernels::ClSoftmaxKernel>(); + k->configure(compile_context, src, dst, info); + + _tmp_info = k->tmp_tensor_info(); + + _kernel = std::move(k); + + _aux_mem[InternalTensorIdx::TMP] = + MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp_info.total_size()); +} + +Status ClSoftmax::validate(const ITensorInfo &src, const ITensorInfo &dst, const SoftmaxKernelInfo &info) +{ + return kernels::ClSoftmaxKernel::validate(src, dst, info); +} + +void ClSoftmax::run(ITensorPack &tensors) +{ + CLAuxTensorHandler tmp(offset_int_vec(InternalTensorIdx::TMP), _tmp_info, tensors); + + tensors.add_tensor(TensorType::ACL_INT_0, tmp.get()); + + CLScheduler::get().enqueue_op(*_kernel, tensors, false); +} + +experimental::MemoryRequirements ClSoftmax::workspace() const +{ + return _aux_mem; +} + +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClSoftmax.h b/src/gpu/cl/operators/ClSoftmax.h new file mode 100644 index 0000000000..232fcfebd1 --- /dev/null +++ b/src/gpu/cl/operators/ClSoftmax.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2021, 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ACL_SRC_GPU_CL_OPERATORS_CLSOFTMAX_H +#define ACL_SRC_GPU_CL_OPERATORS_CLSOFTMAX_H + +#include "arm_compute/core/experimental/Types.h" +#include "arm_compute/runtime/CL/CLTensor.h" + +#include "src/gpu/cl/IClOperator.h" + +namespace arm_compute +{ +class CLCompileContext; +class ITensorInfo; +class ITensorPack; +struct SoftmaxKernelInfo; + +namespace opencl +{ +namespace kernels +{ +class ClSoftmaxKernel; +} // namespace kernels +class ClSoftmax : public IClOperator +{ +public: + /** Constructor */ + ClSoftmax(); + /** Configure the operator + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32 for Softmax and F16/F32 for Log Softmax + * @param[out] dst Destination tensor info. Data types supported: same as @p src + * @param[in] info Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo. + */ + void configure(const CLCompileContext &compile_context, + const ITensorInfo &src, + ITensorInfo &dst, + const SoftmaxKernelInfo &info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to ClSoftmax::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo &src, const ITensorInfo &dst, const SoftmaxKernelInfo &info); + + void run(ITensorPack &tensors) override; + + experimental::MemoryRequirements workspace() const override; + +private: + enum InternalTensorIdx + { + TMP = 0, + COUNT, + }; + + TensorInfo _tmp_info{}; + experimental::MemoryRequirements _aux_mem; +}; + +} // namespace opencl +} // namespace arm_compute +#endif // ACL_SRC_GPU_CL_OPERATORS_CLSOFTMAX_H diff --git a/src/gpu/cl/operators/ClSub.cpp b/src/gpu/cl/operators/ClSub.cpp new file mode 100644 index 0000000000..5c6d0c3184 --- /dev/null +++ b/src/gpu/cl/operators/ClSub.cpp @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/gpu/cl/operators/ClSub.h" + +#include "src/common/utils/Log.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/ClElementwiseKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +void ClSub::configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, policy, act_info); + auto k = std::make_unique<kernels::ClSaturatedArithmeticKernel>(); + k->configure(compile_context, ArithmeticOperation::SUB, src1, src2, dst, policy, act_info); + _kernel = std::move(k); +} + +Status ClSub::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) +{ + return kernels::ClSaturatedArithmeticKernel::validate(ArithmeticOperation::SUB, src1, src2, dst, policy, act_info); +} +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClSub.h b/src/gpu/cl/operators/ClSub.h new file mode 100644 index 0000000000..6a97275b86 --- /dev/null +++ b/src/gpu/cl/operators/ClSub.h @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2021, 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_SUB_H +#define ARM_COMPUTE_CL_SUB_H + +#include "arm_compute/function_info/ActivationLayerInfo.h" + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClOperator.h" + +namespace arm_compute +{ +namespace opencl +{ +/** Basic function to run arithmetic subtraction + * + * @note The tensor data type for the inputs must be U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. + * @note The function performs an arithmetic subtraction between two tensors. + */ +class ClSub : public IClOperator +{ +public: + /** Configure function for a given list of arguments. + * + * Valid configurations (src1,src2) -> dst : + * + * - (U8,U8) -> U8 + * - (U8,U8) -> S16 + * - (S16,U8) -> S16 + * - (U8,S16) -> S16 + * - (S16,S16) -> S16 + * - (S32,S32) -> S32 + * - (F16,F16) -> F16 + * - (F32,F32) -> F32 + * - (QASYMM8,QASYMM8) -> QASYMM8 + * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED + * - (QSYMM16,QSYMM16) -> QSYMM16 + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] src1 First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. 
+ * The source tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[in, out] src2 Second source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. + * The source tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[out] dst Destination tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. + * @param[in] policy Policy to use to handle overflow. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + */ + void configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClSub::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_SUB_H */ diff --git a/src/gpu/cl/operators/ClTranspose.cpp b/src/gpu/cl/operators/ClTranspose.cpp new file mode 100644 index 0000000000..28da0d640a --- /dev/null +++ b/src/gpu/cl/operators/ClTranspose.cpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
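The valid-configuration list in ClSub.h above pins down the supported type combinations; a small F32 sketch with a fused activation (editor's illustration, not part of this patch; names, shapes and the choice of ConvertPolicy are hypothetical):

    TensorInfo a_info(TensorShape(8U, 8U), 1, DataType::F32);
    TensorInfo b_info(TensorShape(8U, 8U), 1, DataType::F32);
    TensorInfo out_info(TensorShape(8U, 8U), 1, DataType::F32);
    ActivationLayerInfo relu(ActivationLayerInfo::ActivationFunction::RELU);

    ClSub sub;
    if (bool(ClSub::validate(&a_info, &b_info, &out_info, ConvertPolicy::SATURATE, relu)))
    {
        sub.configure(CLKernelLibrary::get().get_compile_context(), &a_info, &b_info, &out_info,
                      ConvertPolicy::SATURATE, relu);
    }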
+ */ +#include "src/gpu/cl/operators/ClTranspose.h" + +#include "src/common/utils/Log.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/ClTransposeKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +void ClTranspose::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst) +{ + ARM_COMPUTE_LOG_PARAMS(src, dst); + auto k = std::make_unique<kernels::ClTransposeKernel>(); + k->configure(compile_context, src, dst); + _kernel = std::move(k); +} + +Status ClTranspose::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + return kernels::ClTransposeKernel::validate(src, dst); +} +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClTranspose.h b/src/gpu/cl/operators/ClTranspose.h new file mode 100644 index 0000000000..3642fc23f9 --- /dev/null +++ b/src/gpu/cl/operators/ClTranspose.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_TRANSPOSE_H +#define ARM_COMPUTE_CL_TRANSPOSE_H + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClOperator.h" + +namespace arm_compute +{ +namespace opencl +{ +/** Basic function to run @ref kernels::ClTransposeKernel */ +class ClTranspose : public IClOperator +{ +public: + /** Initialise the kernel's inputs and outputs + * + * @param[in] compile_context The compile context to be used. + * @param[in] src The src tensor info. Data types supported: All. + * @param[in] dst The dst tensor info. Data types supported: Same as @p src + */ + void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to ClTranspose::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_TRANSPOSE_H */ diff --git a/src/gpu/cl/operators/ClTransposedConvolution.cpp b/src/gpu/cl/operators/ClTransposedConvolution.cpp new file mode 100644 index 0000000000..cec438faeb --- /dev/null +++ b/src/gpu/cl/operators/ClTransposedConvolution.cpp @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/operators/ClTransposedConvolution.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/gpu/cl/kernels/ClTransposedConvolutionKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +void ClTransposedConvolution::configure(const CLCompileContext &compile_context, + const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *output, + const PadStrideInfo &deconv_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input); + ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, deconv_info); + auto kernel_object = std::make_unique<kernels::ClTransposedConvolutionKernel>(); + kernel_object->set_target(CLScheduler::get().target()); + kernel_object->configure(compile_context, input, weights, biases, output, deconv_info); + _transposed_conv_kernel = std::move(kernel_object); +} + +Status ClTransposedConvolution::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &deconv_info) +{ + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::ClTransposedConvolutionKernel::validate(input, weights, biases, output, deconv_info)); + return Status{}; +} + +void ClTransposedConvolution::run(ITensorPack &tensors) +{ + CLScheduler::get().enqueue_op(*_transposed_conv_kernel.get(), tensors, false); +} +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClTransposedConvolution.h b/src/gpu/cl/operators/ClTransposedConvolution.h new file mode 100644 index 0000000000..660c4f85c1 --- /dev/null +++ b/src/gpu/cl/operators/ClTransposedConvolution.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_TRANSPOSED_CONVOLUTION_H +#define ARM_COMPUTE_CL_TRANSPOSED_CONVOLUTION_H + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClKernel.h" +#include "src/gpu/cl/IClOperator.h" +namespace arm_compute +{ +namespace opencl +{ +/** Basic function to simulate a transposed convolution layer. This function calls the following OpenCL kernels: + * + * -# @ref kernels::ClTransposedConvolutionKernel + */ +class ClTransposedConvolution : public IClOperator +{ +public: + /** Default constructor */ + ClTransposedConvolution() = default; + /** Default Destructor */ + ~ClTransposedConvolution() = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + ClTransposedConvolution(const ClTransposedConvolution &) = delete; + /** Default move constructor */ + ClTransposedConvolution(ClTransposedConvolution &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + ClTransposedConvolution &operator=(const ClTransposedConvolution &) = delete; + /** Default move assignment operator */ + ClTransposedConvolution &operator=(ClTransposedConvolution &&) = default; + + /** Set the input, weights, biases and output tensors. + * + * @note Only NHWC data layout is supported + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Input tensor info with dimensions [IFM, width, height, batch] + * Data types supported: F16/F32/QASYMM8/QASYMM8_SIGNED. + * @param[in] weights Weight tensor info with dimensions [IFM, width, height, OFM]. + * Data type supported: Same as @p input + * @param[in] biases (Optional) Biases tensor info. Biases are 1D tensor with dimension [OFM]. + * Data type supported: Should match @p input data type if floating point, otherwise S32. + * @param[out] output Output tensor info with dimensions [OFM, width, height, batch] + * The 1st dimension must be equal to the 4th dimension of the @p weights tensor. + * Data types supported: Same as @p input. + * @param[in] deconv_info Contains padding and stride information described in @ref PadStrideInfo.
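+ *
+ * A minimal NHWC configure sketch (editor's illustration, not part of this patch; the shapes follow
+ * the [IFM, width, height, batch] / [IFM, width, height, OFM] layout described above and are
+ * otherwise arbitrary):
+ * @code
+ * TensorInfo input_info(TensorShape(16U, 8U, 8U, 1U), 1, DataType::F32);
+ * TensorInfo weights_info(TensorShape(16U, 3U, 3U, 4U), 1, DataType::F32);
+ * TensorInfo biases_info(TensorShape(4U), 1, DataType::F32);
+ * TensorInfo output_info(TensorShape(4U, 8U, 8U, 1U), 1, DataType::F32);
+ * input_info.set_data_layout(DataLayout::NHWC);
+ * weights_info.set_data_layout(DataLayout::NHWC);
+ * output_info.set_data_layout(DataLayout::NHWC);
+ * PadStrideInfo deconv_info(1, 1, 1, 1); // stride 1, pad 1: keeps the 8x8 spatial size with a 3x3 kernel
+ *
+ * ClTransposedConvolution deconv;
+ * deconv.configure(CLKernelLibrary::get().get_compile_context(), &input_info, &weights_info,
+ *                  &biases_info, &output_info, deconv_info);
+ * @endcode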
+ * + */ + void configure(const CLCompileContext &compile_context, + const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *output, + const PadStrideInfo &deconv_info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to ClTransposedConvolution::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &deconv_info); + + // Inherited method overridden + void run(ITensorPack &tensors) override; + +private: + std::unique_ptr<IClKernel> _transposed_conv_kernel{nullptr}; +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_TRANSPOSED_CONVOLUTION_H */ diff --git a/src/gpu/cl/operators/ClWinogradConv2d.cpp b/src/gpu/cl/operators/ClWinogradConv2d.cpp new file mode 100644 index 0000000000..8ec96b247e --- /dev/null +++ b/src/gpu/cl/operators/ClWinogradConv2d.cpp @@ -0,0 +1,328 @@ +/* + * Copyright (c) 2018-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/gpu/cl/operators/ClWinogradConv2d.h" + +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/experimental/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h" +#include "src/gpu/cl/kernels/ClWinogradInputTransformKernel.h" +#include "src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h" +#include "src/gpu/cl/utils/ClAuxTensorHandler.h" +#include "support/Cast.h" + +using namespace arm_compute::experimental; + +namespace arm_compute +{ +namespace opencl +{ +namespace +{ +Size2D winograd_output_tile(const Size2D &input_dims, const Size2D &kernel_dims, DataLayout data_layout) +{ + Size2D output_tile = Size2D{}; + + const unsigned int kernel_max_dim = std::max(kernel_dims.width, kernel_dims.height); + + // Check if the input spatial dimensions are smaller than 4 + const bool is_input_lt4_nchw = + (input_dims.width <= 4 && input_dims.height <= 4) && (data_layout == DataLayout::NCHW); + + if (kernel_max_dim == 3U) + { + if (kernel_dims == Size2D(3U, 3U)) + { + output_tile = is_input_lt4_nchw ? Size2D(2U, 2U) : Size2D(4U, 4U); + } + else if (kernel_dims == Size2D(3U, 1U)) + { + output_tile = is_input_lt4_nchw ? Size2D(2U, 1U) : Size2D(4U, 1U); + } + else + { + output_tile = is_input_lt4_nchw ? Size2D(1U, 2U) : Size2D(1U, 4U); + } + } + else if (kernel_max_dim == 5U) + { + output_tile = Size2D(kernel_dims.width == 1 ? 1U : 4U, kernel_dims.height == 1 ? 1U : 4U); + } + else if (kernel_max_dim == 7U) + { + output_tile = Size2D(kernel_dims.width == 1 ? 1U : 2U, kernel_dims.height == 1 ? 
1U : 2U); + } + + return output_tile; +} + +bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_size) +{ + // Check if we want to configure a Winograd configuration which requires fast math + using WinogradConfiguration = std::pair<std::pair<int, int>, std::pair<int, int>>; + + std::vector<WinogradConfiguration> fast_math_winograd = { + WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5)), + WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(7, 7))}; + + auto p = std::make_pair(std::pair<int, int>(output_tile.width, output_tile.height), + std::pair<int, int>(kernel_size.width, kernel_size.height)); + + return std::find(fast_math_winograd.begin(), fast_math_winograd.end(), p) != fast_math_winograd.end(); +} + +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) +{ + // Get indeces for the width and height + const size_t idx_width = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH); + const size_t idx_height = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT); + + // Input shape, kernel size and output tile + const Size2D input_dims = Size2D(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height]); + const Size2D kernel_size = Size2D(weights->tensor_shape()[idx_width], weights->tensor_shape()[idx_height]); + const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, src->data_layout()); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + ((conv_info.pad_left() > (kernel_size.x() / 2u)) || (conv_info.pad_right() > (kernel_size.x() / 2u))), + "Winograd only supports padding up to half kernel size"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + ((conv_info.pad_top() > (kernel_size.y() / 2u)) || (conv_info.pad_bottom() > (kernel_size.y() / 2u))), + "Winograd only supports padding up to half kernel size"); + + // Check if the Winograd configuration requires fast math + if (!enable_fast_math) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + src, 1, DataType::F32); //disable winograd for fp16 if fast math is false. 
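+ // check_support_fast_math() above flags the two Winograd configurations that the library only
+ // enables in fast-math mode: F(4x4, 5x5) and F(2x2, 7x7). For example, a 5x5 kernel on a large
+ // input selects a 4x4 output tile in winograd_output_tile(), so without enable_fast_math the
+ // next check rejects that configuration.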
+
+Status validate_arguments(const ITensorInfo *src,
+                          const ITensorInfo *weights,
+                          const ITensorInfo *biases,
+                          const ITensorInfo *dst,
+                          const PadStrideInfo &conv_info,
+                          const ActivationLayerInfo &act_info,
+                          bool enable_fast_math)
+{
+    // Get indices for the width and height
+    const size_t idx_width  = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_height = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT);
+
+    // Input shape, kernel size and output tile
+    const Size2D input_dims  = Size2D(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height]);
+    const Size2D kernel_size = Size2D(weights->tensor_shape()[idx_width], weights->tensor_shape()[idx_height]);
+    const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, src->data_layout());
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        ((conv_info.pad_left() > (kernel_size.x() / 2u)) || (conv_info.pad_right() > (kernel_size.x() / 2u))),
+        "Winograd only supports padding up to half kernel size");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        ((conv_info.pad_top() > (kernel_size.y() / 2u)) || (conv_info.pad_bottom() > (kernel_size.y() / 2u))),
+        "Winograd only supports padding up to half kernel size");
+
+    // Check if the Winograd configuration requires fast math
+    if (!enable_fast_math)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+            src, 1, DataType::F32); // Winograd is disabled for FP16 unless fast math is enabled
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size),
+                                        "This Winograd configuration requires enable_fast_math=true");
+    }
+
+    const WinogradInfo winograd_info =
+        WinogradInfo(output_tile, kernel_size, input_dims, conv_info, src->data_layout());
+
+    // Validate input transform
+    const TensorShape input0_shape =
+        misc::shape_calculator::compute_winograd_input_transform_shape(*src, winograd_info);
+    const TensorInfo input0 = src->clone()->set_tensor_shape(input0_shape);
+    ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWinogradInputTransformKernel::validate(src, &input0, winograd_info));
+
+    // Validate filter transform
+    const TensorShape input1_shape =
+        misc::shape_calculator::compute_winograd_filter_transform_shape(*weights, winograd_info);
+    const TensorInfo input1 = weights->clone()->set_tensor_shape(input1_shape);
+    ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWinogradFilterTransformKernel::validate(weights, &input1, winograd_info));
+
+    // Validate batched matrix multiply
+    TensorShape batched_mm_output_shape = input0.tensor_shape();
+    batched_mm_output_shape[0]          = input1.tensor_shape()[0];
+    const TensorInfo batched_mm_output  = input0.clone()->set_tensor_shape(batched_mm_output_shape);
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        ClGemm::validate(&input0, &input1, nullptr, &batched_mm_output, 1.0f, 0.0f,
+                         GEMMInfo(false, false, true /* Reshape weights only for the first run */, 0, false, false,
+                                  GEMMLowpOutputStageInfo(), (src->data_type() == DataType::F16))));
+
+    // Validate output transform
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        kernels::ClWinogradOutputTransformKernel::validate(&batched_mm_output, biases, dst, winograd_info, act_info));
+    return Status{};
+}
+
+} // namespace
+
+ClWinogradConv2d::ClWinogradConv2d()
+    : _batched_mm(),
+      _input_transform(std::make_unique<kernels::ClWinogradInputTransformKernel>()),
+      _filter_transform(std::make_unique<kernels::ClWinogradFilterTransformKernel>()),
+      _output_transform(std::make_unique<kernels::ClWinogradOutputTransformKernel>()),
+      _border_handler(),
+      _input0(),
+      _input1(),
+      _batched_mm_output(),
+      _is_prepared(false),
+      _aux_mem()
+{
+}
+
+ClWinogradConv2d::~ClWinogradConv2d() = default;
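+
+// Editorial note on the auxiliary-tensor scheme used by configure() below: the
+// function only shapes TensorInfos; the buffers for the transformed input (slot 2),
+// the transformed weights (slot 3) and the GEMM output (slot 4) are advertised
+// through _aux_mem and bound by the caller at run()/prepare() time via
+// CLAuxTensorHandler. The transformed weights stay Persistent unless ClGemm itself
+// retains persistent memory, in which case they are only needed during prepare().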
+
+void ClWinogradConv2d::configure(const ClCompileContext &compile_context,
+                                 ITensorInfo *src,
+                                 ITensorInfo *weights,
+                                 ITensorInfo *biases,
+                                 ITensorInfo *dst,
+                                 const PadStrideInfo &conv_info,
+                                 const ActivationLayerInfo &act_info,
+                                 bool enable_fast_math)
+{
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, biases, dst, conv_info, act_info, enable_fast_math));
+    ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, act_info, enable_fast_math);
+
+    // Get indices for the width and height
+    const size_t idx_width  = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_height = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT);
+
+    // Input shape, kernel size and output tile
+    const Size2D input_dims  = Size2D(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height]);
+    const Size2D kernel_size = Size2D(weights->tensor_shape()[idx_width], weights->tensor_shape()[idx_height]);
+    const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, src->data_layout());
+
+    // Check if the Winograd configuration requires fast math
+    if (!enable_fast_math)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+            src, 1, DataType::F32); // Winograd is disabled for FP16 unless fast math is enabled
+        ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size),
+                                 "This Winograd configuration requires enable_fast_math=true");
+    }
+    const WinogradInfo winograd_info =
+        WinogradInfo(output_tile, kernel_size, input_dims, conv_info, src->data_layout());
+
+    _is_prepared = false;
+
+    // Configure input transform
+    _input_transform->configure(compile_context, src, &_input0, winograd_info);
+    _border_handler.configure(compile_context, src, _input_transform->border_size(), BorderMode::CONSTANT,
+                              PixelValue());
+
+    // Configure filter transform
+    _filter_transform->configure(compile_context, weights, &_input1, winograd_info);
+
+    // Configure batched matrix multiply
+    _batched_mm.configure(compile_context, &_input0, &_input1, nullptr, &_batched_mm_output, 1.0f, 0.0f,
+                          GEMMInfo(false, false, true /* Reshape weights only for the first run */, 0, false, false,
+                                   GEMMLowpOutputStageInfo(), (src->data_type() == DataType::F16)));
+
+    // Configure output transform
+    _output_transform->set_target(CLScheduler::get().target());
+    _output_transform->configure(compile_context, &_batched_mm_output, biases, dst, winograd_info, act_info);
+
+    // Keep the transformed weights only for prepare() when ClGemm already holds its own
+    // persistent (reshaped) copy; otherwise they must persist across runs
+    _aux_mem = _batched_mm.workspace();
+    const MemoryLifetime wino_wei_lifetime =
+        std::any_of(std::begin(_aux_mem), std::end(_aux_mem),
+                    [](const auto &r) { return (r.lifetime == MemoryLifetime::Persistent) && (r.size > 0); })
+            ? MemoryLifetime::Prepare
+            : MemoryLifetime::Persistent;
+    _aux_mem.push_back(MemoryInfo(offset_int_vec(2), MemoryLifetime::Temporary, _input0.total_size()));
+    _aux_mem.push_back(MemoryInfo(offset_int_vec(3), wino_wei_lifetime, _input1.total_size()));
+    _aux_mem.push_back(MemoryInfo(offset_int_vec(4), MemoryLifetime::Temporary, _batched_mm_output.total_size()));
+}
+
+Status ClWinogradConv2d::validate(const ITensorInfo *src,
+                                  const ITensorInfo *weights,
+                                  const ITensorInfo *biases,
+                                  const ITensorInfo *dst,
+                                  const PadStrideInfo &conv_info,
+                                  const ActivationLayerInfo &act_info,
+                                  bool enable_fast_math)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info, act_info, enable_fast_math));
+    return Status{};
+}
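+
+// Editorial note on the execution pipeline below: run() imports the workspace tensors
+// from the pack, makes sure prepare() has transformed the weights exactly once, then
+// chains input transform -> batched GEMM -> output transform. When ClGemm holds its own
+// pre-reshaped weights (is_gemm_reshaped), ACL_SRC_1 is dropped from the GEMM pack so
+// the stale Winograd-domain weights are not re-bound on every run.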
+
+void ClWinogradConv2d::run(ITensorPack &tensors)
+{
+    const bool is_gemm_reshaped = _aux_mem[3].lifetime == MemoryLifetime::Prepare;
+
+    auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    auto biases =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    CLAuxTensorHandler input0(offset_int_vec(2), _input0, tensors, true);
+    CLAuxTensorHandler input1(offset_int_vec(3), _input1, tensors, true, is_gemm_reshaped);
+    CLAuxTensorHandler batched_mm_output(offset_int_vec(4), _batched_mm_output, tensors, true);
+
+    prepare(tensors);
+
+    // Run input transform
+    ITensorPack pack_it{
+        {TensorType::ACL_SRC, src},
+        {TensorType::ACL_DST, input0.get()},
+    };
+    CLScheduler::get().enqueue_op(_border_handler, pack_it, false);
+    CLScheduler::get().enqueue_op(*_input_transform, pack_it, false);
+
+    // Run batched matrix multiplication
+    ITensorPack pack_mm = tensors;
+    pack_mm.add_const_tensor(TensorType::ACL_SRC_0, input0.get());
+    pack_mm.add_tensor(TensorType::ACL_DST, batched_mm_output.get());
+    is_gemm_reshaped ? pack_mm.remove_tensor(TensorType::ACL_SRC_1)
+                     : pack_mm.add_const_tensor(TensorType::ACL_SRC_1, input1.get());
+    _batched_mm.run(pack_mm);
+
+    // Run output transform
+    ITensorPack pack_ot{
+        {TensorType::ACL_SRC_0, batched_mm_output.get()},
+        {TensorType::ACL_SRC_1, biases},
+        {TensorType::ACL_DST, dst},
+    };
+    CLScheduler::get().enqueue_op(*_output_transform, pack_ot);
+}
+
+void ClWinogradConv2d::prepare(ITensorPack &tensors)
+{
+    if (!_is_prepared)
+    {
+        auto weights =
+            utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+        ICLTensor *in1_aux = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(offset_int_vec(3)));
+
+        CLAuxTensorHandler input1(_input1, *in1_aux);
+        ITensorPack pack_ft{
+            {TensorType::ACL_SRC, weights},
+            {TensorType::ACL_DST, input1.get()},
+        };
+        // Run filter transform and mark original weights as unused
+        CLScheduler::get().enqueue_op(*_filter_transform, pack_ft, false);
+        weights->mark_as_unused();
+
+        // Prepare GEMM and release reshaped weights if marked unused by ClGemm
+        ITensorPack mm_prepare_pack = tensors;
+        mm_prepare_pack.add_tensor(TensorType::ACL_SRC_1, input1.get());
+        _batched_mm.prepare(mm_prepare_pack);
+
+        CLScheduler::get().queue().finish();
+        _is_prepared = true;
+    }
+}
+
+experimental::MemoryRequirements ClWinogradConv2d::workspace() const
+{
+    return _aux_mem;
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClWinogradConv2d.h b/src/gpu/cl/operators/ClWinogradConv2d.h
new file mode 100644
index 0000000000..54ec1a1737
--- /dev/null
+++ b/src/gpu/cl/operators/ClWinogradConv2d.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_WINOGRADCONV2D_H
+#define ARM_COMPUTE_CL_WINOGRADCONV2D_H
+
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+#include "src/gpu/cl/operators/ClGemm.h"
+
+namespace arm_compute
+{
+class CLCompileContext;
+class ITensorInfo;
+namespace opencl
+{
+namespace kernels
+{
+class ClWinogradInputTransformKernel;
+class ClWinogradFilterTransformKernel;
+class ClWinogradOutputTransformKernel;
+} // namespace kernels
+/** Basic function to execute Winograd-based convolution on OpenCL. This function calls the following OpenCL functions/kernels:
+ *
+ * -# @ref kernels::ClWinogradInputTransformKernel
+ * -# @ref kernels::ClWinogradFilterTransformKernel (only once)
+ * -# @ref ClGemm
+ * -# @ref kernels::ClWinogradOutputTransformKernel
+ *
+ */
+class ClWinogradConv2d : public IClOperator
+{
+public:
+    /** Default constructor */
+    ClWinogradConv2d();
+    /** Default destructor */
+    ~ClWinogradConv2d();
+    /** Prevent instances of this class from being copied (as this class contains pointers) */
+    ClWinogradConv2d(const ClWinogradConv2d &) = delete;
+    /** Default move constructor */
+    ClWinogradConv2d(ClWinogradConv2d &&) = default;
+    /** Prevent instances of this class from being copied (as this class contains pointers) */
+    ClWinogradConv2d &operator=(const ClWinogradConv2d &) = delete;
+    /** Default move assignment operator */
+    ClWinogradConv2d &operator=(ClWinogradConv2d &&) = default;
+    /** Set the input and output tensors.
+     *
+     * Valid data layouts:
+     * - NHWC
+     * - NCHW
+     *
+     * Valid data type configurations:
+     * |src0           |src1           |src2   |dst            |
+     * |:--------------|:--------------|:------|:--------------|
+     * |F16            |F16            |F16    |F16            |
+     * |F32            |F32            |F32    |F32            |
+     *
+     * @note This function only works with 3x3, 3x1, 1x3, 5x5, 5x1, 1x5, 7x1 and 1x7 kernels along with unit strides for both NCHW and NHWC data layouts
+     * @note Some Winograd configurations (e.g. F(4x4, 5x5)) are supported only with enable_fast_math = true
+     *
+     * @param[in]  compile_context  The compile context to be used.
+     * @param[in]  src              Source tensor info. 3 lower dimensions represent a single input [width, height, IFM],
+     *                              while every optional dimension from 4 and above represents a batch of inputs.
+     *                              Data types supported: F16/F32.
+     * @param[in]  weights          Weights tensor info. Weights are a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p src.
+     * @param[in]  biases           Biases tensor info. Shared biases supported. Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p src.
+     * @param[out] dst              Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batches of outputs.
+     *                              Data types supported: Same as @p src.
+     * @param[in]  conv_info        Contains padding and stride information described in @ref PadStrideInfo.
+     * @param[in]  act_info         (Optional) Activation layer information in case of a fused activation.
+     * @param[in]  enable_fast_math (Optional) Enable fast math computation. If set, the function may dispatch the fastest implementation
+     *                              available, which can introduce a drop in accuracy. Default is false.
+     */
+    void configure(const ClCompileContext &compile_context,
+                   ITensorInfo *src,
+                   ITensorInfo *weights,
+                   ITensorInfo *biases,
+                   ITensorInfo *dst,
+                   const PadStrideInfo &conv_info,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+                   bool enable_fast_math = false);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to ClWinogradConv2d::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src,
+                           const ITensorInfo *weights,
+                           const ITensorInfo *biases,
+                           const ITensorInfo *dst,
+                           const PadStrideInfo &conv_info,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+                           bool enable_fast_math = false);
+
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
+    void prepare(ITensorPack &tensors) override;
+    experimental::MemoryRequirements workspace() const override;
+
+private:
+    ClGemm                                                    _batched_mm;
+    std::unique_ptr<kernels::ClWinogradInputTransformKernel>  _input_transform;
+    std::unique_ptr<kernels::ClWinogradFilterTransformKernel> _filter_transform;
+    std::unique_ptr<kernels::ClWinogradOutputTransformKernel> _output_transform;
+    CLFillBorderKernel                                        _border_handler;
+    TensorInfo                                                _input0;
+    TensorInfo                                                _input1;
+    TensorInfo                                                _batched_mm_output;
+    bool                                                      _is_prepared;
+    experimental::MemoryRequirements                          _aux_mem{};
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_WINOGRADCONV2D_H */
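
For orientation, a minimal usage sketch of the operator introduced by this patch (editorial addition; the function name, the example shapes and the elided workspace wiring are illustrative assumptions, not in-tree code):

    #include "arm_compute/core/CL/CLKernelLibrary.h"
    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "src/gpu/cl/operators/ClWinogradConv2d.h"

    using namespace arm_compute;

    void winograd_conv_example()
    {
        CLScheduler::get().default_init();

        // F32 NCHW: 56x56x64 input, 64 filters of 3x3x64, stride 1, pad 1 -> 56x56x64 output
        TensorInfo src_info(TensorShape(56U, 56U, 64U), 1, DataType::F32);
        TensorInfo wei_info(TensorShape(3U, 3U, 64U, 64U), 1, DataType::F32);
        TensorInfo bia_info(TensorShape(64U), 1, DataType::F32);
        TensorInfo dst_info(TensorShape(56U, 56U, 64U), 1, DataType::F32);
        const PadStrideInfo conv_info(1, 1, 1, 1);

        // Validate before configuring, as the gpu::opencl context does for ClActivation
        if (!bool(opencl::ClWinogradConv2d::validate(&src_info, &wei_info, &bia_info, &dst_info, conv_info)))
        {
            return; // configuration not supported
        }

        opencl::ClWinogradConv2d conv;
        conv.configure(CLKernelLibrary::get().get_compile_context(), &src_info, &wei_info, &bia_info, &dst_info,
                       conv_info);

        // Back the infos with real CL tensors
        CLTensor src, weights, biases, dst;
        src.allocator()->init(src_info);
        weights.allocator()->init(wei_info);
        biases.allocator()->init(bia_info);
        dst.allocator()->init(dst_info);
        src.allocator()->allocate();
        weights.allocator()->allocate();
        biases.allocator()->allocate();
        dst.allocator()->allocate();

        // run() calls prepare() internally on first use. The auxiliary tensors described
        // by conv.workspace() (slots 2, 3, 4) would also be allocated and added to the
        // pack by a real memory manager; that wiring is elided here.
        ITensorPack pack;
        pack.add_const_tensor(TensorType::ACL_SRC_0, &src);
        pack.add_const_tensor(TensorType::ACL_SRC_1, &weights);
        pack.add_const_tensor(TensorType::ACL_SRC_2, &biases);
        pack.add_tensor(TensorType::ACL_DST, &dst);
        conv.run(pack);
    }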