From 404462af4ca002ece819161a03a4bdb19a87abf2 Mon Sep 17 00:00:00 2001 From: Ramy Elgammal Date: Tue, 8 Nov 2022 02:14:46 +0000 Subject: Adding GpuAdd to dynamic fusion operators - Provide support for Add operator - Auto initialize the destination tensor before testing fusion in conv2d and elementwise binary ops. Resolves: COMPMID-5518 Signed-off-by: Ramy Elgammal Change-Id: Ibd815020f02b57f88eea7c2921bdcf98605d99c5 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8617 Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins Reviewed-by: Gunes Bayir Benchmark: Arm Jenkins --- .../components/cl/ClComponentElementwiseBinary.cpp | 112 ++++++++ .../components/cl/ClComponentElementwiseBinary.h | 118 ++++++++ src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp | 74 +++++ .../sketch/gpu/operators/GpuConv2d.cpp | 29 +- .../internal/GpuElementwiseBinaryCommon.cpp | 200 +++++++++++++ .../internal/GpuElementwiseBinaryCommon.h | 113 ++++++++ .../template_writer/IGpuTemplateComponentWriter.h | 3 + .../template_writer/cl/ClTemplateDirectConv2d.cpp | 4 +- .../cl/ClTemplateElementwiseBinary.cpp | 315 +++++++++++++++++++++ .../cl/ClTemplateElementwiseBinary.h | 117 ++++++++ .../gpu/template_writer/cl/ClTemplateStore.cpp | 1 - 11 files changed, 1068 insertions(+), 18 deletions(-) create mode 100644 src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp create mode 100644 src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h create mode 100644 src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp create mode 100644 src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.cpp create mode 100644 src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h create mode 100644 src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp create mode 100644 src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.h (limited to 'src/dynamic_fusion/sketch/gpu') diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp new file mode 100644 index 0000000000..a17d835ac6 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "ClComponentElementwiseBinary.h" + +#include "arm_compute/core/Validate.h" +#include "arm_compute/dynamic_fusion/sketch/OperatorAttributes.h" +#include "src/core/CL/CLValidate.h" +#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +namespace +{ +std::set supported_ops +{ + ElementwiseBinaryCommonAttributes::ElementwiseOp::ADD +}; +} + +Status ClComponentElementwiseBinary::validate(const ArgumentPack &tensors, const ElementwiseBinaryCommonAttributes &attributes) +{ + const auto lhs = tensors.get_const_tensor(TensorType::ACL_SRC_0); + const auto rhs = tensors.get_const_tensor(TensorType::ACL_SRC_1); + const auto dst = tensors.get_const_tensor(TensorType::ACL_DST_0); + + // Check operator type + ARM_COMPUTE_RETURN_ERROR_ON_MSG(supported_ops.find(attributes.operation()) == supported_ops.end(), "Provided Elementwise operation not supported."); + + // Check validity + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst); + + //Check data type for different elementwise operators + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16, DataType::S32, DataType::S16, DataType::U8); + + const bool rhs_in_place = (rhs == dst); + const bool lhs_in_place = (lhs == dst); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_in_place && lhs_in_place, "Both LHS and RHS cannot be in-place at same time for any elementwise operation."); + + // dst shape is correct + const TensorShape out_shape = TensorShape::broadcast_shape(lhs->tensor_shape(), rhs->tensor_shape()); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((!rhs_in_place && !lhs_in_place) && detail::have_different_dimensions(lhs->tensor_shape(), dst->tensor_shape(), 0), + "Only the rhs operand can be broadcast to match the accumulator's (lhs) shape"); + // Matching data type + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst); + + // Matching data layout + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(lhs, rhs); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(lhs, dst); + + // Batching case not supported yet + const size_t idx_batch = get_data_layout_dimension_index(lhs->data_layout(), DataLayoutDimension::BATCHES); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((lhs->tensor_shape()[idx_batch] != 1) || (rhs->tensor_shape()[idx_batch] != 1) || (dst->tensor_shape()[idx_batch] != 1), "Batching case not supported yet"); + + // All tensor infos are initialized + ARM_COMPUTE_RETURN_ERROR_ON(lhs->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON(rhs->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0); + + // Device requirements are met + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(lhs); + + return Status{}; +} + +ClComponentElementwiseBinary::ClComponentElementwiseBinary( + ComponentId id, + const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes) + : IGpuKernelComponent{ id, properties, tensors }, + _component_writer{ std::make_unique(id, tensors, attributes) } +{ +} +ClComponentElementwiseBinary::~ClComponentElementwiseBinary() +{ +} +const IGpuTemplateComponentWriter 
*ClComponentElementwiseBinary::template_writer() const +{ + return _component_writer.get(); +} +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h new file mode 100644 index 0000000000..02e61019f4 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTELEMENTWISEBINARY +#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTELEMENTWISEBINARY + +#include "arm_compute/core/Error.h" +#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" +#include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h" + +namespace arm_compute +{ +/** Forward declaration */ +class ITensorInfo; +namespace experimental +{ +namespace dynamic_fusion +{ +/** Forward declaration */ +template +class ArgumentPack; + +/** Forward declaration */ +class ClTemplateElementwiseBinary; + +class ClComponentElementwiseBinary final : public IGpuKernelComponent +{ +public: + /** Attributes are a set of backend-agnostic parameters that define what a component does */ + using Attributes = ElementwiseBinaryCommonAttributes; + +public: + /** Validate the component + * + * @param[in,out] tensors Tensor arguments to the component + * @param[in] attributes Component attributes + * + * @return Status Validation results + * + * Tensor argument names: + * - ACL_SRC_0: lhs + * - ACL_SRC_1: rhs + * - ACL_DST_0: dst + * + * Tensor argument constness: + * - ACL_SRC_0: Const + * - ACL_SRC_1: Const + * - ACL_DST_0: Const + * + * Valid data layouts: + * - All + * + * Valid data type configurations (for DIV FP32/FP16/S32 supported, for POWER only FP32/FP16 supported): + * |ACL_SRC_0 |ACL_SRC_1 |ACL_DST_0 | + * |:--------------|:--------------|:--------------| + * |F16 |F16 |F16 | + * |F32 |F32 |F32 | + * |S32 |S32 |S32 | + * |S16 |S16 |S16 | + * |U8 |U8 |U8 | + */ + static Status validate(const ArgumentPack &tensors, const ElementwiseBinaryCommonAttributes &attributes); + + /** Constructor + * + * Similar to @ref ClComponentElementwiseBinary::validate() + */ + ClComponentElementwiseBinary( + ComponentId id, + const Properties 
&properties, + const ArgumentPack &tensors, + const Attributes &attributes); + + /** Destructor */ + ~ClComponentElementwiseBinary() override; + /** Prevent instances of this class from being copy constructed */ + ClComponentElementwiseBinary(const ClComponentElementwiseBinary &component) = delete; + /** Prevent instances of this class from being copied */ + ClComponentElementwiseBinary &operator=(const ClComponentElementwiseBinary &component) = delete; + /** Allow instances of this class to be move constructed */ + ClComponentElementwiseBinary(ClComponentElementwiseBinary &&component) = default; + /** Allow instances of this class to be moved */ + ClComponentElementwiseBinary &operator=(ClComponentElementwiseBinary &&component) = default; + /** Get template writer for the component */ + const IGpuTemplateComponentWriter *template_writer() const override; + /** Get component type */ + GpuComponentType type() const override + { + return GpuComponentType::Simple; + } + +private: + std::unique_ptr _component_writer; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTELEMENTWISEBINARY */ diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp new file mode 100644 index 0000000000..46033d842b --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h" + +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" +#include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h" + +#include "src/common/utils/Log.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +Status GpuAdd::validate_op(const GpuWorkloadSketch &sketch, + const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst) +{ + ElementwiseBinaryCommonAttributes common_attributes{}; + common_attributes.operation(ElementwiseBinaryCommonAttributes::ElementwiseOp::ADD); + return GpuElementwiseBinaryCommon::validate_op(sketch, lhs, rhs, dst, common_attributes); +} + +Status GpuAdd::is_supported_op(const GpuWorkloadContext &context, + const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst) +{ + ElementwiseBinaryCommonAttributes common_attributes{}; + common_attributes.operation(ElementwiseBinaryCommonAttributes::ElementwiseOp::ADD); + return GpuElementwiseBinaryCommon::is_supported_op(context, lhs, rhs, dst, common_attributes); +} + +void GpuAdd::create_op(GpuWorkloadSketch &sketch, + ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *dst) +{ + // Assert validation + ARM_COMPUTE_ERROR_THROW_ON(GpuAdd::validate_op(sketch, lhs, rhs, dst)); + ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst); + + // Set the elementwise operation to ADD then call the elementwise common create_op + ElementwiseBinaryCommonAttributes common_attributes{}; + common_attributes.operation(ElementwiseBinaryCommonAttributes::ElementwiseOp::ADD); + GpuElementwiseBinaryCommon::create_op(sketch, lhs, rhs, dst, common_attributes); +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp index 12aa4d1b9f..9cb4ee7815 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp @@ -23,18 +23,17 @@ */ #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h" -#include "arm_compute/core/CL/CLCompileContext.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/experimental/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/dynamic_fusion/sketch/ArgumentPack.h" #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" -#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h" #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h" #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" +#include "src/common/utils/Log.h" + namespace arm_compute { namespace experimental @@ -103,18 +102,6 @@ Status GpuConv2d::validate_op(const GpuWorkloadSketch &sketch, { ARM_COMPUTE_RETURN_ERROR_ON(!bia->has_valid_id()); } - - // Perform fusion test - // Pack tensor infos - ArgumentPack tensors; - tensors.add_const_tensor(ACL_SRC_0, src); - tensors.add_const_tensor(ACL_SRC_1, wei); - tensors.add_const_tensor(ACL_SRC_2, bia); - tensors.add_const_tensor(ACL_DST_0, dst); - const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!sketch.implementation().operator_group().try_add_operator(op), - "Operator fusion test failed. 
This operator cannot be fused into the workload"); - // Auto initialize dst tensor info TensorInfo dst_info_to_validate = *dst; const auto data_layout = src->data_layout(); @@ -128,6 +115,17 @@ Status GpuConv2d::validate_op(const GpuWorkloadSketch &sketch, auto_init_if_empty(dst_info_to_validate, src->clone()->set_tensor_shape(shape)); } + // Perform fusion test + // Pack tensor infos + ArgumentPack tensors; + tensors.add_const_tensor(ACL_SRC_0, src); + tensors.add_const_tensor(ACL_SRC_1, wei); + tensors.add_const_tensor(ACL_SRC_2, bia); + tensors.add_const_tensor(ACL_DST_0, &dst_info_to_validate); + const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!sketch.implementation().operator_group().try_add_operator(op), + "Operator fusion test failed. This operator cannot be fused into the workload"); + // Check support level // Data type ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); @@ -176,6 +174,7 @@ void GpuConv2d::create_op(GpuWorkloadSketch &sketch, ITensorInfo *dst, const Conv2dAttributes &attributes) { + ARM_COMPUTE_LOG_PARAMS(src, wei, bia, dst, attributes); // Assert validation ARM_COMPUTE_ERROR_THROW_ON(GpuConv2d::validate_op(sketch, src, wei, bia, dst, attributes)); ARM_COMPUTE_ERROR_ON_NULLPTR(src, wei, dst); diff --git a/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.cpp b/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.cpp new file mode 100644 index 0000000000..073924947c --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.cpp @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/dynamic_fusion/sketch/ArgumentPack.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +namespace +{ +void calculate_and_init_dst_if_empty(ITensorInfo *dst, const ITensorInfo *lhs, const ITensorInfo *rhs) +{ + if(dst->total_size() == 0U) + { + const std::pair broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*lhs, *rhs); + auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(broadcast_pair.first)); + } +} +GpuOperatorType operator_type = GpuOperatorType::Simple; +} + +ElementwiseBinaryCommonAttributes &ElementwiseBinaryCommonAttributes::operation(const ElementwiseBinaryCommonAttributes::ElementwiseOp &operation) +{ + _operation = operation; + return *this; +} + +ElementwiseBinaryCommonAttributes::ElementwiseOp ElementwiseBinaryCommonAttributes::operation() const +{ + return _operation; +} + +Status GpuElementwiseBinaryCommon::is_supported_op(const GpuWorkloadContext &context, + const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + const ElementwiseBinaryCommonAttributes &attributes) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst); + + // Auto initialize dst tensor info + TensorInfo dst_info_to_validate = *dst; + calculate_and_init_dst_if_empty(&dst_info_to_validate, lhs, rhs); + + // Check components + if(context.gpu_language() == GpuLanguage::OpenCL) + { + const auto cl_compile_ctx = context.cl_compile_context(); + ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr); + // Validate ElementwiseBinary Component + { + ArgumentPack arguments; + arguments.add_const_tensor(ACL_SRC_0, lhs); + arguments.add_const_tensor(ACL_SRC_1, rhs); + + // We needed to pass the original dst pointer for in-place detection, in case its shape is not empty + if(dst->tensor_shape().total_size() == 0) + { + arguments.add_const_tensor(ACL_DST_0, &dst_info_to_validate); + } + else + { + arguments.add_const_tensor(ACL_DST_0, dst); + } + ARM_COMPUTE_RETURN_ON_ERROR(ClComponentElementwiseBinary::validate(arguments, attributes)); + } + } + else + { + ARM_COMPUTE_RETURN_ERROR_MSG("Unimplemented Gpu language"); + } + + return Status{}; +} + +Status GpuElementwiseBinaryCommon::validate_op(const GpuWorkloadSketch &sketch, + const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + const ElementwiseBinaryCommonAttributes &attributes) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst); + ARM_COMPUTE_RETURN_ERROR_ON( + !lhs->has_valid_id() || !rhs->has_valid_id() || !dst->has_valid_id()); + + // Auto initialize dst tensor info + TensorInfo dst_info_to_validate = *dst; + calculate_and_init_dst_if_empty(&dst_info_to_validate, lhs, rhs); + + // Perform fusion test + // Pack tensor infos + ArgumentPack tensors; + tensors.add_const_tensor(ACL_SRC_0, lhs); + tensors.add_const_tensor(ACL_SRC_1, rhs); + tensors.add_const_tensor(ACL_DST_0, &dst_info_to_validate); + const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!sketch.implementation().operator_group().try_add_operator(op), + "Operator fusion test failed. 
This operator cannot be fused into the workload"); + + // Check if configuration is supported, and passing the original dst for in-place detection + return is_supported_op(*sketch.gpu_context(), lhs, rhs, dst, attributes); +} + +void GpuElementwiseBinaryCommon::create_op(GpuWorkloadSketch &sketch, + ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *dst, + const ElementwiseBinaryCommonAttributes &attributes) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); + const bool in_place = (lhs == dst) || (rhs == dst); + static TensorInfo in_place_dst; + in_place_dst = in_place ? sketch.create_tensor_info(*lhs) : TensorInfo{}; + + // Auto initialize dst tensor + calculate_and_init_dst_if_empty(dst, lhs, rhs); + + // Translate into components and add to component graph + auto &comp_graph = sketch.implementation().component_graph(); + + const auto sketch_ctx = sketch.implementation().context(); + + if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + { + const auto cl_compile_ctx = sketch_ctx->cl_compile_context(); + ARM_COMPUTE_ERROR_ON(cl_compile_ctx == nullptr); + + // Add ElementwiseBinary Component + { + auto properties = IGpuKernelComponent::Properties(); + properties.stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + + ArgumentPack arguments; + arguments.add_const_tensor(ACL_SRC_0, lhs); + arguments.add_const_tensor(ACL_SRC_1, rhs); + if(in_place) + { + arguments.add_const_tensor(ACL_DST_0, &in_place_dst); + } + else + { + arguments.add_const_tensor(ACL_DST_0, dst); + } + comp_graph.add_new_component(properties, arguments, attributes); + } + } + else + { + ARM_COMPUTE_ERROR("Unimplemented Gpu language"); + } + + // Set up fusion test by adding to the Operator Group + // Note this has to be performed after all the components have been successfully added to the component graph + + // Pack tensor infos + ArgumentPack tensors; + tensors.add_const_tensor(ACL_SRC_0, lhs); + tensors.add_const_tensor(ACL_SRC_1, rhs); + if(in_place) + { + tensors.add_const_tensor(ACL_DST_0, &in_place_dst); + } + else + { + tensors.add_tensor(ACL_DST_0, dst); + } + const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors); + sketch.implementation().operator_group().add_operator(op); +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h b/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h new file mode 100644 index 0000000000..ffae801e47 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_INTERNAL_GPUELEMENTWISEBINARYCOMMON +#define SRC_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_INTERNAL_GPUELEMENTWISEBINARYCOMMON + +#include "arm_compute/core/Error.h" +#include "arm_compute/dynamic_fusion/sketch/OperatorAttributes.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +class ElementwiseBinaryCommonAttributes +{ +public: + enum class ElementwiseOp + { + ADD, /**< (x + y) */ + SUB, /**< (x - y) */ + DIV, /**< (x / y) */ + MIN, /**< Min(x, y) */ + MAX, /**< Max(x, y) */ + SQUARED_DIFF, /**< (x - y)^2 */ + POWER, /**< x ^ y */ + PRELU, /**< y*x if x < 0, x otherwise */ + }; + /** Set operation*/ + ElementwiseBinaryCommonAttributes &operation(const ElementwiseBinaryCommonAttributes::ElementwiseOp &operation); + /** Get operation*/ + ElementwiseOp operation() const; + +private: + ElementwiseOp _operation; /**< Elementwise operation */ +}; + +/** Forward declaration */ +class GpuWorkloadContext; +class GpuWorkloadSketch; + +/** Operator interface. */ +class GpuElementwiseBinaryCommon final +{ +public: + /** Create an operator and fuse it into the workload sketch. + * @note If @ref validate_op() fails, the creation also fails and may throw an error. + * @note If @ref validate_op() fails, @p sketch remains unchanged and valid. + * + * Valid data type configurations are checked at the operator level i.e. GpuAdd::validate_op(), GpuSub::validate_op(), ... etc. + * + * Valid data layouts: + * - Any + * + * @param[in,out] sketch Workload sketch into which the operator will be fused + * @param[in] lhs Left hand side tensor info. Data types supported: U8/S16/S32/F16/F32. + * @param[in] rhs Right hand side tensor info. Data types supported: U8/S16/S32/F16/F32. + * @param[out] dst Destination tensor info. Data types supported: U8/S16/S32/F16/F32. If an uninitialized ITensorInfo is passed in, it will be auto-initialized + * @param[in] attributes ElementwiseBinaryCommonAttributes containing the operator type: ADD, SUB, DIV, ... etc. + */ + static void create_op(GpuWorkloadSketch &sketch, + ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *dst, + const ElementwiseBinaryCommonAttributes &attributes); + /** Check if the operator configuration is supported, irrespective of fusion + * Similar to @ref GpuElementwiseBinaryCommon::create_op() + * + * @param[in] context Workload context within which the operator is running + * @param[in] lhs Left hand side tensor info. Data types supported: U8/S16/S32/F16/F32. + * @param[in] rhs Right hand side tensor info. Data types supported: U8/S16/S32/F16/F32. + * @param[in] dst Destination tensor info. Data types supported: U8/S16/S32/F16/F32. If an uninitialized ITensorInfo is passed in, it will be auto-initialized + * @param[in] attributes ElementwiseBinaryCommonAttributes containing the operator type: ADD, SUB, DIV, ... etc. 
+ */ + static Status is_supported_op(const GpuWorkloadContext &context, + const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + const ElementwiseBinaryCommonAttributes &attributes); + /** Validate the operator and check if it can be fused into the workload sketch. + * Similar to @ref GpuElementwiseBinaryCommon::create_op() + */ + static Status validate_op(const GpuWorkloadSketch &sketch, + const ITensorInfo *rhs, + const ITensorInfo *lhs, + const ITensorInfo *dst, + const ElementwiseBinaryCommonAttributes &attributes); +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_INTERNAL_GPUELEMENTWISEBINARYCOMMON */ diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h b/src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h index c85ddf5a2c..328e942955 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h +++ b/src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h @@ -48,6 +48,9 @@ class IGpuTemplateComponentWriter public: using ComponentGroup = GpuKernelComponentGroup; + /**For now all kernel intermeditate/destination tensors are expected to be of type Tensor_4D_t_Buffer*/ + static constexpr GpuKernelArgumentInfo::Type common_tensor_type = GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer; + public: /** Constructor * diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp index 7ad7dd69f0..75e812af9f 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp @@ -240,7 +240,7 @@ void ClTemplateDirectConv2d::declare_variables(GpuKernelVariableTable &vtable, c } vtable.declare_variable( _dst, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + GpuKernelArgumentInfo(common_tensor_type), comp_group.is_intermediate_tensor(_dst), "dst"); } @@ -305,7 +305,7 @@ CLBuildOptions ClTemplateDirectConv2d::get_build_options(const ComponentGroup &c const unsigned int channel_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::CHANNEL); const DataType data_type = _src->data_type(); - /// NOTE: For now tile sizes (n0, m0, n0) are set by the execution window. This may change in the future + /// NOTE: For now tile sizes (n0, m0, k0) are set by the execution window. This may change in the future const auto root_window = comp_group.get_root_component()->template_writer()->get_window(); const unsigned int n0 = root_window.x().step(); const unsigned int m0 = root_window.y().step(); diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp new file mode 100644 index 0000000000..996bf15d01 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp @@ -0,0 +1,315 @@ +/* + * Copyright (c) 2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "ClTemplateElementwiseBinary.h" + +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h" + +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/helpers/WindowHelpers.h" + +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +constexpr unsigned int vector_size_byte_opencl = 16; + +ClTemplateElementwiseBinary::ClTemplateElementwiseBinary(ComponentId id, + const ArgumentPack &tensors, + const Attributes &attributes) + : IGpuTemplateComponentWriter{ id, tensors }, + _lhs{}, + _rhs{}, + _dst{}, + _attributes{ attributes } +{ + _lhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); + _rhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_1); + _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); + ARM_COMPUTE_ERROR_ON_NULLPTR(_lhs, _rhs, _dst); +} + +std::string ClTemplateElementwiseBinary::get_name() const +{ + return "elementwise_binary"; +} + +std::string ClTemplateElementwiseBinary::get_component_code(const ComponentGroup &comp_group) const +{ + ARM_COMPUTE_UNUSED(comp_group); + std::string code; + const bool is_broadcast = _lhs->tensor_shape() != _rhs->tensor_shape(); + const bool is_root = (comp_group.get_root_component()->id() == this->id()); + + if(is_root) + { + code = +R"_( + //------------------ START KERNEL {{meta_kernel_id}} ELTWISE_OP --------------------- +)_" + // IN_0(LHS) {{lhs}} + // IN_1(RHS) {{rhs}} + // OUT(dst, accum) {{dst}} + // dst = lhs + rhs (mix-precision, broadcast, boundary aware) +R"_( + TILE({{DATA_TYPE}}, M0, N0, {{dst}}); + TILE(uint, M0, 1, g_dst_indirect_y); + { + TILE({{DATA_TYPE}}, M0, N0, lhs_tile); + TILE({{DATA_TYPE}}, M0, N0, rhs_tile); +)_" + // Assuming un-collapsed window +R"_( + {{lhs}}_offset_first_element_in_bytes += g_ind_2 * {{lhs}}_stride_z; + {{rhs}}_offset_first_element_in_bytes += g_ind_2 * {{rhs}}_stride_z; + + T_LOAD({{DATA_TYPE}}, M0, N0, BUFFER, {{lhs}}, g_ind_0, g_ind_1, 1, {{lhs}}_stride_y, lhs_tile); + T_LOAD({{DATA_TYPE}}, {{rhs_m0}}, {{rhs_n0}}, BUFFER, {{rhs}}, {{rhs_start_ind_0}}, {{rhs_start_ind_1}}, 1, {{rhs}}_stride_y, rhs_tile); +)_"; + if(is_broadcast) + { + code += +R"_( + T_ELTWISE_BROADCAST_{{ELTWISE_OP}}_X({{DATA_TYPE}}, M0, N0, lhs_tile, rhs_tile, {{dst}}); +)_"; + } + else + { + code 
+= +R"_( + T_ELTWISE_{{ELTWISE_OP}}({{DATA_TYPE}}, M0, N0, lhs_tile, rhs_tile, {{dst}}); +)_"; + } + code += + // Calculate the destination indirect Y +R"_( + LOOP_UNROLLING(int, i, 0, 1, M0, + { + g_dst_indirect_y[i].v = (uint)min(g_ind_1 + i, (int)({{dst}}_w * {{dst}}_h) - 1); + g_dst_indirect_y[i].v += g_ind_2 * (int)({{dst}}_w * {{dst}}_h); + }) + } + //------------------ END KERNEL {{meta_kernel_id}} ELTWISE_OP --------------------- +)_"; + } + + else // non-root + { + code = +R"_( + //------------------ START KERNEL {{meta_kernel_id}} ELTWISE_OP --------------------- +)_" + // IN_0/Out(Accumulator) {{acc}} + // IN_1(Operand) {{operand}} + // acc = operand + acc (mix-precision, broadcast, boundary aware) +R"_( + { + TILE(DATA_TYPE, M0, N0, operand_tile); + T_LOAD({{DATA_TYPE}}, {{rhs_m0}}, {{rhs_n0}}, BUFFER, {{operand}}, {{rhs_start_ind_0}}, {{rhs_start_ind_1}}, 1, {{operand}}_stride_y, operand_tile); +)_"; + + if(is_broadcast) + { + code += +R"_( + T_ELTWISE_BROADCAST_{{ELTWISE_OP}}_X({{DATA_TYPE}}, M0, N0, {{acc}}, operand_tile, {{acc}}); +)_"; + } + else + { + code += +R"_( + T_ELTWISE_{{ELTWISE_OP}}({{DATA_TYPE}}, M0, N0, {{acc}}, operand_tile, {{acc}}); +)_"; + } + code += +R"_( + } + //------------------ END KERNEL {{meta_kernel_id}} ELTWISE_OP --------------------- +)_"; + } + + return code; +} + +void ClTemplateElementwiseBinary::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const +{ + vtable.declare_variable( + _lhs, + GpuKernelArgumentInfo(common_tensor_type), + comp_group.is_intermediate_tensor(_lhs), + "lhs"); + + vtable.declare_variable( + _rhs, + GpuKernelArgumentInfo(common_tensor_type), + comp_group.is_intermediate_tensor(_rhs), + "rhs"); + + vtable.declare_variable( + _dst, + GpuKernelArgumentInfo(common_tensor_type), + comp_group.is_intermediate_tensor(_dst), + "dst"); +} + +TagLUT ClTemplateElementwiseBinary::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const +{ + TagLUT lut{}; + const ITensorInfo *accumulator = _lhs; + const ITensorInfo *operand = _rhs; + + // Local build options + lut["meta_kernel_id"] = id(); + lut["DATA_TYPE"] = get_cl_type_from_data_type(_lhs->data_type()); + // Arguments and global shared variables + const bool is_root = (comp_group.get_root_component()->id() == this->id()); + if(is_root) + { + lut["lhs"] = vtable.get_variable(_lhs); + lut["rhs"] = vtable.get_variable(_rhs); + lut["dst"] = vtable.get_variable(_dst); + } + else + { + // Determine which tensor is the accumulator + if(comp_group.is_intermediate_tensor(_lhs)) + { + accumulator = _lhs; + operand = _rhs; + } + else if(comp_group.is_intermediate_tensor(_rhs)) + { + accumulator = _rhs; + operand = _lhs; + } + else + { + ARM_COMPUTE_ERROR("Invalid elementwise component linking"); + } + lut["acc"] = vtable.get_variable(accumulator); + lut["operand"] = vtable.get_variable(operand); + } + switch(_attributes.operation()) + { + case Attributes::ElementwiseOp::ADD: + lut["ELTWISE_OP"] = "ADD"; + break; + default: + ARM_COMPUTE_ERROR("Arithmetic Operation not supported"); + } + ARM_COMPUTE_ERROR_ON_MSG(detail::have_different_dimensions(accumulator->tensor_shape(), _dst->tensor_shape(), 0), "Only the operand can be broadcast to match the accumulator's shape"); + const bool is_broadcast = (operand->tensor_shape() != _dst->tensor_shape()); + + // Set broadcast parameters + // PRE: All tensors are broadcast-compatible + if(is_broadcast) + { + // Note that n0 maps to input tensor dimension 0, m0 maps to input dimensions 
1 and 2 because of our collapse strategy + if(operand->dimension(0) == 1U && operand->dimension(1) == 1U && operand->dimension(2) == 1U) // Broadcast in X, Y, Z: collapsed rhs win [M0xN0] = [1x1] + { + lut["rhs_m0"] = "1"; + lut["rhs_n0"] = "1"; + lut["rhs_start_ind_1"] = "0"; + lut["rhs_start_ind_0"] = "0"; + } + else if(operand->dimension(1) == 1U && operand->dimension(2) == 1U) // Broadcast in Y and Z: collapsed rhs win [M0xN0] = [1xN] + { + lut["rhs_m0"] = "1"; + lut["rhs_n0"] = "N0"; + lut["rhs_start_ind_1"] = "0"; + lut["rhs_start_ind_0"] = "g_ind_0"; + } + else + { + ARM_COMPUTE_ERROR("Only support rhs broadcasting in all X, Y, Z dimensions, or just in Y and Z dimensions"); + } + } + else + { + lut["rhs_m0"] = "M0"; + lut["rhs_n0"] = "N0"; + lut["rhs_start_ind_1"] = "g_ind_1"; + lut["rhs_start_ind_0"] = "g_ind_0"; + } + return lut; +} + +CLBuildOptions ClTemplateElementwiseBinary::get_build_options(const ComponentGroup &comp_group) const +{ + CLBuildOptions build_opts{}; + /// NOTE: For now tile sizes (n0, m0) are set by the execution window. This may change in the future + const auto root_window = comp_group.get_root_component()->template_writer()->get_window(); + const unsigned int n0 = root_window.x().step(); + const unsigned int m0 = root_window.y().step(); + const unsigned int partial_store_n0 = _dst->dimension(0) % n0; + + build_opts.add_option("-DM0=" + support::cpp11::to_string(m0)); + build_opts.add_option("-DN0=" + support::cpp11::to_string(n0)); + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(_lhs->data_type())); + build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0)); + + return build_opts; +} + +std::string ClTemplateElementwiseBinary::get_config_id() const +{ + std::string config_id{}; + config_id += lower_string(string_from_data_type(_dst->data_type())); + config_id += "_"; + config_id += support::cpp11::to_string(_dst->dimension(0)); + config_id += "_"; + config_id += support::cpp11::to_string(_dst->dimension(1)); + config_id += "_"; + config_id += lower_string(string_from_data_layout(_dst->data_layout())); + + return config_id; +} + +std::set ClTemplateElementwiseBinary::get_headers_list() const +{ + return std::set{ "helpers.h", "tile_helpers.h" }; +} + +Window ClTemplateElementwiseBinary::get_window() const +{ + ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized"); + + TensorShape output_shape = _dst->tensor_shape(); + // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) and upper dimensions unchanged + // This is in line with the collapsing convention used by operators like Conv2d + output_shape.collapse(2U, 1U); + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0)); + Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration)); + + return win; +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.h new file mode 100644 index 0000000000..e69150f3e7 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEELEMENTWISEBINARY +#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEELEMENTWISEBINARY + +#include "arm_compute/core/experimental/Types.h" +#include "arm_compute/dynamic_fusion/sketch/OperatorAttributes.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h" +#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h" +#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +class ClTemplateElementwiseBinary final : public IGpuTemplateComponentWriter +{ +public: + using Attributes = ClComponentElementwiseBinary::Attributes; + + /** Constructor + * + * Similar to @ref ClComponentElementwiseBinary::validate() + * + * @param[in] id Component id + * @param[in] tensors Tensor arguments to the components + * @param[in] attributes Component attributes + */ + ClTemplateElementwiseBinary(ComponentId id, + const ArgumentPack &tensors, + const Attributes &attributes); + /** Prevent instances of this class from being copy constructed */ + ClTemplateElementwiseBinary(const ClTemplateElementwiseBinary &elementwise) = delete; + /** Prevent instances of this class from being copied */ + ClTemplateElementwiseBinary &operator=(const ClTemplateElementwiseBinary &elementwise) = delete; + /** Allow instances of this class to be move constructed */ + ClTemplateElementwiseBinary(ClTemplateElementwiseBinary &&elementwise) = default; + /** Allow instances of this class to be moved */ + ClTemplateElementwiseBinary &operator=(ClTemplateElementwiseBinary &&elementwise) = default; + + /** Generate kernel component name */ + std::string get_name() const override; + + /** Generate kernel component code template + * + * @param[in] comp_group Component group of which the component is a part of + * + * @return std::string Component code + */ + std::string get_component_code(const ComponentGroup &comp_group) const override; + + /** Declare all variables used by the component in the @p vtable + * + * @param[out] vtable Variable table + * @param[in] comp_group Component group of which the component is a part of + */ + void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override; + + /** Generate the tag look-up table 
used to instantiate the component code.
+     *
+     * @param[in] vtable     Variable table
+     * @param[in] comp_group Component group of which the component is a part of
+     *
+     * @return TagLUT Tag lookup table
+     */
+    TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override;
+
+    /** Generate the build options used in the component
+     *
+     * @param[in] comp_group Component group of which the component is a part of
+     *
+     * @return CLBuildOptions Build options
+     */
+    CLBuildOptions get_build_options(const ComponentGroup &comp_group) const override;
+
+    /** Generate the component config id string used for tuning */
+    std::string get_config_id() const override;
+
+    /** Generate the header list used in the component */
+    std::set<std::string> get_headers_list() const override;
+
+    /** Generate the execution window for the component */
+    Window get_window() const override;
+
+private:
+    const ITensorInfo *_lhs;
+    const ITensorInfo *_rhs;
+    const ITensorInfo *_dst;
+    Attributes         _attributes;
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEELEMENTWISEBINARY */
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp
index bffb467ebb..e4b662b3a8 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp
@@ -61,7 +61,6 @@ std::string ClTemplateStore::get_component_code(const ComponentGroup &comp_group
 
 void ClTemplateStore::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
 {
-    // ARM_COMPUTE_UNUSED(comp_group)
     vtable.declare_variable(
         _src,
         GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
-- 
cgit v1.2.1
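
For illustration, the minimal sketch below shows one way the new entry points could be driven, based only on the signatures introduced in this patch: register the operand tensor infos with a GpuWorkloadSketch (so they carry valid ids), call GpuAdd::validate_op(), then GpuAdd::create_op() to fuse the Add into the sketch. The GpuWorkloadContext/GpuWorkloadSketch construction, the header paths for those two classes, and the CL setup calls are assumptions modelled on the library's usual dynamic fusion test scaffolding and are not part of this patch.

// Hypothetical usage sketch for the new GpuAdd operator (assumptions flagged below).
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h" // assumed header location
#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"  // assumed header location
#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h"
#include "arm_compute/runtime/CL/CLScheduler.h"

using namespace arm_compute;
using namespace arm_compute::experimental::dynamic_fusion;

void sketch_gpu_add_example()
{
    // Assumed CL and workload-context setup, mirroring the dynamic fusion test scaffolding
    CLScheduler::get().default_init();
    auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
    GpuWorkloadContext context{ &cl_compile_ctx }; // assumed constructor taking a CLCompileContext*
    GpuWorkloadSketch  sketch{ &context };         // assumed constructor taking a GpuWorkloadContext*

    // Tensor infos are created through the sketch so they carry valid ids,
    // which GpuAdd::validate_op() checks via has_valid_id()
    TensorInfo lhs_info = sketch.create_tensor_info(TensorInfo(TensorShape(32U, 16U), 1, DataType::F32));
    TensorInfo rhs_info = sketch.create_tensor_info(TensorInfo(TensorShape(32U, 16U), 1, DataType::F32));
    // The destination is given a matching shape here; an uninitialized dst would instead be
    // auto-initialized by the operator (see calculate_and_init_dst_if_empty() above)
    TensorInfo dst_info = sketch.create_tensor_info(TensorInfo(TensorShape(32U, 16U), 1, DataType::F32));

    // Validate first, then fuse the Add operator into the workload sketch
    const Status status = GpuAdd::validate_op(sketch, &lhs_info, &rhs_info, &dst_info);
    if(bool(status))
    {
        GpuAdd::create_op(sketch, &lhs_info, &rhs_info, &dst_info);
    }
}

As with GpuConv2d, validation happens before fusion, and the destination info used in the fusion test is the auto-initialized one, which is exactly the ordering this patch fixes.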