24 files changed, 1949 insertions, 81 deletions
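This patch adds the experimental dynamic fusion GpuAdd operator, built on a shared elementwise binary stack (ClComponentElementwiseBinary, GpuElementwiseBinaryCommon and ClTemplateElementwiseBinary). A minimal usage sketch of the new front-end API follows, based on the GpuAdd.h documentation below; the compile-context setup mirrors the library's test utilities and is an assumption, not part of this change:

    using namespace arm_compute;
    using namespace arm_compute::experimental::dynamic_fusion;

    // Assumed setup: wrap the global CL compile context in a workload context and sketch
    auto               cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
    GpuWorkloadContext context{ &cl_compile_ctx };
    GpuWorkloadSketch  sketch{ &context };

    // Tensor infos are created through the sketch so they receive valid ids;
    // dst is left uninitialized and is auto-initialized by create_op()
    TensorInfo lhs_info = sketch.create_tensor_info(TensorShape(27U, 13U, 2U), 1, DataType::F32);
    TensorInfo rhs_info = sketch.create_tensor_info(TensorShape(27U, 13U, 2U), 1, DataType::F32);
    TensorInfo dst_info = sketch.create_tensor_info();

    // validate_op() leaves the sketch unchanged on failure; create_op() fuses the ADD into the sketch
    ARM_COMPUTE_ERROR_THROW_ON(GpuAdd::validate_op(sketch, &lhs_info, &rhs_info, &dst_info));
    GpuAdd::create_op(sketch, &lhs_info, &rhs_info, &dst_info);

The resulting sketch is then compiled and executed through ClWorkloadRuntime, whose tensor look-up table this patch extends to tolerate in-place (lhs == dst or rhs == dst) tensors.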
diff --git a/Android.bp b/Android.bp index e79d9b2d47..20afbfc1d6 100644 --- a/Android.bp +++ b/Android.bp @@ -599,12 +599,16 @@ cc_library_static { "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp", "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp", "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp", + "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp", "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp", + "src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp", "src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp", "src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp", + "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.cpp", "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp", "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp", "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp", + "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp", "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp", "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp", "src/gpu/cl/ClContext.cpp", diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h new file mode 100644 index 0000000000..df3177867f --- /dev/null +++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUADD +#define ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUADD + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/ITensorInfo.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +/** Forward declaration */ +class GpuWorkloadContext; +class GpuWorkloadSketch; + +/** Operator interface. */ +class GpuAdd final +{ +public: + /** Create an operator and fuse it into the workload sketch. + * @note If @ref validate_op() fails, the creation also fails and may throw an error. + * @note If @ref validate_op() fails, @p sketch remains unchanged and valid. 
+ * @note Batching is not supported yet + * + * Valid data type configurations: + * |lhs |rhs |dst | + * |:--------------|:--------------|:-------------| + * |F16 |F16 |F16 | + * |F32 |F32 |F32 | + * |S32 |S32 |S32 | + * |S16 |S16 |S16 | + * |U8 |U8 |U8 | + * + * Valid data layouts: + * - Any + * + * @param[in,out] sketch Workload sketch into which the operator will be fused + * @param[in] lhs Left hand side tensor info. Data types supported: U8/S16/S32/F16/F32. + * @param[in] rhs Right hand side tensor info. Data types supported: U8/S16/S32/F16/F32. + * @param[out] dst Destination tensor info. Data types supported: U8/S16/S32/F16/F32. If an uninitialized ITensorInfo is passed in, it will be auto-initialized + */ + static void create_op(GpuWorkloadSketch &sketch, + ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *dst); + /** Check if the operator configuration is supported, irrespective of fusion + * + * @param[in] context Workload context within which the operator is running + * @param[in] lhs Left hand side tensor info. Data types supported: U8/S16/S32/F16/F32. + * @param[in] rhs Right hand side tensor info. Data types supported: U8/S16/S32/F16/F32. + * @param[in] dst Destination tensor info. Data types supported: U8/S16/S32/F16/F32. If an uninitialized ITensorInfo is passed in, it will be auto-initialized + */ + static Status is_supported_op(const GpuWorkloadContext &context, + const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst); + /** Validate the operator and check if its configuration is supported and if it can be fused into the workload sketch. + * Similar to @ref GpuAdd::create_op() + */ + static Status validate_op(const GpuWorkloadSketch &sketch, + const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst); +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif /* ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUADD */ diff --git a/filelist.json b/filelist.json index d64d9e175d..42fd4182e3 100644 --- a/filelist.json +++ b/filelist.json @@ -2196,7 +2196,6 @@ "dynamic_fusion": [ "src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp", "src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp", - "src/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.cpp", "src/dynamic_fusion/sketch/OperatorAttributes.cpp", "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.cpp", @@ -2209,11 +2208,15 @@ "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp", "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp", "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp", + "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp", "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp", "src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp", + "src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp", + "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.cpp", "src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp", "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp", "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp", + "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp", "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp", "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp", "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp" diff --git
a/src/core/CL/cl_kernels/tile_helpers.h b/src/core/CL/cl_kernels/tile_helpers.h index 01d49b5032..998bc9efb2 100644 --- a/src/core/CL/cl_kernels/tile_helpers.h +++ b/src/core/CL/cl_kernels/tile_helpers.h @@ -64,7 +64,7 @@ /** Tile object * A tile object is a 2D memory block and can be accessed using the following syntax: * -# a[m0].v = access the vector at row "m0" (OpenCL vector) - * -# a[m0].s[x] = access the scalar element at row "m0" and column "n0" (scalar access) + * -# dst[m0].s[n0] = access the scalar element at row "m0" and column "n0" (scalar access) * * @param[in] DATA_TYPE Data type of the tile * @param[in] H Number of tile rows diff --git a/src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp b/src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp index f14f66d1bd..36168d14f1 100644 --- a/src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp +++ b/src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp @@ -92,7 +92,7 @@ private: { const auto t_id = tensor_info.id(); auto find_tensor_pair = _owned_tensors.find(t_id); - if(find_tensor_pair == _owned_tensors.end()) + if(find_tensor_pair != _owned_tensors.end()) { return find_tensor_pair->second.get(); } @@ -203,11 +203,22 @@ Status create_tensor_lut(ClTensorLUT *tensor_lut, const GpuWorkloadSourceCode &c for(auto tensor : user_tensors) { const auto t_id = tensor->info()->id(); + if(tensor_map.find(t_id) != tensor_map.end()) { - return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Clashing tensor ids"); + // In case of elementwise in-place: give another Id to the In/Out tensor when passed again + std::vector<ITensorInfo::Id> ids; + for(auto &t : tensor_map) + { + ids.push_back(t.first); + } + ITensorInfo::Id new_id = *std::max_element(ids.begin(), ids.end()) + 1; + tensor_map[new_id] = tensor; + } + else + { + tensor_map[t_id] = tensor; } - tensor_map[t_id] = tensor; } for(const auto &data : aux_tensors.get_tensors()) { @@ -247,6 +258,7 @@ Status create_tensor_lut(ClTensorLUT *tensor_lut, const GpuWorkloadSourceCode &c } } } + return Status{}; } diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp new file mode 100644 index 0000000000..a17d835ac6 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "ClComponentElementwiseBinary.h" + +#include "arm_compute/core/Validate.h" +#include "arm_compute/dynamic_fusion/sketch/OperatorAttributes.h" +#include "src/core/CL/CLValidate.h" +#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +namespace +{ +std::set<ElementwiseBinaryCommonAttributes::ElementwiseOp> supported_ops +{ + ElementwiseBinaryCommonAttributes::ElementwiseOp::ADD +}; +} + +Status ClComponentElementwiseBinary::validate(const ArgumentPack<ITensorInfo> &tensors, const ElementwiseBinaryCommonAttributes &attributes) +{ + const auto lhs = tensors.get_const_tensor(TensorType::ACL_SRC_0); + const auto rhs = tensors.get_const_tensor(TensorType::ACL_SRC_1); + const auto dst = tensors.get_const_tensor(TensorType::ACL_DST_0); + + // Check operator type + ARM_COMPUTE_RETURN_ERROR_ON_MSG(supported_ops.find(attributes.operation()) == supported_ops.end(), "Provided Elementwise operation not supported."); + + // Check validity + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst); + + // Check data type for different elementwise operators + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16, DataType::S32, DataType::S16, DataType::U8); + + const bool rhs_in_place = (rhs == dst); + const bool lhs_in_place = (lhs == dst); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_in_place && lhs_in_place, "Both LHS and RHS cannot be in-place at the same time for any elementwise operation."); + + // dst shape is correct + const TensorShape out_shape = TensorShape::broadcast_shape(lhs->tensor_shape(), rhs->tensor_shape()); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((!rhs_in_place && !lhs_in_place) && detail::have_different_dimensions(lhs->tensor_shape(), dst->tensor_shape(), 0), + "Only the rhs operand can be broadcast to match the accumulator's (lhs) shape"); + // Matching data type + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst); + + // Matching data layout + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(lhs, rhs); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(lhs, dst); + + // Batching case not supported yet + const size_t idx_batch = get_data_layout_dimension_index(lhs->data_layout(), DataLayoutDimension::BATCHES); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((lhs->tensor_shape()[idx_batch] != 1) || (rhs->tensor_shape()[idx_batch] != 1) || (dst->tensor_shape()[idx_batch] != 1), "Batching case not supported yet"); + + // All tensor infos are initialized + ARM_COMPUTE_RETURN_ERROR_ON(lhs->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON(rhs->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0); + + // Device requirements are met + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(lhs); + + return Status{}; +} + +ClComponentElementwiseBinary::ClComponentElementwiseBinary( + ComponentId id, + const Properties
&properties, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes) + : IGpuKernelComponent{ id, properties, tensors }, + _component_writer{ std::make_unique<ClTemplateElementwiseBinary>(id, tensors, attributes) } +{ +} +ClComponentElementwiseBinary::~ClComponentElementwiseBinary() +{ +} +const IGpuTemplateComponentWriter *ClComponentElementwiseBinary::template_writer() const +{ + return _component_writer.get(); +} +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h new file mode 100644 index 0000000000..02e61019f4 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTELEMENTWISEBINARY +#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTELEMENTWISEBINARY + +#include "arm_compute/core/Error.h" +#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" +#include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h" + +namespace arm_compute +{ +/** Forward declaration */ +class ITensorInfo; +namespace experimental +{ +namespace dynamic_fusion +{ +/** Forward declaration */ +template <typename T> +class ArgumentPack; + +/** Forward declaration */ +class ClTemplateElementwiseBinary; + +class ClComponentElementwiseBinary final : public IGpuKernelComponent +{ +public: + /** Attributes are a set of backend-agnostic parameters that define what a component does */ + using Attributes = ElementwiseBinaryCommonAttributes; + +public: + /** Validate the component + * + * @param[in,out] tensors Tensor arguments to the component + * @param[in] attributes Component attributes + * + * @return Status Validation results + * + * Tensor argument names: + * - ACL_SRC_0: lhs + * - ACL_SRC_1: rhs + * - ACL_DST_0: dst + * + * Tensor argument constness: + * - ACL_SRC_0: Const + * - ACL_SRC_1: Const + * - ACL_DST_0: Const + * + * Valid data layouts: + * - All + * + * Valid data type configurations (DIV supports F32/F16/S32 only; POWER supports F32/F16 only): + * |ACL_SRC_0 |ACL_SRC_1 |ACL_DST_0 | + * |:--------------|:--------------|:--------------| + * |F16 |F16 |F16 | + * |F32 |F32 |F32 | + * |S32 |S32 |S32 | + * |S16 |S16 |S16 | + * |U8 |U8 |U8 | + */ + static Status validate(const ArgumentPack<ITensorInfo> &tensors, const ElementwiseBinaryCommonAttributes &attributes); + + /** Constructor + * + * Similar to @ref ClComponentElementwiseBinary::validate() + */ + ClComponentElementwiseBinary( + ComponentId id, + const Properties &properties, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes); + + /** Destructor */ + ~ClComponentElementwiseBinary() override; + /** Prevent instances of this class from being copy constructed */ + ClComponentElementwiseBinary(const ClComponentElementwiseBinary &component) = delete; + /** Prevent instances of this class from being copied */ + ClComponentElementwiseBinary &operator=(const ClComponentElementwiseBinary &component) = delete; + /** Allow instances of this class to be move constructed */ + ClComponentElementwiseBinary(ClComponentElementwiseBinary &&component) = default; + /** Allow instances of this class to be moved */ + ClComponentElementwiseBinary &operator=(ClComponentElementwiseBinary &&component) = default; + /** Get template writer for the component */ + const IGpuTemplateComponentWriter *template_writer() const override; + /** Get component type */ + GpuComponentType type() const override + { + return GpuComponentType::Simple; + } + +private: + std::unique_ptr<ClTemplateElementwiseBinary> _component_writer; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTELEMENTWISEBINARY */ diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp new file mode 100644 index 0000000000..46033d842b --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2022 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h" + +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" +#include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h" + +#include "src/common/utils/Log.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +Status GpuAdd::validate_op(const GpuWorkloadSketch &sketch, + const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst) +{ + ElementwiseBinaryCommonAttributes common_attributes{}; + common_attributes.operation(ElementwiseBinaryCommonAttributes::ElementwiseOp::ADD); + return GpuElementwiseBinaryCommon::validate_op(sketch, lhs, rhs, dst, common_attributes); +} + +Status GpuAdd::is_supported_op(const GpuWorkloadContext &context, + const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst) +{ + ElementwiseBinaryCommonAttributes common_attributes{}; + common_attributes.operation(ElementwiseBinaryCommonAttributes::ElementwiseOp::ADD); + return GpuElementwiseBinaryCommon::is_supported_op(context, lhs, rhs, dst, common_attributes); +} + +void GpuAdd::create_op(GpuWorkloadSketch &sketch, + ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *dst) +{ + // Assert validation + ARM_COMPUTE_ERROR_THROW_ON(GpuAdd::validate_op(sketch, lhs, rhs, dst)); + ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst); + + // Set the elementwise operation to ADD then call the elementwise common create_op + ElementwiseBinaryCommonAttributes common_attributes{}; + common_attributes.operation(ElementwiseBinaryCommonAttributes::ElementwiseOp::ADD); + GpuElementwiseBinaryCommon::create_op(sketch, lhs, rhs, dst, common_attributes); +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp index 12aa4d1b9f..9cb4ee7815 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp @@ -23,18 +23,17 @@ */ #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h" -#include "arm_compute/core/CL/CLCompileContext.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/experimental/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "src/core/helpers/AutoConfiguration.h" #include 
"src/dynamic_fusion/sketch/ArgumentPack.h" #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" -#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h" #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h" #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" +#include "src/common/utils/Log.h" + namespace arm_compute { namespace experimental @@ -103,18 +102,6 @@ Status GpuConv2d::validate_op(const GpuWorkloadSketch &sketch, { ARM_COMPUTE_RETURN_ERROR_ON(!bia->has_valid_id()); } - - // Perform fusion test - // Pack tensor infos - ArgumentPack<ITensorInfo> tensors; - tensors.add_const_tensor(ACL_SRC_0, src); - tensors.add_const_tensor(ACL_SRC_1, wei); - tensors.add_const_tensor(ACL_SRC_2, bia); - tensors.add_const_tensor(ACL_DST_0, dst); - const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!sketch.implementation().operator_group().try_add_operator(op), - "Operator fusion test failed. This operator cannot be fused into the workload"); - // Auto initialize dst tensor info TensorInfo dst_info_to_validate = *dst; const auto data_layout = src->data_layout(); @@ -128,6 +115,17 @@ Status GpuConv2d::validate_op(const GpuWorkloadSketch &sketch, auto_init_if_empty(dst_info_to_validate, src->clone()->set_tensor_shape(shape)); } + // Perform fusion test + // Pack tensor infos + ArgumentPack<ITensorInfo> tensors; + tensors.add_const_tensor(ACL_SRC_0, src); + tensors.add_const_tensor(ACL_SRC_1, wei); + tensors.add_const_tensor(ACL_SRC_2, bia); + tensors.add_const_tensor(ACL_DST_0, &dst_info_to_validate); + const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!sketch.implementation().operator_group().try_add_operator(op), + "Operator fusion test failed. This operator cannot be fused into the workload"); + // Check support level // Data type ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); @@ -176,6 +174,7 @@ void GpuConv2d::create_op(GpuWorkloadSketch &sketch, ITensorInfo *dst, const Conv2dAttributes &attributes) { + ARM_COMPUTE_LOG_PARAMS(src, wei, bia, dst, attributes); // Assert validation ARM_COMPUTE_ERROR_THROW_ON(GpuConv2d::validate_op(sketch, src, wei, bia, dst, attributes)); ARM_COMPUTE_ERROR_ON_NULLPTR(src, wei, dst); diff --git a/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.cpp b/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.cpp new file mode 100644 index 0000000000..073924947c --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.cpp @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/dynamic_fusion/sketch/ArgumentPack.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +namespace +{ +void calculate_and_init_dst_if_empty(ITensorInfo *dst, const ITensorInfo *lhs, const ITensorInfo *rhs) +{ + if(dst->total_size() == 0U) + { + const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*lhs, *rhs); + auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(broadcast_pair.first)); + } +} +GpuOperatorType operator_type = GpuOperatorType::Simple; +} + +ElementwiseBinaryCommonAttributes &ElementwiseBinaryCommonAttributes::operation(const ElementwiseBinaryCommonAttributes::ElementwiseOp &operation) +{ + _operation = operation; + return *this; +} + +ElementwiseBinaryCommonAttributes::ElementwiseOp ElementwiseBinaryCommonAttributes::operation() const +{ + return _operation; +} + +Status GpuElementwiseBinaryCommon::is_supported_op(const GpuWorkloadContext &context, + const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + const ElementwiseBinaryCommonAttributes &attributes) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst); + + // Auto initialize dst tensor info + TensorInfo dst_info_to_validate = *dst; + calculate_and_init_dst_if_empty(&dst_info_to_validate, lhs, rhs); + + // Check components + if(context.gpu_language() == GpuLanguage::OpenCL) + { + const auto cl_compile_ctx = context.cl_compile_context(); + ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr); + // Validate ElementwiseBinary Component + { + ArgumentPack<ITensorInfo> arguments; + arguments.add_const_tensor(ACL_SRC_0, lhs); + arguments.add_const_tensor(ACL_SRC_1, rhs); + + // We need to pass the original dst pointer here for in-place detection, in case its shape is not empty + if(dst->tensor_shape().total_size() == 0) + { + arguments.add_const_tensor(ACL_DST_0, &dst_info_to_validate); + } + else + { + arguments.add_const_tensor(ACL_DST_0, dst); + } + ARM_COMPUTE_RETURN_ON_ERROR(ClComponentElementwiseBinary::validate(arguments, attributes)); + } + } + else + { + ARM_COMPUTE_RETURN_ERROR_MSG("Unimplemented Gpu language"); + } + + return Status{}; +} + +Status GpuElementwiseBinaryCommon::validate_op(const GpuWorkloadSketch &sketch, + const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + const ElementwiseBinaryCommonAttributes &attributes) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst); + ARM_COMPUTE_RETURN_ERROR_ON( + !lhs->has_valid_id() || !rhs->has_valid_id() || !dst->has_valid_id()); + + // Auto initialize dst tensor info + TensorInfo dst_info_to_validate = *dst; + calculate_and_init_dst_if_empty(&dst_info_to_validate, lhs, rhs); + + // Perform fusion test + // Pack tensor infos +
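// (ACL_SRC_0 = lhs, ACL_SRC_1 = rhs, ACL_DST_0 = the auto-initialized dst copy; this pack only feeds the operator-group fusion test) +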
ArgumentPack<ITensorInfo> tensors; + tensors.add_const_tensor(ACL_SRC_0, lhs); + tensors.add_const_tensor(ACL_SRC_1, rhs); + tensors.add_const_tensor(ACL_DST_0, &dst_info_to_validate); + const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!sketch.implementation().operator_group().try_add_operator(op), + "Operator fusion test failed. This operator cannot be fused into the workload"); + + // Check if configuration is supported, and passing the original dst for in-place detection + return is_supported_op(*sketch.gpu_context(), lhs, rhs, dst, attributes); +} + +void GpuElementwiseBinaryCommon::create_op(GpuWorkloadSketch &sketch, + ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *dst, + const ElementwiseBinaryCommonAttributes &attributes) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); + const bool in_place = (lhs == dst) || (rhs == dst); + static TensorInfo in_place_dst; + in_place_dst = in_place ? sketch.create_tensor_info(*lhs) : TensorInfo{}; + + // Auto initialize dst tensor + calculate_and_init_dst_if_empty(dst, lhs, rhs); + + // Translate into components and add to component graph + auto &comp_graph = sketch.implementation().component_graph(); + + const auto sketch_ctx = sketch.implementation().context(); + + if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + { + const auto cl_compile_ctx = sketch_ctx->cl_compile_context(); + ARM_COMPUTE_ERROR_ON(cl_compile_ctx == nullptr); + + // Add ElementwiseBinary Component + { + auto properties = IGpuKernelComponent::Properties(); + properties.stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + + ArgumentPack<ITensorInfo> arguments; + arguments.add_const_tensor(ACL_SRC_0, lhs); + arguments.add_const_tensor(ACL_SRC_1, rhs); + if(in_place) + { + arguments.add_const_tensor(ACL_DST_0, &in_place_dst); + } + else + { + arguments.add_const_tensor(ACL_DST_0, dst); + } + comp_graph.add_new_component<ClComponentElementwiseBinary>(properties, arguments, attributes); + } + } + else + { + ARM_COMPUTE_ERROR("Unimplemented Gpu language"); + } + + // Set up fusion test by adding to the Operator Group + // Note this has to be performed after all the components have been successfully added to the component graph + + // Pack tensor infos + ArgumentPack<ITensorInfo> tensors; + tensors.add_const_tensor(ACL_SRC_0, lhs); + tensors.add_const_tensor(ACL_SRC_1, rhs); + if(in_place) + { + tensors.add_const_tensor(ACL_DST_0, &in_place_dst); + } + else + { + tensors.add_tensor(ACL_DST_0, dst); + } + const auto op = sketch.implementation().operator_group().new_operator(operator_type, tensors); + sketch.implementation().operator_group().add_operator(op); +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h b/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h new file mode 100644 index 0000000000..ffae801e47 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_INTERNAL_GPUELEMENTWISEBINARYCOMMON +#define SRC_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_INTERNAL_GPUELEMENTWISEBINARYCOMMON + +#include "arm_compute/core/Error.h" +#include "arm_compute/dynamic_fusion/sketch/OperatorAttributes.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +class ElementwiseBinaryCommonAttributes +{ +public: + enum class ElementwiseOp + { + ADD, /**< (x + y) */ + SUB, /**< (x - y) */ + DIV, /**< (x / y) */ + MIN, /**< Min(x, y) */ + MAX, /**< Max(x, y) */ + SQUARED_DIFF, /**< (x - y)^2 */ + POWER, /**< x ^ y */ + PRELU, /**< y*x if x < 0, x otherwise */ + }; + /** Set operation*/ + ElementwiseBinaryCommonAttributes &operation(const ElementwiseBinaryCommonAttributes::ElementwiseOp &operation); + /** Get operation*/ + ElementwiseOp operation() const; + +private: + ElementwiseOp _operation; /**< Elementwise operation */ +}; + +/** Forward declaration */ +class GpuWorkloadContext; +class GpuWorkloadSketch; + +/** Operator interface. */ +class GpuElementwiseBinaryCommon final +{ +public: + /** Create an operator and fuse it into the workload sketch. + * @note If @ref validate_op() fails, the creation also fails and may throw an error. + * @note If @ref validate_op() fails, @p sketch remains unchanged and valid. + * + * Valid data type configurations are checked at the operator level i.e. GpuAdd::validate_op(), GpuSub::validate_op(), ... etc. + * + * Valid data layouts: + * - Any + * + * @param[in,out] sketch Workload sketch into which the operator will be fused + * @param[in] lhs Left hand side tensor info. Data types supported: U8/S16/S32/F16/F32. + * @param[in] rhs Right hand side tensor info. Data types supported: U8/S16/S32/F16/F32. + * @param[out] dst Destination tensor info. Data types supported: U8/S16/S32/F16/F32. If an uninitialized ITensorInfo is passed in, it will be auto-initialized + * @param[in] attributes ElementwiseBinaryCommonAttributes containing the operator type: ADD, SUB, DIV, ... etc. 
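+ *
+ * A typical front-end call pattern, as used by @ref GpuAdd::create_op() in this patch:
+ * @code
+ * ElementwiseBinaryCommonAttributes attributes{};
+ * attributes.operation(ElementwiseBinaryCommonAttributes::ElementwiseOp::ADD);
+ * GpuElementwiseBinaryCommon::create_op(sketch, lhs, rhs, dst, attributes);
+ * @endcode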
+ */ + static void create_op(GpuWorkloadSketch &sketch, + ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *dst, + const ElementwiseBinaryCommonAttributes &attributes); + /** Check if the operator configuration is supported, irrespective of fusion + * Similar to @ref GpuElementwiseBinaryCommon::create_op() + * + * @param[in] context Workload context within which the operator is running + * @param[in] lhs Left hand side tensor info. Data types supported: U8/S16/S32/F16/F32. + * @param[in] rhs Right hand side tensor info. Data types supported: U8/S16/S32/F16/F32. + * @param[in] dst Destination tensor info. Data types supported: U8/S16/S32/F16/F32. If an uninitialized ITensorInfo is passed in, it will be auto-initialized + * @param[in] attributes ElementwiseBinaryCommonAttributes containing the operator type: ADD, SUB, DIV, ... etc. + */ + static Status is_supported_op(const GpuWorkloadContext &context, + const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + const ElementwiseBinaryCommonAttributes &attributes); + /** Validate the operator and check if it can be fused into the workload sketch. + * Similar to @ref GpuElementwiseBinaryCommon::create_op() + */ + static Status validate_op(const GpuWorkloadSketch &sketch, + const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + const ElementwiseBinaryCommonAttributes &attributes); +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_INTERNAL_GPUELEMENTWISEBINARYCOMMON */ diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h b/src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h index c85ddf5a2c..328e942955 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h +++ b/src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h @@ -48,6 +48,9 @@ class IGpuTemplateComponentWriter public: using ComponentGroup = GpuKernelComponentGroup; + /** For now all kernel intermediate/destination tensors are expected to be of type Tensor_4D_t_Buffer */ + static constexpr GpuKernelArgumentInfo::Type common_tensor_type = GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer; + public: /** Constructor * diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp index 7ad7dd69f0..75e812af9f 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp @@ -240,7 +240,7 @@ } vtable.declare_variable( _dst, - GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + GpuKernelArgumentInfo(common_tensor_type), comp_group.is_intermediate_tensor(_dst), "dst"); } @@ -305,7 +305,7 @@ CLBuildOptions ClTemplateDirectConv2d::get_build_options(const ComponentGroup &c const unsigned int channel_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::CHANNEL); const DataType data_type = _src->data_type(); - /// NOTE: For now tile sizes (n0, m0, n0) are set by the execution window. This may change in the future + /// NOTE: For now tile sizes (n0, m0, k0) are set by the execution window.
This may change in the future const auto root_window = comp_group.get_root_component()->template_writer()->get_window(); const unsigned int n0 = root_window.x().step(); const unsigned int m0 = root_window.y().step(); diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp new file mode 100644 index 0000000000..996bf15d01 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp @@ -0,0 +1,315 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "ClTemplateElementwiseBinary.h" + +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h" + +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/helpers/WindowHelpers.h" + +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +constexpr unsigned int vector_size_byte_opencl = 16; + +ClTemplateElementwiseBinary::ClTemplateElementwiseBinary(ComponentId id, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes) + : IGpuTemplateComponentWriter{ id, tensors }, + _lhs{}, + _rhs{}, + _dst{}, + _attributes{ attributes } +{ + _lhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); + _rhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_1); + _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); + ARM_COMPUTE_ERROR_ON_NULLPTR(_lhs, _rhs, _dst); +} + +std::string ClTemplateElementwiseBinary::get_name() const +{ + return "elementwise_binary"; +} + +std::string ClTemplateElementwiseBinary::get_component_code(const ComponentGroup &comp_group) const +{ + ARM_COMPUTE_UNUSED(comp_group); + std::string code; + const bool is_broadcast = _lhs->tensor_shape() != _rhs->tensor_shape(); + const bool is_root = (comp_group.get_root_component()->id() == this->id()); + + if(is_root) + { + code = +R"_( + //------------------ START KERNEL {{meta_kernel_id}} ELTWISE_OP --------------------- +)_" + // IN_0(LHS) {{lhs}} + // IN_1(RHS) {{rhs}} + // OUT(dst, accum) {{dst}} + // dst = lhs + rhs (mix-precision, broadcast, boundary aware) +R"_( + TILE({{DATA_TYPE}}, M0, N0, {{dst}}); + TILE(uint, M0, 1, g_dst_indirect_y); + { + TILE({{DATA_TYPE}}, M0, N0, lhs_tile); + 
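// rhs_tile is declared at the full M0 x N0 size; when broadcasting, only a {{rhs_m0}} x {{rhs_n0}} sub-tile is filled by the T_LOAD below +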
TILE({{DATA_TYPE}}, M0, N0, rhs_tile); +)_" + // Assuming un-collapsed window +R"_( + {{lhs}}_offset_first_element_in_bytes += g_ind_2 * {{lhs}}_stride_z; + {{rhs}}_offset_first_element_in_bytes += g_ind_2 * {{rhs}}_stride_z; + + T_LOAD({{DATA_TYPE}}, M0, N0, BUFFER, {{lhs}}, g_ind_0, g_ind_1, 1, {{lhs}}_stride_y, lhs_tile); + T_LOAD({{DATA_TYPE}}, {{rhs_m0}}, {{rhs_n0}}, BUFFER, {{rhs}}, {{rhs_start_ind_0}}, {{rhs_start_ind_1}}, 1, {{rhs}}_stride_y, rhs_tile); +)_"; + if(is_broadcast) + { + code += +R"_( + T_ELTWISE_BROADCAST_{{ELTWISE_OP}}_X({{DATA_TYPE}}, M0, N0, lhs_tile, rhs_tile, {{dst}}); +)_"; + } + else + { + code += +R"_( + T_ELTWISE_{{ELTWISE_OP}}({{DATA_TYPE}}, M0, N0, lhs_tile, rhs_tile, {{dst}}); +)_"; + } + code += + // Calculate the destination indirect Y +R"_( + LOOP_UNROLLING(int, i, 0, 1, M0, + { + g_dst_indirect_y[i].v = (uint)min(g_ind_1 + i, (int)({{dst}}_w * {{dst}}_h) - 1); + g_dst_indirect_y[i].v += g_ind_2 * (int)({{dst}}_w * {{dst}}_h); + }) + } + //------------------ END KERNEL {{meta_kernel_id}} ELTWISE_OP --------------------- +)_"; + } + + else // non-root + { + code = +R"_( + //------------------ START KERNEL {{meta_kernel_id}} ELTWISE_OP --------------------- +)_" + // IN_0/Out(Accumulator) {{acc}} + // IN_1(Operand) {{operand}} + // acc = operand + acc (mix-precision, broadcast, boundary aware) +R"_( + { + TILE({{DATA_TYPE}}, M0, N0, operand_tile); + T_LOAD({{DATA_TYPE}}, {{rhs_m0}}, {{rhs_n0}}, BUFFER, {{operand}}, {{rhs_start_ind_0}}, {{rhs_start_ind_1}}, 1, {{operand}}_stride_y, operand_tile); +)_"; + + if(is_broadcast) + { + code += +R"_( + T_ELTWISE_BROADCAST_{{ELTWISE_OP}}_X({{DATA_TYPE}}, M0, N0, {{acc}}, operand_tile, {{acc}}); +)_"; + } + else + { + code += +R"_( + T_ELTWISE_{{ELTWISE_OP}}({{DATA_TYPE}}, M0, N0, {{acc}}, operand_tile, {{acc}}); +)_"; + } + code += +R"_( + } + //------------------ END KERNEL {{meta_kernel_id}} ELTWISE_OP --------------------- +)_"; + } + + return code; +} + +void ClTemplateElementwiseBinary::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const +{ + vtable.declare_variable( + _lhs, + GpuKernelArgumentInfo(common_tensor_type), + comp_group.is_intermediate_tensor(_lhs), + "lhs"); + + vtable.declare_variable( + _rhs, + GpuKernelArgumentInfo(common_tensor_type), + comp_group.is_intermediate_tensor(_rhs), + "rhs"); + + vtable.declare_variable( + _dst, + GpuKernelArgumentInfo(common_tensor_type), + comp_group.is_intermediate_tensor(_dst), + "dst"); +} + +TagLUT ClTemplateElementwiseBinary::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const +{ + TagLUT lut{}; + const ITensorInfo *accumulator = _lhs; + const ITensorInfo *operand = _rhs; + + // Local build options + lut["meta_kernel_id"] = id(); + lut["DATA_TYPE"] = get_cl_type_from_data_type(_lhs->data_type()); + // Arguments and global shared variables + const bool is_root = (comp_group.get_root_component()->id() == this->id()); + if(is_root) + { + lut["lhs"] = vtable.get_variable(_lhs); + lut["rhs"] = vtable.get_variable(_rhs); + lut["dst"] = vtable.get_variable(_dst); + } + else + { + // Determine which tensor is the accumulator + if(comp_group.is_intermediate_tensor(_lhs)) + { + accumulator = _lhs; + operand = _rhs; + } + else if(comp_group.is_intermediate_tensor(_rhs)) + { + accumulator = _rhs; + operand = _lhs; + } + else + { + ARM_COMPUTE_ERROR("Invalid elementwise component linking"); + } + lut["acc"] = vtable.get_variable(accumulator); + lut["operand"] = vtable.get_variable(operand); + } +
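// Translate the operator attribute into the {{ELTWISE_OP}} tag consumed by the T_ELTWISE_* macros in the code template +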
switch(_attributes.operation()) + { + case Attributes::ElementwiseOp::ADD: + lut["ELTWISE_OP"] = "ADD"; + break; + default: + ARM_COMPUTE_ERROR("Arithmetic Operation not supported"); + } + ARM_COMPUTE_ERROR_ON_MSG(detail::have_different_dimensions(accumulator->tensor_shape(), _dst->tensor_shape(), 0), "Only the operand can be broadcast to match the accumulator's shape"); + const bool is_broadcast = (operand->tensor_shape() != _dst->tensor_shape()); + + // Set broadcast parameters + // PRE: All tensors are broadcast-compatible + if(is_broadcast) + { + // Note that n0 maps to input tensor dimension 0, m0 maps to input dimensions 1 and 2 because of our collapse strategy + if(operand->dimension(0) == 1U && operand->dimension(1) == 1U && operand->dimension(2) == 1U) // Broadcast in X, Y, Z: collapsed rhs win [M0xN0] = [1x1] + { + lut["rhs_m0"] = "1"; + lut["rhs_n0"] = "1"; + lut["rhs_start_ind_1"] = "0"; + lut["rhs_start_ind_0"] = "0"; + } + else if(operand->dimension(1) == 1U && operand->dimension(2) == 1U) // Broadcast in Y and Z: collapsed rhs win [M0xN0] = [1xN] + { + lut["rhs_m0"] = "1"; + lut["rhs_n0"] = "N0"; + lut["rhs_start_ind_1"] = "0"; + lut["rhs_start_ind_0"] = "g_ind_0"; + } + else + { + ARM_COMPUTE_ERROR("Only support rhs broadcasting in all X, Y, Z dimensions, or just in Y and Z dimensions"); + } + } + else + { + lut["rhs_m0"] = "M0"; + lut["rhs_n0"] = "N0"; + lut["rhs_start_ind_1"] = "g_ind_1"; + lut["rhs_start_ind_0"] = "g_ind_0"; + } + return lut; +} + +CLBuildOptions ClTemplateElementwiseBinary::get_build_options(const ComponentGroup &comp_group) const +{ + CLBuildOptions build_opts{}; + /// NOTE: For now tile sizes (n0, m0) are set by the execution window. This may change in the future + const auto root_window = comp_group.get_root_component()->template_writer()->get_window(); + const unsigned int n0 = root_window.x().step(); + const unsigned int m0 = root_window.y().step(); + const unsigned int partial_store_n0 = _dst->dimension(0) % n0; + + build_opts.add_option("-DM0=" + support::cpp11::to_string(m0)); + build_opts.add_option("-DN0=" + support::cpp11::to_string(n0)); + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(_lhs->data_type())); + build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0)); + + return build_opts; +} + +std::string ClTemplateElementwiseBinary::get_config_id() const +{ + std::string config_id{}; + config_id += lower_string(string_from_data_type(_dst->data_type())); + config_id += "_"; + config_id += support::cpp11::to_string(_dst->dimension(0)); + config_id += "_"; + config_id += support::cpp11::to_string(_dst->dimension(1)); + config_id += "_"; + config_id += lower_string(string_from_data_layout(_dst->data_layout())); + + return config_id; +} + +std::set<std::string> ClTemplateElementwiseBinary::get_headers_list() const +{ + return std::set<std::string>{ "helpers.h", "tile_helpers.h" }; +} + +Window ClTemplateElementwiseBinary::get_window() const +{ + ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized"); + + TensorShape output_shape = _dst->tensor_shape(); + // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) and upper dimensions unchanged + // This is in line with the collapsing convention used by operators like Conv2d + output_shape.collapse(2U, 1U); + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0)); + Window win = calculate_max_window(output_shape, 
Steps(num_elems_processed_per_iteration)); + + return win; +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.h new file mode 100644 index 0000000000..e69150f3e7 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEELEMENTWISEBINARY +#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEELEMENTWISEBINARY + +#include "arm_compute/core/experimental/Types.h" +#include "arm_compute/dynamic_fusion/sketch/OperatorAttributes.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h" +#include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h" +#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +class ClTemplateElementwiseBinary final : public IGpuTemplateComponentWriter +{ +public: + using Attributes = ClComponentElementwiseBinary::Attributes; + + /** Constructor + * + * Similar to @ref ClComponentElementwiseBinary::validate() + * + * @param[in] id Component id + * @param[in] tensors Tensor arguments to the components + * @param[in] attributes Component attributes + */ + ClTemplateElementwiseBinary(ComponentId id, + const ArgumentPack<ITensorInfo> &tensors, + const Attributes &attributes); + /** Prevent instances of this class from being copy constructed */ + ClTemplateElementwiseBinary(const ClTemplateElementwiseBinary &elementwise) = delete; + /** Prevent instances of this class from being copied */ + ClTemplateElementwiseBinary &operator=(const ClTemplateElementwiseBinary &elementwise) = delete; + /** Allow instances of this class to be move constructed */ + ClTemplateElementwiseBinary(ClTemplateElementwiseBinary &&elementwise) = default; + /** Allow instances of this class to be moved */ + ClTemplateElementwiseBinary &operator=(ClTemplateElementwiseBinary &&elementwise) = default; + + /** Generate kernel component name */ + std::string get_name() const override; + + /** Generate kernel component code template + * + * 
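The emitted code differs depending on whether this component is the root of the component group: the root declares and fills the destination/accumulator tile, while a non-root component accumulates onto an already declared tile. + * + *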
@param[in] comp_group Component group of which the component is a part + * + * @return std::string Component code + */ + std::string get_component_code(const ComponentGroup &comp_group) const override; + + /** Declare all variables used by the component in the @p vtable + * + * @param[out] vtable Variable table + * @param[in] comp_group Component group of which the component is a part + */ + void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override; + + /** Generate the tag look-up table used to instantiate the component code. + * + * @param[in] vtable Variable table + * @param[in] comp_group Component group of which the component is a part + * + * @return TagLUT Tag lookup table + */ + TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override; + + /** Generate the build options used in the component + * + * @param[in] comp_group Component group of which the component is a part + * + * @return CLBuildOptions Build options + */ + CLBuildOptions get_build_options(const ComponentGroup &comp_group) const override; + + /** Generate the component config id string used for tuning */ + std::string get_config_id() const override; + + /** Generate the header list used in the component */ + std::set<std::string> get_headers_list() const override; + + /** Generate the execution window for the component */ + Window get_window() const override; + +private: + const ITensorInfo *_lhs; + const ITensorInfo *_rhs; + const ITensorInfo *_dst; + Attributes _attributes; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEELEMENTWISEBINARY */ diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp index bffb467ebb..e4b662b3a8 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp @@ -61,7 +61,6 @@ std::string ClTemplateStore::get_component_code(const ComponentGroup &comp_group void ClTemplateStore::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const { - // ARM_COMPUTE_UNUSED(comp_group) vtable.declare_variable( _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), diff --git a/tests/datasets/DynamicFusionDataset.h b/tests/datasets/DynamicFusionDataset.h new file mode 100644 index 0000000000..5a1453b9ab --- /dev/null +++ b/tests/datasets/DynamicFusionDataset.h @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef TESTS_DATASETS_DYNAMICFUSIONDATASET +#define TESTS_DATASETS_DYNAMICFUSIONDATASET + +#include "utils/TypePrinter.h" + +#include "arm_compute/core/TensorShape.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +namespace test +{ +namespace datasets +{ +class DynamicFusionThreeInputs +{ +public: + using type = std::tuple<TensorShape, TensorShape, TensorShape>; + + struct iterator + { + iterator(std::vector<TensorShape>::const_iterator shape0_it, + std::vector<TensorShape>::const_iterator shape1_it, + std::vector<TensorShape>::const_iterator shape2_it) + : _shape0_it{ std::move(shape0_it) }, + _shape1_it{ std::move(shape1_it) }, + _shape2_it{ std::move(shape2_it) } + { + } + + std::string description() const + { + std::stringstream description; + description << "shape0=" << *_shape0_it << ":"; + description << "shape1=" << *_shape1_it << ":"; + description << "shape2=" << *_shape2_it << ":"; + + return description.str(); + } + + DynamicFusionThreeInputs::type operator*() const + { + return std::make_tuple(*_shape0_it, *_shape1_it, *_shape2_it); + } + + iterator &operator++() + { + ++_shape0_it; + ++_shape1_it; + ++_shape2_it; + + return *this; + } + + private: + std::vector<TensorShape>::const_iterator _shape0_it; + std::vector<TensorShape>::const_iterator _shape1_it; + std::vector<TensorShape>::const_iterator _shape2_it; + }; + + iterator begin() const + { + return iterator(_shape0_shapes.begin(), _shape1_shapes.begin(), _shape2_shapes.begin()); + } + + int size() const + { + return std::min(_shape0_shapes.size(), std::min(_shape1_shapes.size(), _shape2_shapes.size())); + } + + void add_config(TensorShape shape0, TensorShape shape1, TensorShape shape2) + { + _shape0_shapes.emplace_back(std::move(shape0)); + _shape1_shapes.emplace_back(std::move(shape1)); + _shape2_shapes.emplace_back(std::move(shape2)); + } + +protected: + DynamicFusionThreeInputs() = default; + DynamicFusionThreeInputs(DynamicFusionThreeInputs &&) = default; + +private: + std::vector<TensorShape> _shape0_shapes{}; + std::vector<TensorShape> _shape1_shapes{}; + std::vector<TensorShape> _shape2_shapes{}; +}; + +class DynamicFusionElementwiseBinaryTwoOpsSmallShapes final : public DynamicFusionThreeInputs +{ +public: + DynamicFusionElementwiseBinaryTwoOpsSmallShapes() + { + add_config(TensorShape{ 9U, 9U, 5U }, TensorShape{ 9U, 9U, 5U }, TensorShape{ 9U, 9U, 5U }); + add_config(TensorShape{ 9U, 9U, 5U }, TensorShape{ 1U, 1U, 1U } /* Broadcast in X, Y, Z*/, TensorShape{ 9U, 9U, 5U }); + add_config(TensorShape{ 27U, 13U, 2U }, TensorShape{ 27U, 1U, 1U } /* Broadcast in Y and Z*/, TensorShape{ 27U, 13U, 2U }); + add_config(TensorShape{ 27U, 13U, 2U }, TensorShape{ 27U, 13U, 2U }, TensorShape{ 27U, 1U, 1U } /* Broadcast in Y and Z*/); + } +}; + +} // namespace datasets +} // namespace test +} // namespace arm_compute +#endif /* TESTS_DATASETS_DYNAMICFUSIONDATASET */ diff --git a/tests/datasets/ShapeDatasets.h b/tests/datasets/ShapeDatasets.h index e4277a981e..047457c99e 100644 --- a/tests/datasets/ShapeDatasets.h +++ 
b/tests/datasets/ShapeDatasets.h @@ -212,6 +212,25 @@ public: } }; +/** Data set containing small tensor shapes without a batch dimension. */ +class SmallShapesNoBatches final : public ShapeDataset +{ +public: + SmallShapesNoBatches() + : ShapeDataset("Shape", + { + // Batch size 1 + TensorShape{ 3U, 11U }, + TensorShape{ 1U, 16U }, + TensorShape{ 27U, 13U, 7U }, + TensorShape{ 7U, 7U, 17U }, + TensorShape{ 33U, 13U, 2U }, + TensorShape{ 11U, 11U, 3U } + }) + { + } +}; + /** Data set containing pairs of tiny tensor shapes that are broadcast compatible. */ class TinyShapesBroadcast final : public framework::dataset::ZipDataset<ShapeDataset, ShapeDataset> { @@ -282,6 +301,44 @@ public: } }; +class TemporaryLimitedSmallShapesBroadcast final : public framework::dataset::ZipDataset<ShapeDataset, ShapeDataset> +{ +public: + TemporaryLimitedSmallShapesBroadcast() + : ZipDataset<ShapeDataset, ShapeDataset>( + ShapeDataset("Shape0", + { + TensorShape{ 9U, 9U, 5U }, + TensorShape{ 27U, 13U, 2U }, + }), + ShapeDataset("Shape1", + { + TensorShape{ 1U, 1U, 1U }, // Broadcast in X, Y, Z + TensorShape{ 27U, 1U, 1U }, // Broadcast in Y and Z + })) + { + } +}; + +class TemporaryLimitedLargeShapesBroadcast final : public framework::dataset::ZipDataset<ShapeDataset, ShapeDataset> +{ +public: + TemporaryLimitedLargeShapesBroadcast() + : ZipDataset<ShapeDataset, ShapeDataset>( + ShapeDataset("Shape0", + { + TensorShape{ 127U, 25U, 5U }, + TensorShape{ 485U, 40U, 10U } + }), + ShapeDataset("Shape1", + { + TensorShape{ 1U, 1U, 1U }, // Broadcast in X, Y, Z + TensorShape{ 485U, 1U, 1U }, // Broadcast in Y and Z + })) + { + } +}; + /** Data set containing medium tensor shapes. */ class MediumShapes final : public ShapeDataset { @@ -359,6 +416,19 @@ public: } }; +/** Data set containing large tensor shapes without a batch dimension. */ +class LargeShapesNoBatches final : public ShapeDataset +{ +public: + LargeShapesNoBatches() + : ShapeDataset("Shape", + { + TensorShape{ 582U, 131U, 2U }, + }) + { + } +}; + /** Data set containing pairs of large tensor shapes that are broadcast compatible.
*/ class LargeShapesBroadcast final : public framework::dataset::ZipDataset<ShapeDataset, ShapeDataset> { diff --git a/tests/validation/dynamic_fusion/gpu/Integration.cpp b/tests/validation/dynamic_fusion/gpu/Integration.cpp index 036f28b29f..0b81dac1f0 100644 --- a/tests/validation/dynamic_fusion/gpu/Integration.cpp +++ b/tests/validation/dynamic_fusion/gpu/Integration.cpp @@ -28,24 +28,14 @@ #include "arm_compute/dynamic_fusion/sketch/OperatorAttributes.h" #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h" #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h" -#include "arm_compute/runtime/CL/CLScheduler.h" - -#include "src/gpu/cl/operators/ClAdd.h" -#include "src/gpu/cl/operators/ClConv2d.h" #include "tests/CL/CLAccessor.h" -#include "tests/framework/Asserts.h" #include "tests/framework/Macros.h" #include "tests/validation/Validation.h" #include "tests/validation/dynamic_fusion/Utils.h" #include "tests/validation/reference/ConvolutionLayer.h" -#include "tests/validation/reference/ElementwiseOperations.h" #include "tests/validation/reference/Permute.h" -#ifdef ARM_COMPUTE_ASSERTS_ENABLED -#include "tests/SimpleTensorPrinter.h" -#endif /* ARM_COMPUTE_ASSERTS_ENABLED */ - using namespace arm_compute::experimental::dynamic_fusion; using namespace arm_compute::test::validation::utils; diff --git a/tests/validation/dynamic_fusion/gpu/cl/Add.cpp b/tests/validation/dynamic_fusion/gpu/cl/Add.cpp new file mode 100644 index 0000000000..3743fbb664 --- /dev/null +++ b/tests/validation/dynamic_fusion/gpu/cl/Add.cpp @@ -0,0 +1,267 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h" +#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h" + +#include "tests/CL/CLAccessor.h" +#include "tests/framework/Fixture.h" +#include "tests/framework/Macros.h" +#include "tests/framework/datasets/Datasets.h" +#include "tests/validation/Validation.h" + +#include "tests/datasets/DynamicFusionDataset.h" +#include "tests/datasets/ShapeDatasets.h" +#include "tests/validation/fixtures/dynamic_fusion/gpu/cl/ElementwiseBinaryFixture.h" +#include "tests/validation/reference/ElementwiseOperations.h" + +namespace arm_compute +{ +namespace test +{ +namespace validation +{ +TEST_SUITE(CL) +TEST_SUITE(DYNAMIC_FUSION) +TEST_SUITE(ADD) + +// *INDENT-OFF* +// clang-format off +DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( + framework::dataset::make("Input1Info", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), // Invalid data type combination + TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16), // S16 is valid data type for Add + TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S32), // S32 is valid data type for Add + TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), // Mismatching shapes + TensorInfo(TensorShape(32U, 1U, 1U), 1, DataType::F32), // Broadcasting not allowed for lhs + TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(32U, 13U, 2U, 2), 1, DataType::F32), // Batching not supported + }), + framework::dataset::make("Input2Info",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F16), + TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16), + TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S32), + TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(32U, 1U, 1U), 1, DataType::F32), // Broadcasting allowed for rhs + TensorInfo(TensorShape(32U, 13U, 2U, 2), 1, DataType::F32), // Batching not supported + })), + framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16), + TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S32), + TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(32U, 13U, 2U, 2), 1, DataType::F32), + })), + framework::dataset::make("Expected", { true, false, true, true, false, false, true, false})), + input1_info, input2_info, output_info, expected) +{ + // Create a new workload sketch + auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context(); + auto gpu_ctx = GpuWorkloadContext{ &cl_compile_ctx }; + GpuWorkloadSketch sketch{ &gpu_ctx }; + + // Fuse Elementwise Add + auto lhs_info = sketch.create_tensor_info(input1_info); + auto rhs_info = sketch.create_tensor_info(input2_info); + auto dst_info = sketch.create_tensor_info(output_info); + bool res = bool(GpuAdd::validate_op(sketch, &lhs_info, &rhs_info, &dst_info)); + ARM_COMPUTE_EXPECT(res == expected, framework::LogLevel::ERRORS); +} + +DATA_TEST_CASE(ValidateRhsInplace, framework::DatasetMode::ALL, zip(zip( + framework::dataset::make("Input1Info", { TensorInfo(TensorShape(32U, 1U, 1U), 1, DataType::F32), // Broadcasting allowed for lhs + 
TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), + }), + framework::dataset::make("Input2Info",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(32U, 1U, 1U), 1, DataType::F32), // Broadcasting not allowed for rhs + })), + framework::dataset::make("Expected", { true, false})), + input1_info, input2_info, expected) +{ + // Create a new workload sketch + auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context(); + auto gpu_ctx = GpuWorkloadContext{ &cl_compile_ctx }; + GpuWorkloadSketch sketch{ &gpu_ctx }; + + // Fuse Elementwise Add + auto lhs_info = sketch.create_tensor_info(input1_info); + auto rhs_info = sketch.create_tensor_info(input2_info); + bool res = bool(GpuAdd::validate_op(sketch, &lhs_info, &rhs_info, &rhs_info)); + ARM_COMPUTE_EXPECT(res == expected, framework::LogLevel::ERRORS); +} + +DATA_TEST_CASE(ValidateLhsInplace, framework::DatasetMode::ALL, zip(zip( + framework::dataset::make("Input1Info", { TensorInfo(TensorShape(32U, 1U, 1U), 1, DataType::F32), // Broadcasting not allowed for lhs + TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), + }), + framework::dataset::make("Input2Info",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(32U, 1U, 1U), 1, DataType::F32), // Broadcasting allowed for rhs + })), + framework::dataset::make("Expected", { false, true})), + input1_info, input2_info, expected) +{ + // Create a new workload sketch + auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context(); + auto gpu_ctx = GpuWorkloadContext{ &cl_compile_ctx }; + GpuWorkloadSketch sketch{ &gpu_ctx }; + + // Fuse Elementwise Add + auto lhs_info = sketch.create_tensor_info(input1_info); + auto rhs_info = sketch.create_tensor_info(input2_info); + bool res = bool(GpuAdd::validate_op(sketch, &lhs_info, &rhs_info, &lhs_info)); + ARM_COMPUTE_EXPECT(res == expected, framework::LogLevel::ERRORS); +} +// clang-format on +// *INDENT-ON* + +RelativeTolerance<float> tolerance_f32(0.01f); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */ +RelativeTolerance<half_float::half> tolerance_f16(half_float::half(0.1)); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F16 */ +constexpr float tolerance_num = 0.01f; /**< Tolerance number */ + +template <typename T> +using DynamicFusionAddOpFixture = DynamicFusionGpuElementwiseBinaryOneOpValidationFixture<CLTensor, CLAccessor, GpuAdd, T>; + +template <typename T> +using DynamicFusionAddOpBroadcastFixture = DynamicFusionGpuElementwiseBinaryBroadcastOneOpValidationFixture<CLTensor, CLAccessor, GpuAdd, T>; + +template <typename T> +using DynamicFusionGpuFuseTwoAddOpsFixture = DynamicFusionGpuElementwiseBinaryTwoOpsValidationFixture<CLTensor, CLAccessor, GpuAdd, T>; + +TEST_SUITE(FP32) +FIXTURE_DATA_TEST_CASE(RunSmallOneOp, DynamicFusionAddOpFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine( + framework::dataset::make("ElementwiseOp", { ArithmeticOperation::ADD }), + datasets::SmallShapesNoBatches()), + framework::dataset::make("DataType", { DataType::F32 })), + framework::dataset::make("InPlace", { false, true }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_f32); +} +FIXTURE_DATA_TEST_CASE(RunLargeOneOp, DynamicFusionAddOpFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine( + framework::dataset::make("ElementwiseOp", { ArithmeticOperation::ADD }), + datasets::LargeShapesNoBatches()), 
+ framework::dataset::make("DataType", { DataType::F32 })), + framework::dataset::make("InPlace", { false, true }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_f32); +} +FIXTURE_DATA_TEST_CASE(RunSmallBroadcastOneOp, DynamicFusionAddOpBroadcastFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(framework::dataset::make("ElementwiseOp", { ArithmeticOperation::ADD }), + datasets::TemporaryLimitedSmallShapesBroadcast()), + framework::dataset::make("DataType", { DataType::F32 })), + framework::dataset::make("InPlace", { false, true }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_f32); +} + +FIXTURE_DATA_TEST_CASE(RunLargeBroadcastOneOp, DynamicFusionAddOpBroadcastFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(framework::dataset::make("ElementwiseOp", { ArithmeticOperation::ADD }), + datasets::TemporaryLimitedLargeShapesBroadcast()), + framework::dataset::make("DataType", { DataType::F32 })), + framework::dataset::make("InPlace", { false, true }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_f32); +} +FIXTURE_DATA_TEST_CASE(RunSmallTwoOps, DynamicFusionGpuFuseTwoAddOpsFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(framework::dataset::make("ElementwiseOp", { ArithmeticOperation::ADD }), + datasets::DynamicFusionElementwiseBinaryTwoOpsSmallShapes()), + framework::dataset::make("DataType", { DataType::F32 })), + framework::dataset::make("InPlace", { false }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_f32); +} +TEST_SUITE_END() // FP32 + +TEST_SUITE(FP16) +FIXTURE_DATA_TEST_CASE(RunSmallOneOp, DynamicFusionAddOpFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(framework::dataset::make("ElementwiseOp", { ArithmeticOperation::ADD }), + datasets::SmallShapesNoBatches()), + framework::dataset::make("DataType", { DataType::F16 })), + framework::dataset::make("InPlace", { false, true }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num); +} + +FIXTURE_DATA_TEST_CASE(RunSmallBroadcastOneOp, DynamicFusionAddOpBroadcastFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(framework::dataset::make("ElementwiseOp", { ArithmeticOperation::ADD }), + datasets::TemporaryLimitedSmallShapesBroadcast()), + framework::dataset::make("DataType", { DataType::F16 })), + framework::dataset::make("InPlace", { false }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num); +} + +TEST_SUITE_END() // FP16 + +TEST_SUITE(S32) +FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionAddOpFixture<int32_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(framework::dataset::make("ElementwiseOp", { ArithmeticOperation::ADD }), + datasets::SmallShapesNoBatches()), + framework::dataset::make("DataType", { DataType::S32 })), + framework::dataset::make("InPlace", { false }))) +{ + // Validate output + validate(CLAccessor(_target), _reference); +} +TEST_SUITE_END() // S32 + +TEST_SUITE(S16) +FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionAddOpFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(framework::dataset::make("ElementwiseOp", { ArithmeticOperation::ADD }), + datasets::SmallShapesNoBatches()), + framework::dataset::make("DataType", { DataType::S16 })), + framework::dataset::make("InPlace", { false }))) +{ + // Validate output + validate(CLAccessor(_target),
_reference); +} +FIXTURE_DATA_TEST_CASE(RunLarge, DynamicFusionAddOpFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(framework::dataset::make("ElementwiseOp", { ArithmeticOperation::ADD }), + datasets::LargeShapesNoBatches()), + framework::dataset::make("DataType", { DataType::S16 })), + framework::dataset::make("InPlace", { false }))) +{ + // Validate output + validate(CLAccessor(_target), _reference); +} +TEST_SUITE_END() // S16 + +TEST_SUITE(U8) +FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionAddOpFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(framework::dataset::make("ElementwiseOp", { ArithmeticOperation::ADD }), + datasets::SmallShapesNoBatches()), + framework::dataset::make("DataType", { DataType::U8 })), + framework::dataset::make("InPlace", { false }))) +{ + // Validate output + validate(CLAccessor(_target), _reference); +} +TEST_SUITE_END() // U8 + +TEST_SUITE_END() // ADD +TEST_SUITE_END() // DYNAMIC_FUSION +TEST_SUITE_END() // CL +} // namespace validation +} // namespace test +} // namespace arm_compute diff --git a/tests/validation/dynamic_fusion/gpu/cl/DirectConv2d.cpp b/tests/validation/dynamic_fusion/gpu/cl/DirectConv2d.cpp index 1f9319b10f..bfb9735599 100644 --- a/tests/validation/dynamic_fusion/gpu/cl/DirectConv2d.cpp +++ b/tests/validation/dynamic_fusion/gpu/cl/DirectConv2d.cpp @@ -22,21 +22,8 @@ * SOFTWARE. */ -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" - -#include "arm_compute/core/CL/CLKernelLibrary.h" -#include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h" -#include "arm_compute/dynamic_fusion/sketch/OperatorAttributes.h" -#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h" -#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h" - #include "tests/AssetsLibrary.h" #include "tests/CL/CLAccessor.h" -#include "tests/Globals.h" -#include "tests/IAccessor.h" -#include "tests/framework/Asserts.h" #include "tests/framework/Fixture.h" #include "tests/framework/Macros.h" #include "tests/framework/datasets/Datasets.h" @@ -46,12 +33,6 @@ #include "tests/datasets/SmallConvolutionLayerDataset.h" #include "tests/validation/fixtures/dynamic_fusion/gpu/cl/DirectConv2dFixture.h" -#ifdef ARM_COMPUTE_ASSERTS_ENABLED -#include "tests/SimpleTensorPrinter.h" -#endif /* ARM_COMPUTE_ASSERTS_ENABLED */ -#include "tests/framework/Asserts.h" -#include "tests/framework/Macros.h" -#include "tests/validation/Validation.h" namespace arm_compute { namespace test @@ -60,7 +41,7 @@ namespace validation { TEST_SUITE(CL) TEST_SUITE(DYNAMIC_FUSION) -TEST_SUITE(GPU_CONV2D) +TEST_SUITE(CONV2D) RelativeTolerance<float> tolerance_f32(0.01f); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */ RelativeTolerance<half_float::half> tolerance_f16(half_float::half(0.1)); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F16 */ @@ -79,7 +60,6 @@ FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionGpuConv2dFixture<float>, framework } TEST_SUITE_END() // FP32 -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC TEST_SUITE(FP16) FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionGpuConv2dFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallConvolutionLayerDataset(), framework::dataset::make("DataType", DataType::F16)), @@ -90,9 +70,8 @@ FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionGpuConv2dFixture<half>,
framework: validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num); } TEST_SUITE_END() // FP16 -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -TEST_SUITE_END() // GPU_CONV2D +TEST_SUITE_END() // CONV2D TEST_SUITE_END() // DYNAMIC_FUSION TEST_SUITE_END() // CL } // namespace validation diff --git a/tests/validation/fixtures/dynamic_fusion/gpu/cl/DirectConv2dFixture.h b/tests/validation/fixtures/dynamic_fusion/gpu/cl/DirectConv2dFixture.h index b0522488b4..e437c440d0 100644 --- a/tests/validation/fixtures/dynamic_fusion/gpu/cl/DirectConv2dFixture.h +++ b/tests/validation/fixtures/dynamic_fusion/gpu/cl/DirectConv2dFixture.h @@ -21,32 +21,23 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_TEST_DYNAMIC_FUSION_FIXTURE -#define ARM_COMPUTE_TEST_DYNAMIC_FUSION_FIXTURE +#ifndef TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_GPU_CL_DIRECTCONV2DFIXTURE +#define TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_GPU_CL_DIRECTCONV2DFIXTURE #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/runtime/CL/CLScheduler.h" - #include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h" #include "arm_compute/dynamic_fusion/sketch/OperatorAttributes.h" #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h" #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h" -#include "src/gpu/cl/operators/ClAdd.h" -#include "src/gpu/cl/operators/ClConv2d.h" - #include "tests/CL/CLAccessor.h" - -#include "tests/framework/Asserts.h" #include "tests/framework/Fixture.h" #include "tests/framework/Macros.h" - #include "tests/validation/Validation.h" #include "tests/validation/reference/ConvolutionLayer.h" -#include "tests/validation/reference/ElementwiseOperations.h" #include "tests/validation/reference/Permute.h" using namespace arm_compute::experimental::dynamic_fusion; @@ -136,10 +127,10 @@ protected: tensor->allocator()->allocate(); // Use ACL allocated memory } // Construct user tensors - CLTensor t_input{}; - CLTensor t_weight{}; - CLTensor t_bias{}; - CLTensor t_dst{}; + TensorType t_input{}; + TensorType t_weight{}; + TensorType t_bias{}; + TensorType t_dst{}; // Initialize user tensors t_input.allocator()->init(input_info); @@ -152,9 +143,10 @@ protected: t_weight.allocator()->allocate(); t_bias.allocator()->allocate(); t_dst.allocator()->allocate(); - fill(CLAccessor(t_input), 0); - fill(CLAccessor(t_weight), 1); - fill(CLAccessor(t_bias), 2); + + fill(AccessorType(t_input), 0); + fill(AccessorType(t_weight), 1); + fill(AccessorType(t_bias), 2); // Run runtime runtime.run({ &t_input, &t_weight, &t_bias, &t_dst }); @@ -187,15 +179,11 @@ protected: TensorType _target{}; SimpleTensor<T> _reference{}; DataType _data_type{}; - DataType _weights_data_type{}; DataType _bias_data_type{}; - DataType _output_data_type{}; DataLayout _data_layout{}; QuantizationInfo _quantization_info{}; QuantizationInfo _weight_quantization_info{}; bool _is_quantized = false; - bool _is_bfloat16 = false; - bool _mixed_layout = false; }; template <typename TensorType, typename AccessorType, typename FunctionType, typename T> @@ -207,10 +195,10 @@ public: const PadStrideInfo &info, const Size2D &dialation, DataType data_type, DataLayout data_layout, QuantizationInfo quantization_info) { DynamicFusionGpuConv2dValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(input_shape, weights_shape, output_shape, bias_shape, info, 
dialation, - data_type, data_layout, quantization_info, quantization_info); + data_type, data_layout, quantization_info, quantization_info); } }; } // namespace validation } // namespace test } // namespace arm_compute -#endif /* ARM_COMPUTE_TEST_DYNAMIC_FUSION_FIXTURE */ +#endif /* TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_GPU_CL_DIRECTCONV2DFIXTURE */ diff --git a/tests/validation/fixtures/dynamic_fusion/gpu/cl/ElementwiseBinaryFixture.h b/tests/validation/fixtures/dynamic_fusion/gpu/cl/ElementwiseBinaryFixture.h new file mode 100644 index 0000000000..d11237748f --- /dev/null +++ b/tests/validation/fixtures/dynamic_fusion/gpu/cl/ElementwiseBinaryFixture.h @@ -0,0 +1,284 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_GPU_CL_ELEMENTWISEBINARYFIXTURE +#define TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_GPU_CL_ELEMENTWISEBINARYFIXTURE + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h" +#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h" + +#include "tests/CL/CLAccessor.h" +#include "tests/framework/Fixture.h" +#include "tests/framework/Macros.h" +#include "tests/validation/Validation.h" +#include "tests/validation/reference/ElementwiseOperations.h" +#include "tests/validation/reference/Permute.h" + +using namespace arm_compute::experimental::dynamic_fusion; + +namespace arm_compute +{ +namespace test +{ +namespace validation +{ +template <typename TensorType, typename AccessorType, typename FunctionType, typename T> +class DynamicFusionGpuElementwiseBinaryValidationGenericFixture : public framework::Fixture +{ +public: + template <typename...> + void setup(ArithmeticOperation op, TensorShape shape0, TensorShape shape1, TensorShape shape2, const DataType data_type, const bool is_inplace) + { + _op = op; + _is_inplace = is_inplace; + _data_type = data_type; + _fuse = shape2.total_size() != 0; + ARM_COMPUTE_ERROR_ON_MSG(_fuse && _is_inplace, "In-place computation is not yet supported for the fusing case."); + _target = compute_target(shape0, shape1, shape2); + _reference = compute_reference(shape0, shape1, shape2); + } + +protected: + template <typename U> + void fill(U &&tensor, int i) + { + if(is_data_type_float(tensor.data_type())) + { + switch(_op) + { + case ArithmeticOperation::DIV: + library->fill_tensor_uniform_ranged(tensor, i, { std::pair<float, float>(-0.001f, 0.001f) }); + break; + case ArithmeticOperation::POWER: + library->fill_tensor_uniform(tensor, i, 0.0f, 5.0f); + break; + default: + library->fill_tensor_uniform(tensor, i); + } + } + else if(tensor.data_type() == DataType::S32) + { + switch(_op) + { + case ArithmeticOperation::DIV: + library->fill_tensor_uniform_ranged(tensor, i, { std::pair<int32_t, int32_t>(-1, 1) }); + break; + default: + library->fill_tensor_uniform(tensor, i); + } + } + else + { + library->fill_tensor_uniform(tensor, i); + } + } + + TensorType compute_target(TensorShape shape0, TensorShape shape1, TensorShape shape2) + { + // Create a new workload sketch + auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context(); + auto gpu_ctx = GpuWorkloadContext{ &cl_compile_ctx }; + GpuWorkloadSketch sketch{ &gpu_ctx }; + TensorInfo dst_info{}; + TensorInfo dst_info_fuse{}; + + // Fuse the first elementwise binary op + auto lhs_info = sketch.create_tensor_info(shape0, 1, _data_type); + auto rhs_info = sketch.create_tensor_info(TensorInfo(shape1, 1, _data_type)); + TensorInfo rhs_info_fuse; + + // Root op: write either to a new destination tensor or in-place to lhs + if(!_is_inplace) + { + dst_info = sketch.create_tensor_info(TensorInfo(1, _data_type)); + + FunctionType::create_op(sketch, &lhs_info, &rhs_info, &dst_info); + } + else + { + FunctionType::create_op(sketch, &lhs_info, &rhs_info, &lhs_info); + } + + if(_fuse) + { + // Fuse the second elementwise binary op + rhs_info_fuse = sketch.create_tensor_info(TensorInfo(shape2, 1, _data_type)); + dst_info_fuse = sketch.create_tensor_info(); + FunctionType::create_op(sketch, &dst_info, &rhs_info_fuse, &dst_info_fuse); + } + + // Configure runtime + ClWorkloadRuntime runtime; + runtime.configure(sketch); + + // (Important) Allocate auxiliary tensor memory if there
are any + for(auto &data : runtime.get_auxiliary_tensors()) + { + TensorType *tensor = data.first; + AuxMemoryInfo aux_mem_req = data.second; + tensor->allocator()->init(*data.first->info(), aux_mem_req.alignment); + tensor->allocator()->allocate(); + } + + // Construct user tensors + TensorType t_lhs{}; + TensorType t_rhs{}; + TensorType t_rhs_fuse{}; + TensorType t_dst{}; + TensorType t_dst_fuse{}; + + // Initialize user tensors + t_lhs.allocator()->init(lhs_info); + t_rhs.allocator()->init(rhs_info); + if(!_is_inplace) + { + t_dst.allocator()->init(dst_info); + if(_fuse) + { + t_rhs_fuse.allocator()->init(rhs_info_fuse); + t_dst_fuse.allocator()->init(dst_info_fuse); + } + } + + // Allocate and fill user tensors + // Instead of using ACL allocator, the user can choose to import memory into the tensors + t_lhs.allocator()->allocate(); + t_rhs.allocator()->allocate(); + if(!_is_inplace) + { + t_dst.allocator()->allocate(); + if(_fuse) + { + t_rhs_fuse.allocator()->allocate(); + t_dst_fuse.allocator()->allocate(); + } + } + + fill(AccessorType(t_lhs), 0); + fill(AccessorType(t_rhs), 1); + if(_fuse) + { + fill(AccessorType(t_rhs_fuse), 2); + } + // Run runtime + if(_is_inplace) + { + runtime.run({ &t_lhs, &t_rhs, &t_lhs }); + } + else + { + if(_fuse) + { + runtime.run({ &t_lhs, &t_rhs, &t_rhs_fuse, &t_dst_fuse }); + } + else + { + runtime.run({ &t_lhs, &t_rhs, &t_dst }); + } + } + + if(_is_inplace) + { + return t_lhs; + } + else if(_fuse) + { + return t_dst_fuse; + } + return t_dst; + } + + SimpleTensor<T> compute_reference(TensorShape shape0, TensorShape shape1, TensorShape shape2) + { + const TensorShape out_shape = TensorShape::broadcast_shape(shape0, shape1); + const TensorShape out_shape_fuse = TensorShape::broadcast_shape(out_shape, shape2); + + // Create reference + SimpleTensor<T> ref_lhs{ shape0, _data_type, 1, QuantizationInfo() }; + SimpleTensor<T> ref_rhs{ shape1, _data_type, 1, QuantizationInfo() }; + SimpleTensor<T> ref_rhs_fuse{ shape2, _data_type, 1, QuantizationInfo() }; + SimpleTensor<T> ref_dst{ out_shape, _data_type, 1, QuantizationInfo() }; + SimpleTensor<T> ref_dst_fuse{ out_shape_fuse, _data_type, 1, QuantizationInfo() }; + // Fill reference + fill(ref_lhs, 0); + fill(ref_rhs, 1); + + reference::arithmetic_operation<T>(_op, ref_lhs, ref_rhs, ref_dst, ConvertPolicy::WRAP); + if(_fuse) + { + fill(ref_rhs_fuse, 2); + reference::arithmetic_operation<T>(_op, ref_dst, ref_rhs_fuse, ref_dst_fuse, ConvertPolicy::WRAP); + } + SimpleTensor<T> *ret = _fuse ?
&ref_dst_fuse : &ref_dst; + return *ret; + } + + ArithmeticOperation _op{ ArithmeticOperation::ADD }; + TensorType _target{}; + SimpleTensor<T> _reference{}; + DataType _data_type{}; + DataLayout _data_layout{}; + bool _is_inplace{ false }; + bool _fuse{ false }; +}; + +template <typename TensorType, typename AccessorType, typename FunctionType, typename T> +class DynamicFusionGpuElementwiseBinaryOneOpValidationFixture : public DynamicFusionGpuElementwiseBinaryValidationGenericFixture<TensorType, AccessorType, FunctionType, T> +{ +public: + template <typename...> + void setup(ArithmeticOperation op, TensorShape shape, const DataType data_type, const bool is_inplace) + { + DynamicFusionGpuElementwiseBinaryValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(op, shape, shape, TensorShape(), data_type, is_inplace); + } +}; + +template <typename TensorType, typename AccessorType, typename FunctionType, typename T> +class DynamicFusionGpuElementwiseBinaryBroadcastOneOpValidationFixture : public DynamicFusionGpuElementwiseBinaryValidationGenericFixture<TensorType, AccessorType, FunctionType, T> +{ +public: + template <typename...> + void setup(ArithmeticOperation op, TensorShape shape0, TensorShape shape1, const DataType data_type, const bool is_inplace) + { + DynamicFusionGpuElementwiseBinaryValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(op, shape0, shape1, TensorShape(), data_type, is_inplace); + } +}; + +template <typename TensorType, typename AccessorType, typename FunctionType, typename T> +class DynamicFusionGpuElementwiseBinaryTwoOpsValidationFixture : public DynamicFusionGpuElementwiseBinaryValidationGenericFixture<TensorType, AccessorType, FunctionType, T> +{ +public: + template <typename...> + void setup(ArithmeticOperation op, TensorShape shape0, TensorShape shape1, TensorShape shape2, const DataType data_type, const bool is_inplace) + { + DynamicFusionGpuElementwiseBinaryValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(op, shape0, shape1, shape2, data_type, is_inplace); + } +}; + +} // namespace validation +} // namespace test +} // namespace arm_compute +#endif /* TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_GPU_CL_ELEMENTWISEBINARYFIXTURE */ diff --git a/utils/TypePrinter.h b/utils/TypePrinter.h index 2ca7ab9b7f..0122229ed2 100644 --- a/utils/TypePrinter.h +++ b/utils/TypePrinter.h @@ -1874,6 +1874,9 @@ inline ::std::ostream &operator<<(::std::ostream &os, const ArithmeticOperation case ArithmeticOperation::POWER: os << "POWER"; break; + case ArithmeticOperation::PRELU: + os << "PRELU"; + break; default: ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); } @@ -3413,7 +3416,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const experimental::dynami << "[" << "Padding=" << conv2d_attr.pad() << ", " << "Size2D=" << conv2d_attr.stride() << ", " - << "Dialation=" << conv2d_attr.dilation() << "]"; + << "Dilation=" << conv2d_attr.dilation() << "]"; return os; }
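
For reference, the end-to-end flow that the new Add tests and the ElementwiseBinary fixture exercise can be condensed into a minimal standalone sketch. This is a reader's aid rather than part of the patch: the shapes and the wrapper function are illustrative assumptions, while the calls themselves (create_tensor_info, GpuAdd::validate_op/create_op, ClWorkloadRuntime::configure/run, and the auxiliary-tensor allocation loop) mirror the fixture code above.

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h"
#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;
using namespace arm_compute::experimental::dynamic_fusion;

// Illustrative sketch (hypothetical wrapper, shapes chosen arbitrarily):
// fuse two adds, dst2 = (lhs + rhs) + rhs2, then execute the workload.
void run_two_fused_adds()
{
    // Build a sketch in the CL workload context
    auto              cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
    auto              gpu_ctx        = GpuWorkloadContext{ &cl_compile_ctx };
    GpuWorkloadSketch sketch{ &gpu_ctx };

    // Describe the inputs; destination infos are left uninitialized so that
    // create_op() can auto-initialize them, as the operator docs describe
    auto lhs_info  = sketch.create_tensor_info(TensorShape(27U, 13U, 2U), 1, DataType::F32);
    auto rhs_info  = sketch.create_tensor_info(TensorShape(27U, 13U, 2U), 1, DataType::F32);
    auto rhs2_info = sketch.create_tensor_info(TensorShape(27U, 13U, 2U), 1, DataType::F32);
    auto dst_info  = sketch.create_tensor_info();
    auto dst2_info = sketch.create_tensor_info();

    // validate_op() leaves the sketch unchanged on failure
    if(!bool(GpuAdd::validate_op(sketch, &lhs_info, &rhs_info, &dst_info)))
    {
        return;
    }
    GpuAdd::create_op(sketch, &lhs_info, &rhs_info, &dst_info);   // dst  = lhs + rhs
    GpuAdd::create_op(sketch, &dst_info, &rhs2_info, &dst2_info); // dst2 = dst + rhs2 (fused)

    // Compile the sketch into a runnable workload
    ClWorkloadRuntime runtime;
    runtime.configure(sketch);

    // Allocate any auxiliary tensors the runtime requires (e.g. the
    // intermediate dst), exactly as the fixture does
    for(auto &data : runtime.get_auxiliary_tensors())
    {
        CLTensor     *tensor      = data.first;
        AuxMemoryInfo aux_mem_req = data.second;
        tensor->allocator()->init(*data.first->info(), aux_mem_req.alignment);
        tensor->allocator()->allocate();
    }

    // Bind and allocate the user-visible tensors, then execute.
    // Note the intermediate dst is not passed to run(): it is owned by the runtime.
    CLTensor t_lhs{}, t_rhs{}, t_rhs2{}, t_dst2{};
    t_lhs.allocator()->init(lhs_info);
    t_rhs.allocator()->init(rhs_info);
    t_rhs2.allocator()->init(rhs2_info);
    t_dst2.allocator()->init(dst2_info);
    t_lhs.allocator()->allocate();
    t_rhs.allocator()->allocate();
    t_rhs2.allocator()->allocate();
    t_dst2.allocator()->allocate();
    // ... fill t_lhs / t_rhs / t_rhs2 with input data here ...

    runtime.run({ &t_lhs, &t_rhs, &t_rhs2, &t_dst2 });
}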