From 7dc0234331f2150a6b4ac5c2b49de419870f7cf5 Mon Sep 17 00:00:00 2001
From: Gunes Bayir
Date: Mon, 21 Nov 2022 21:46:50 +0000
Subject: Implement FP32/16 Depthwise Conv2d operator in dynamic fusion

This patch adds the Depthwise Conv2d operator to the dynamic fusion
interface and adds the associated tests.

Resolves: COMPMID-5517
Change-Id: I385c94dff7fd40c72b8337ef797e508df4499a82
Signed-off-by: Gunes Bayir
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8678
Tested-by: Arm Jenkins
Reviewed-by: SiCong Li
Reviewed-by: Gian Marco Iodice
Benchmark: Arm Jenkins
---
 Android.bp                                         |   4 +
 .../sketch/attributes/DepthwiseConv2dAttributes.h  |  75 ++++
 .../sketch/gpu/operators/GpuDepthwiseConv2d.h      |  95 ++++
 filelist.json                                      |   6 +-
 src/dynamic_fusion/sketch/OperatorAttributes.cpp   |   1 -
 .../attributes/DepthwiseConv2dAttributes.cpp       |  85 ++++
 .../components/cl/ClComponentDepthwiseConv2d.cpp   | 220 ++++++++++
 .../gpu/components/cl/ClComponentDepthwiseConv2d.h | 171 ++++++++
 .../gpu/components/cl/ClComponentDirectConv2d.cpp  |  10 +-
 .../sketch/gpu/operators/GpuDepthwiseConv2d.cpp    | 346 +++++++++++++++
 .../cl/ClTemplateDepthwiseConv2d.cpp               | 378 ++++++++++++++++
 .../template_writer/cl/ClTemplateDepthwiseConv2d.h | 111 +++++
 .../template_writer/cl/ClTemplateDirectConv2d.cpp  |  26 +-
 .../gpu/template_writer/cl/ClTemplateStore.cpp     |  23 +-
 .../dynamic_fusion/gpu/cl/DepthwiseConv2d.cpp      | 477 +++++++++++++++++++++
 .../dynamic_fusion/gpu/cl/DepthwiseConv2dFixture.h | 222 ++++++++++
 utils/TypePrinter.h                                |  36 +-
 17 files changed, 2238 insertions(+), 48 deletions(-)
 create mode 100644 arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h
 create mode 100644 arm_compute/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.h
 create mode 100644 src/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.cpp
 create mode 100644 src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp
 create mode 100644 src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h
 create mode 100644 src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp
 create mode 100644 src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp
 create mode 100644 src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.h
 create mode 100644 tests/validation/dynamic_fusion/gpu/cl/DepthwiseConv2d.cpp
 create mode 100644 tests/validation/fixtures/dynamic_fusion/gpu/cl/DepthwiseConv2dFixture.h

diff --git a/Android.bp b/Android.bp
index 90d34179d6..246cd59cf6 100644
--- a/Android.bp
+++ b/Android.bp
@@ -586,6 +586,7 @@ cc_library_static {
        "src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp",
        "src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp",
        "src/dynamic_fusion/sketch/OperatorAttributes.cpp",
+       "src/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.cpp",
        "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.cpp",
        "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp",
        "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp",
@@ -594,10 +595,13 @@ cc_library_static {
        "src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp",
        "src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp",
        "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp",
+       "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp",
        "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp",
        "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp",
        "src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp",
+
"src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp", "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp", + "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp", "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp", "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp", "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp", diff --git a/arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h b/arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h new file mode 100644 index 0000000000..6d05e9e4d6 --- /dev/null +++ b/arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_ATTRIBUTES_DEPTHWISECONV2DATTRIBUTES +#define ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_ATTRIBUTES_DEPTHWISECONV2DATTRIBUTES + +#include "arm_compute/core/Size2D.h" +#include "arm_compute/core/Types.h" +#include + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +/** Attributes are backend-agnostic parameters (in addition to the input/output tensors) of an operator. 
+ */ + +/** Depthwise Conv2d attributes */ +class DepthwiseConv2dAttributes +{ +public: + /** Set padding */ + DepthwiseConv2dAttributes &pad(const Padding2D &pad); + /** Get padding */ + Padding2D pad() const; + /** Set stride */ + DepthwiseConv2dAttributes &stride(const Size2D &stride); + /** Get stride */ + Size2D stride() const; + /** Set dilation */ + DepthwiseConv2dAttributes &dilation(const Size2D &dilation); + /** Get dilation */ + Size2D dilation() const; + /** Set depth multiplier */ + DepthwiseConv2dAttributes &depth_multiplier(const uint32_t &depth_multiplier); + /** Get depth multiplier */ + uint32_t depth_multiplier() const; + /** Set Dimension rounding type */ + DepthwiseConv2dAttributes &dimension_rounding_type(const DimensionRoundingType &dimension_rounding_type); + /** Get Dimension rounding type */ + DimensionRoundingType dimension_rounding_type() const; + +private: + Padding2D _pad{}; /**< Padding */ + Size2D _stride{ 1U, 1U }; /**< Stride */ + Size2D _dilation{ 1U, 1U }; /**< Dilation */ + uint32_t _depth_multiplier{ 1U }; /**< Depth multiplier */ + DimensionRoundingType _dimension_rounding_type{ DimensionRoundingType::FLOOR }; /**< Dimension rounding type */ +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif /* ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_ATTRIBUTES_DEPTHWISECONV2DATTRIBUTES */ diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.h new file mode 100644 index 0000000000..a36ab62143 --- /dev/null +++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUDEPTHWISECONV2D +#define ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUDEPTHWISECONV2D + +#include "arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +/** Forward declaration */ +class GpuWorkloadContext; +class GpuWorkloadSketch; + +/** Operator interface. */ +class GpuDepthwiseConv2d final +{ +public: + /** Attributes are a set of backend-agnostic parameters that define what an operator does */ + using Attributes = DepthwiseConv2dAttributes; + /** Create an operator and fuse it into the workload sketch. 
+ * @note If @ref validate_op() fails, the creation also fails and may throw an error. + * @note If @ref validate_op() fails, @p sketch remains unchanged and valid. + * + * Valid data type configurations: + * |src |wei |bia |dst | + * |:--------------|:--------------|:--------------|:--------------| + * |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 | + * + * Valid data layouts: + * - NHWC + * + * @param[in,out] sketch Workload sketch into which the operator will be fused + * @param[in] src Source tensor + * @param[in] wei Weight tensor + * @param[in] bia (Optional) Bias tensor + * @param[out] dst Destination tensor. If an uninitialized ITensorInfo is passed in, it will be auto-initialized + * @param[in] attributes Operator attributes + */ + static void create_op(GpuWorkloadSketch &sketch, + ITensorInfo *src, + ITensorInfo *wei, + ITensorInfo *bia, + ITensorInfo *dst, + const Attributes &attributes); + + /** Check if the operator configuration is supported, irrespective of fusion + * Similar to @ref GpuDepthwiseConv2d::create_op() + */ + static Status is_supported_op(const GpuWorkloadContext &context, + const ITensorInfo *src, + const ITensorInfo *wei, + const ITensorInfo *bia, + const ITensorInfo *dst, + const Attributes &attributes); + + /** Check if the operator configuration is supported and if it can be fused into the workload sketch. + * Similar to @ref GpuDepthwiseConv2d::create_op() + */ + static Status validate_op(const GpuWorkloadSketch &sketch, + const ITensorInfo *src, + const ITensorInfo *wei, + const ITensorInfo *bia, + const ITensorInfo *dst, + const Attributes &attributes); +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif /* ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUDEPTHWISECONV2D */ diff --git a/filelist.json b/filelist.json index 513ee6e393..4d8db19ea5 100644 --- a/filelist.json +++ b/filelist.json @@ -2121,7 +2121,8 @@ "dynamic_fusion": [ "src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp", "src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp", - + + "src/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.cpp", "src/dynamic_fusion/sketch/OperatorAttributes.cpp", "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.cpp", "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp", @@ -2131,9 +2132,12 @@ "src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp", "src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp", "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp", + "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp", "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp", "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp", "src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp", + "src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp", + "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp", "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp", "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp", "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp", diff --git a/src/dynamic_fusion/sketch/OperatorAttributes.cpp b/src/dynamic_fusion/sketch/OperatorAttributes.cpp index 51ec444587..205ce687a3 100644 --- a/src/dynamic_fusion/sketch/OperatorAttributes.cpp +++ b/src/dynamic_fusion/sketch/OperatorAttributes.cpp @@ -57,7 +57,6 @@ Size2D Conv2dAttributes::dilation() const { return _dilation; } - } // namespace dynamic_fusion } // namespace experimental 
} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.cpp b/src/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.cpp new file mode 100644 index 0000000000..3a5657e07b --- /dev/null +++ b/src/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.cpp @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +DepthwiseConv2dAttributes &DepthwiseConv2dAttributes::pad(const Padding2D &pad) +{ + _pad = pad; + return *this; +} +Padding2D DepthwiseConv2dAttributes::pad() const +{ + return _pad; +} +DepthwiseConv2dAttributes &DepthwiseConv2dAttributes::stride(const Size2D &stride) +{ + _stride = stride; + return *this; +} +Size2D DepthwiseConv2dAttributes::stride() const +{ + return _stride; +} +DepthwiseConv2dAttributes &DepthwiseConv2dAttributes::dilation(const Size2D &dilation) +{ + _dilation = dilation; + return *this; +} +Size2D DepthwiseConv2dAttributes::dilation() const +{ + return _dilation; +} + +DepthwiseConv2dAttributes &DepthwiseConv2dAttributes::depth_multiplier(const uint32_t &depth_multiplier) +{ + _depth_multiplier = depth_multiplier; + return *this; +} + +uint32_t DepthwiseConv2dAttributes::depth_multiplier() const +{ + return _depth_multiplier; +} + +DepthwiseConv2dAttributes &DepthwiseConv2dAttributes::dimension_rounding_type(const DimensionRoundingType &dimension_rounding_type) +{ + _dimension_rounding_type = dimension_rounding_type; + return *this; +} + +DimensionRoundingType DepthwiseConv2dAttributes::dimension_rounding_type() const +{ + return _dimension_rounding_type; +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp new file mode 100644 index 0000000000..5626093079 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp @@ -0,0 +1,220 @@ +/* + * Copyright (c) 2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "ClComponentDepthwiseConv2d.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h" +#include "src/core/CL/CLValidate.h" +#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +using Settings = ClComponentDepthwiseConv2dSettings; + +Settings &Settings::export_input_to_cl_image(bool cl_image) +{ + _export_input_to_cl_image = cl_image; + return *this; +} + +bool Settings::export_input_to_cl_image() const +{ + return _export_input_to_cl_image; +} + +Settings &Settings::export_weights_to_cl_image(bool cl_image) +{ + _export_weights_to_cl_image = cl_image; + return *this; +} + +bool Settings::export_weights_to_cl_image() const +{ + return _export_weights_to_cl_image; +} + +Settings &Settings::fast_relaxed_math(bool fast_relaxed_math) +{ + _fast_relaxed_math = fast_relaxed_math; + return *this; +} + +bool Settings::fast_relaxed_math() const +{ + return _fast_relaxed_math; +} + +Settings &Settings::is_fma_available(bool is_fma_available) +{ + _is_fma_available = is_fma_available; + return *this; +} + +bool Settings::is_fma_available() const +{ + return _is_fma_available; +} + +Settings &Settings::n0(unsigned int n0) +{ + _n0 = n0; + return *this; +} + +unsigned int Settings::n0() const +{ + return _n0; +} + +Settings &Settings::m0(unsigned int m0) +{ + _m0 = m0; + return *this; +} + +unsigned int Settings::m0() const +{ + return _m0; +} + +Status ClComponentDepthwiseConv2d::validate( + const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings) +{ + ARM_COMPUTE_UNUSED(properties, settings); + const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); + const auto wei = tensors.get_const_tensor(TensorType::ACL_SRC_1); + const auto bia = tensors.get_const_tensor(TensorType::ACL_SRC_2); + const auto dst = tensors.get_const_tensor(TensorType::ACL_DST_0); + + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, wei, dst); + + // 1. 
Check validity + // Matching data type + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, wei); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + if(bia != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bia); + } + + // Matching data layout + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, wei); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst); + if(bia != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, bia); + } + + // All tensor infos are initialized + ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON(wei->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0); + if(bia != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON(bia->tensor_shape().total_size() == 0); + } + // Device requirements are met + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); + // wei shape is correct + const DataLayout data_layout = src->data_layout(); + const size_t channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + + ARM_COMPUTE_RETURN_ERROR_ON(wei->dimension(channel_idx) != (src->dimension(channel_idx) * attributes.depth_multiplier())); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(wei->num_dimensions() > 3, "Weights can be at most 3 dimensional"); + + // dst shape is correct + const PadStrideInfo pad_stride_info = PadStrideInfo(attributes.stride().x(), attributes.stride().y(), + attributes.pad().left, attributes.pad().right, + attributes.pad().top, attributes.pad().bottom, + attributes.dimension_rounding_type()); + const ConvolutionInfo conv_info{ pad_stride_info, attributes.depth_multiplier(), ActivationLayerInfo(), attributes.dilation() }; + const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *wei, conv_info); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), output_shape); + + // Check strides and dilation + ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().first < 1); + ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().second < 1); + ARM_COMPUTE_RETURN_ERROR_ON((conv_info.dilation.x() < 1) || (conv_info.dilation.y() < 1)); + ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().first > 1 && settings.m0() != 1); + ARM_COMPUTE_RETURN_ERROR_ON(conv_info.dilation.x() > 1 && settings.m0() != 1); + + if(conv_info.depth_multiplier > 1 && settings.n0() > 1) + { + ARM_COMPUTE_RETURN_ERROR_ON((conv_info.depth_multiplier % settings.n0()) != 0); + } + + // Check export weights to cl image + ARM_COMPUTE_RETURN_ERROR_ON_MSG((settings.export_weights_to_cl_image() == true) && (export_to_cl_image(wei) == false), "Weights cannot be exported to cl_image!"); + ARM_COMPUTE_RETURN_ERROR_ON((settings.export_weights_to_cl_image() == true) && ((settings.n0() % 4) != 0)); + + ARM_COMPUTE_RETURN_ERROR_ON(wei->dimension(channel_idx) != (src->dimension(channel_idx) * conv_info.depth_multiplier)); + + // bia shape is correct + if(bia != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(bia->dimension(0) != output_shape[channel_idx], + "Biases size and number of dst feature maps should match"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(bia->num_dimensions() > 1, "Biases should be one dimensional"); + } + + // 2. 
Check support level
+    // Data type
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+    // Data layout
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);
+    // Exporting the input tensor to a cl_image (texture) is not yet supported
+    ARM_COMPUTE_RETURN_ERROR_ON((settings.export_input_to_cl_image() == true));
+
+    return Status{};
+}
+
+ClComponentDepthwiseConv2d::ClComponentDepthwiseConv2d(
+    ComponentId                      id,
+    const Properties                &properties,
+    const ArgumentPack<ITensorInfo> &tensors,
+    const Attributes                &attributes,
+    const Settings                  &settings)
+    : IGpuKernelComponent{ id, properties, tensors },
+      _component_writer{ std::make_unique<ClTemplateDepthwiseConv2d>(id, tensors, attributes, settings) }
+{
+}
+ClComponentDepthwiseConv2d::~ClComponentDepthwiseConv2d()
+{
+}
+const IGpuTemplateComponentWriter *ClComponentDepthwiseConv2d::template_writer() const
+{
+    return _component_writer.get();
+}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h
new file mode 100644
index 0000000000..0e2b5f14cb
--- /dev/null
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDEPTHWISECONV2D
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDEPTHWISECONV2D
+
+#include "arm_compute/core/Error.h"
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+#include <memory>
+
+namespace arm_compute
+{
+/** Forward declaration */
+class ITensorInfo;
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+template <typename T>
+class ArgumentPack;
+class DepthwiseConv2dAttributes;
+
+/** Component specific settings
+ */
+class ClComponentDepthwiseConv2dSettings
+{
+public:
+    /** Set export_input_to_cl_image flag */
+    ClComponentDepthwiseConv2dSettings &export_input_to_cl_image(bool cl_image);
+    /** Get export_input_to_cl_image flag */
+    bool export_input_to_cl_image() const;
+
+    /** Set export_weights_to_cl_image flag */
+    ClComponentDepthwiseConv2dSettings &export_weights_to_cl_image(bool cl_image);
+    /** Get export_weights_to_cl_image flag */
+    bool export_weights_to_cl_image() const;
+
+    /** Set fast_relaxed_math flag */
+    ClComponentDepthwiseConv2dSettings &fast_relaxed_math(bool fast_relaxed_math);
+    /** Get fast_relaxed_math flag */
+    bool fast_relaxed_math() const;
+
+    /** Set is_fma_available flag */
+    ClComponentDepthwiseConv2dSettings &is_fma_available(bool is_fma_available);
+    /** Get is_fma_available flag */
+    bool is_fma_available() const;
+
+    /** Set N0: number of columns processed by each thread */
+    ClComponentDepthwiseConv2dSettings &n0(unsigned int n0);
+    /** Get N0: number of columns processed by each thread */
+    unsigned int n0() const;
+
+    /** Set M0: number of rows processed by each thread */
+    ClComponentDepthwiseConv2dSettings &m0(unsigned int m0);
+    /** Get M0: number of rows processed by each thread */
+    unsigned int m0() const;
+
+private:
+    bool         _export_input_to_cl_image{ false };   /**< Export input to cl_image */
+    bool         _export_weights_to_cl_image{ false }; /**< Export the weights to cl_image */
+    bool         _fast_relaxed_math{ true };           /**< Enable/disable -cl-fast-relaxed-math flag */
+    bool         _is_fma_available{ false };           /**< Is fma instruction available */
+    unsigned int _n0{ 0 };                             /**< Number of columns processed by each thread */
+    unsigned int _m0{ 0 };                             /**< Number of rows processed by each thread */
+};
+
+/** Forward declaration */
+class ClTemplateDepthwiseConv2d;
+
+class ClComponentDepthwiseConv2d final : public IGpuKernelComponent
+{
+public:
+    /** Attributes are a set of backend-agnostic parameters that define what a component does */
+    using Attributes = DepthwiseConv2dAttributes;
+    /** Settings are a set of backend-specific parameters that influence the implementation of a component */
+    using Settings = ClComponentDepthwiseConv2dSettings;
+
+public:
+    /** Validate the component
+     *
+     * @param[in]     properties Component properties @ref Properties
+     * @param[in,out] tensors    Tensor arguments to the component
+     * @param[in]     attributes Component attributes @ref Attributes
+     * @param[in]     settings   Component settings @ref Settings
+     *
+     * @return Status Validation results
+     *
+     * Tensor argument names:
+     * - ACL_SRC_0: Input
+     * - ACL_SRC_1: Weight
+     * - ACL_SRC_2: Bias (Optional)
+     * - ACL_DST_0: Output
+     *
+     * Tensor argument constness:
+     * - ACL_SRC_0: Const
+     * - ACL_SRC_1: Const
+     * - ACL_SRC_2: Const
+     * - ACL_DST_0: Const
+     *
+     * Valid data layouts:
+     * - NHWC
+     *
+     * Valid data type configurations:
+     * |ACL_SRC_0      |ACL_SRC_1      |ACL_SRC_2      |ACL_DST_0      |
+     * |:--------------|:--------------|:--------------|:--------------|
+     * |F16            |F16
|F16 |F16 | + * |F32 |F32 |F32 |F32 | + */ + static Status validate( + const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings); + + /** Constructor + * + * Similar to @ref ClComponentDepthwiseConv2d::validate() + */ + ClComponentDepthwiseConv2d( + ComponentId id, + const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings); + + /** Destructor */ + ~ClComponentDepthwiseConv2d() override; + /** Prevent instances of this class from being copy constructed */ + ClComponentDepthwiseConv2d(const ClComponentDepthwiseConv2d &component) = delete; + /** Prevent instances of this class from being copied */ + ClComponentDepthwiseConv2d &operator=(const ClComponentDepthwiseConv2d &component) = delete; + /** Allow instances of this class to be move constructed */ + ClComponentDepthwiseConv2d(ClComponentDepthwiseConv2d &&component) = default; + /** Allow instances of this class to be moved */ + ClComponentDepthwiseConv2d &operator=(ClComponentDepthwiseConv2d &&component) = default; + /** Get template writer for the component */ + const IGpuTemplateComponentWriter *template_writer() const override; + /** Get component type */ + GpuComponentType type() const override + { + return GpuComponentType::Complex; + } + +private: + std::unique_ptr _component_writer; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDEPTHWISECONV2D */ diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp index e94cfd1581..dc05825500 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp @@ -35,26 +35,24 @@ namespace experimental { namespace dynamic_fusion { -using Settings = ClComponentDirectConv2dSettings; - -Settings &Settings::export_to_cl_image(bool cl_image) +ClComponentDirectConv2dSettings &ClComponentDirectConv2dSettings::export_to_cl_image(bool cl_image) { _export_to_cl_image = cl_image; return *this; } -bool Settings::export_to_cl_image() const +bool ClComponentDirectConv2dSettings::export_to_cl_image() const { return _export_to_cl_image; } -Settings &Settings::fast_relaxed_math(bool fast_relaxed_math) +ClComponentDirectConv2dSettings &ClComponentDirectConv2dSettings::fast_relaxed_math(bool fast_relaxed_math) { _fast_relaxed_math = fast_relaxed_math; return *this; } -bool Settings::fast_relaxed_math() const +bool ClComponentDirectConv2dSettings::fast_relaxed_math() const { return _fast_relaxed_math; } diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp new file mode 100644 index 0000000000..89f1e999b8 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp @@ -0,0 +1,346 @@ +/* + * Copyright (c) 2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.h" + +#include "arm_compute/core/utils/misc/ShapeCalculator.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/dynamic_fusion/sketch/ArgumentPack.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h" +#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +namespace +{ +bool export_weights_to_cl_image_heuristic(const ITensorInfo *weights, unsigned int depth_multiplier, GPUTarget gpu_target) +{ + if(!export_to_cl_image(weights)) + { + return false; + } + + const size_t idx_w = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH); + const size_t idx_h = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT); + const size_t kernel_w = weights->tensor_shape()[idx_w]; + const size_t kernel_h = weights->tensor_shape()[idx_h]; + + if(gpu_target == GPUTarget::G71 || get_arch_from_target(gpu_target) == GPUTarget::MIDGARD) + { + return false; + } + + if((kernel_w == 1) && (kernel_h == 1)) + { + return false; + } + + if(depth_multiplier > 1) + { + if((depth_multiplier % 4) != 0) + { + return false; + } + } + + return true; +} + +void initialize_dwc_native_compute_info(DWCComputeKernelInfo &dwc_compute_info, const ITensorInfo *input, const ITensorInfo *weights, + const DepthwiseConv2dAttributes &attributes, const GPUTarget gpu_target) +{ + const unsigned int depth_multiplier = attributes.depth_multiplier(); + + // Floating point path + // First check if we can export to cl_image. 
+ dwc_compute_info.export_input_to_cl_image = false; + dwc_compute_info.export_weights_to_cl_image = export_weights_to_cl_image_heuristic(weights, depth_multiplier, gpu_target); + + // Set n0 + if(depth_multiplier == 1) + { + if(dwc_compute_info.export_weights_to_cl_image == false && weights->data_type() == DataType::F16) + { + dwc_compute_info.n0 = 8; + } + else + { + dwc_compute_info.n0 = 4; + } + } + else + { + if((depth_multiplier % 4) == 0) + { + dwc_compute_info.n0 = 4; + } + else if((depth_multiplier % 2) == 0) + { + dwc_compute_info.n0 = 2; + } + else + { + dwc_compute_info.n0 = 1; + } + } + + dwc_compute_info.n0 = adjust_vec_size(dwc_compute_info.n0, weights->dimension(0)); + + // Set m0 only if stride_x == 1 and dilation_x == 1 + if(attributes.stride().x() == 1 && attributes.dilation().x() == 1) + { + const size_t idx_w = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH); + const size_t kernel_w = weights->tensor_shape()[idx_w]; + + if((kernel_w >= 9) || (kernel_w == 1)) + { + dwc_compute_info.m0 = 1; + } + else + { + if(weights->data_type() == DataType::F16) + { + if((input->dimension(1) % 5) == 0) + { + dwc_compute_info.m0 = 5; + } + else + { + dwc_compute_info.m0 = 4; + } + } + else + { + dwc_compute_info.m0 = 2; + } + } + } + else + { + dwc_compute_info.m0 = 1; + } + return; +} + +void calculate_and_init_dst_if_empty(ITensorInfo *dst, const ITensorInfo *src, const ITensorInfo *wei, const DepthwiseConv2dAttributes &attributes) +{ + if(dst->total_size() == 0U) + { + const PadStrideInfo pad_stride_info(attributes.stride().x(), + attributes.stride().y(), + attributes.pad().left, + attributes.pad().right, + attributes.pad().top, + attributes.pad().bottom, + attributes.dimension_rounding_type()); + + const ConvolutionInfo conv_info{ pad_stride_info, attributes.depth_multiplier(), ActivationLayerInfo(), attributes.dilation() }; + const TensorShape shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *wei, conv_info); + + auto_init_if_empty(*dst, src->clone()->set_tensor_shape(shape)); + } +} + +constexpr GpuOperatorType operator_type = GpuOperatorType::Complex; +} // namespace + +Status GpuDepthwiseConv2d::is_supported_op(const GpuWorkloadContext &context, + const ITensorInfo *src, + const ITensorInfo *wei, + const ITensorInfo *bia, + const ITensorInfo *dst, + const DepthwiseConv2dAttributes &attributes) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, wei, dst); + + // Auto initialize dst tensor info + TensorInfo dst_info_to_validate = *dst; + calculate_and_init_dst_if_empty(&dst_info_to_validate, src, wei, attributes); + + // Check support level + // Data type + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); + // Data layout + ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC); + + const GpuTarget gpu_target = context.gpu_target(); + + if(context.gpu_language() == GpuLanguage::OpenCL) + { + const CLCompileContext *cl_compile_ctx = context.cl_compile_context(); + ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr); + + // Validate Depthwise Conv2d Component + { + const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + auto settings = ClComponentDepthwiseConv2d::Settings(); + + DWCComputeKernelInfo dwc_info; + initialize_dwc_native_compute_info(dwc_info, src, wei, attributes, gpu_target); + + settings.fast_relaxed_math( + (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == 
GPUTarget::BIFROST)
+                                       && (dst_info_to_validate.data_type() == DataType::F32 || dst_info_to_validate.data_type() == DataType::F16));
+
+            settings.is_fma_available(get_arch_from_target(gpu_target) != GPUTarget::MIDGARD)
+                .m0(dwc_info.m0)
+                .n0(dwc_info.n0)
+                .export_input_to_cl_image(dwc_info.export_input_to_cl_image)
+                .export_weights_to_cl_image(dwc_info.export_weights_to_cl_image);
+
+            ArgumentPack<ITensorInfo> arguments;
+            arguments.add_const_tensor(ACL_SRC_0, src);
+            arguments.add_const_tensor(ACL_SRC_1, wei);
+            arguments.add_const_tensor(ACL_SRC_2, bia);
+            arguments.add_const_tensor(ACL_DST_0, &dst_info_to_validate);
+            ARM_COMPUTE_RETURN_ON_ERROR(ClComponentDepthwiseConv2d::validate(properties, arguments, attributes, settings));
+        }
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ERROR_MSG("Unimplemented Gpu language");
+    }
+
+    return Status{};
+}
+
+Status GpuDepthwiseConv2d::validate_op(const GpuWorkloadSketch         &sketch,
+                                       const ITensorInfo               *src,
+                                       const ITensorInfo               *wei,
+                                       const ITensorInfo               *bia,
+                                       const ITensorInfo               *dst,
+                                       const DepthwiseConv2dAttributes &attributes)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, wei, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id() || !wei->has_valid_id() || !dst->has_valid_id());
+
+    if(bia != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(!bia->has_valid_id());
+    }
+
+    // Auto initialize dst tensor info
+    TensorInfo dst_info_to_validate = *dst;
+    calculate_and_init_dst_if_empty(&dst_info_to_validate, src, wei, attributes);
+
+    // Perform fusion test
+    // Pack tensor infos
+    ArgumentPack<ITensorInfo> tensors;
+    tensors.add_const_tensor(ACL_SRC_0, src);
+    tensors.add_const_tensor(ACL_SRC_1, wei);
+    tensors.add_const_tensor(ACL_SRC_2, bia);
+    tensors.add_const_tensor(ACL_DST_0, &dst_info_to_validate);
+    const Operator op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!sketch.implementation().operator_group().try_add_operator(op),
+                                    "Operator fusion test failed.
This operator cannot be fused into the workload"); + + // Check if configuration is supported + return is_supported_op(*sketch.gpu_context(), src, wei, bia, &dst_info_to_validate, attributes); +} + +void GpuDepthwiseConv2d::create_op(GpuWorkloadSketch &sketch, + ITensorInfo *src, + ITensorInfo *wei, + ITensorInfo *bia, + ITensorInfo *dst, + const DepthwiseConv2dAttributes &attributes) +{ + // Assert validation + ARM_COMPUTE_ERROR_THROW_ON(GpuDepthwiseConv2d::validate_op(sketch, src, wei, bia, dst, attributes)); + ARM_COMPUTE_ERROR_ON_NULLPTR(src, wei, dst); + ARM_COMPUTE_LOG_PARAMS(src, wei, bia, dst, attributes); + + calculate_and_init_dst_if_empty(dst, src, wei, attributes); + + // Translate into components and add to component graph + GpuKernelComponentGraph &comp_graph = sketch.implementation().component_graph(); + const auto *sketch_ctx = sketch.implementation().context(); + const GpuTarget gpu_target = sketch_ctx->gpu_target(); + + if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + { + const auto cl_compile_ctx = sketch_ctx->cl_compile_context(); + ARM_COMPUTE_ERROR_ON(cl_compile_ctx == nullptr); + + // Add Depthwise Conv2d Component + { + const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + auto settings = ClComponentDepthwiseConv2d::Settings(); + + DWCComputeKernelInfo dwc_info; + initialize_dwc_native_compute_info(dwc_info, src, wei, attributes, gpu_target); + + settings.is_fma_available(get_arch_from_target(gpu_target) != GPUTarget::MIDGARD) + .m0(dwc_info.m0) + .n0(dwc_info.n0) + .export_input_to_cl_image(dwc_info.export_input_to_cl_image) + .export_weights_to_cl_image(dwc_info.export_weights_to_cl_image); + + if(settings.export_input_to_cl_image()) + { + arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(src); + } + + if(settings.export_weights_to_cl_image()) + { + arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(wei); + } + + ArgumentPack arguments; + arguments.add_const_tensor(ACL_SRC_0, src); + arguments.add_const_tensor(ACL_SRC_1, wei); + arguments.add_const_tensor(ACL_SRC_2, bia); + arguments.add_const_tensor(ACL_DST_0, dst); + comp_graph.add_new_component(properties, arguments, attributes, settings); + } + } + else + { + ARM_COMPUTE_ERROR("Unimplemented Gpu language"); + } + + // Set up fusion test by adding to the Operator Group + // Note this has to be performed after all the components have been successfully added to the component graph + + // Pack tensor infos + ArgumentPack tensors; + tensors.add_const_tensor(ACL_SRC_0, src); + tensors.add_const_tensor(ACL_SRC_1, wei); + tensors.add_const_tensor(ACL_SRC_2, bia); + tensors.add_const_tensor(ACL_DST_0, dst); + + const Operator op = sketch.implementation().operator_group().new_operator(operator_type, tensors); + sketch.implementation().operator_group().add_operator(op); +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp new file mode 100644 index 0000000000..389bd5c65f --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp @@ -0,0 +1,378 @@ +/* + * Copyright (c) 2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "ClTemplateDepthwiseConv2d.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +ClTemplateDepthwiseConv2d::ClTemplateDepthwiseConv2d(ComponentId id, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings) + : IGpuTemplateComponentWriter{ id, tensors }, + _src{}, + _weight{}, + _bias{}, + _dst{}, + _attributes{ attributes }, + _settings{ settings } +{ + _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); + _weight = this->tensors().get_const_tensor(TensorType::ACL_SRC_1); + if(this->tensors().get_const_tensor(TensorType::ACL_SRC_2)) + { + _bias = this->tensors().get_const_tensor(TensorType::ACL_SRC_2); + } + _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); + ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _weight, _dst); +} + +std::string ClTemplateDepthwiseConv2d::get_name() const +{ + return "depthwise_conv2d"; +} + +std::string ClTemplateDepthwiseConv2d::get_component_code(const ComponentGroup &comp_group) const +{ + ARM_COMPUTE_UNUSED(comp_group); + + constexpr int height_idx = 2; // Data Layout is NHWC + + std::string code = R"_( +//------------------ START KERNEL {{meta_kernel_id}} --------------------- +// IN_0(src) {{src}} +// IN_1(wei) {{weight}} +)_"; + + if(_bias != nullptr && _bias->has_valid_id()) + { + code += R"_( +// IN_1(bia) {{bias}} +)_"; + } + + code += R"_( +// OUT(dst, accum) {{dst}} + +TILE({{ACC_DATA_TYPE}}, M0, N0, {{dst}}); +TILE(uint, M0, 1, g_dst_indirect_y); + +{ +#define _IWEI_WIDTH {{WEI_WIDTH}} +#define _IWEI_HEIGHT {{WEI_HEIGHT}} +#define _IDST_WIDTH {{arg_dst}}_w +#define _IDST_HEIGHT {{arg_dst}}_h +#define _IM0_A M0_A +#define _IN0_A N0_A +#define _IM0_B _IWEI_WIDTH +#define _IN0_B N0 +#define _IBOUNDARY_CHECK (!((_IWEI_WIDTH == 1 && _IWEI_HEIGHT == 1 && {{PAD_LEFT}} == 0 && {{PAD_TOP}} == 0 && M0 == 1))) +)_"; + + code += R"_( + const int yo = g_ind_2 % {{arg_dst}}_h; + const int bout = g_ind_2 / {{arg_dst}}_h; +)_"; + + code += R"_( + + int xi = g_ind_1 * {{STRIDE_X}}; + int yi = yo * {{STRIDE_Y}}; + xi -= {{PAD_LEFT}}; + yi -= {{PAD_TOP}}; + + LOOP_UNROLLING(int, i, 0, 1, M0, + { + {{dst}}[i].v = 0; + }) +)_"; + + if(_weight->dimension(height_idx) < 5) + { + code += R"_( + LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT, +)_"; + } + else + 
{ + code += R"_( + for(int yk = 0; yk < _IWEI_HEIGHT; ++yk) +)_"; + } + + code += R"_( + { + TILE({{SRC_DATA_TYPE}}, _IM0_A, _IN0_A, a); + + LOOP_UNROLLING(int, i, 0, 1, _IM0_A, + { + a[i].v = 0; + }) + + T_LOAD_NHWC_WITH_DILATION({{SRC_DATA_TYPE}}, 1, _IM0_A, _IN0_A, {{SRC_TENSOR_TYPE}}, {{src}}, bout, yi + yk * {{DILATION_Y}}, xi, (g_ind_0 / {{DEPTH_MULTIPLIER}}), {{src}}_w, {{src}}_h, {{DILATION_X}}, 1, _IBOUNDARY_CHECK, a); + + TILE({{WEI_DATA_TYPE}}, _IM0_B, _IN0_B, b); + + T_LOAD({{WEI_DATA_TYPE}}, _IM0_B, _IN0_B, {{WEI_TENSOR_TYPE}}, {{weight}}, g_ind_0, yk * _IM0_B, 1, {{weight}}_stride_y, b); + + LOOP_UNROLLING(int, m0, 0, 1, M0, + { + LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH, + { +)_"; + + if(!_settings.is_fma_available()) + { + code += R"_( + {{dst}}[m0].v += a[xk + m0].v * b[xk].v; +)_"; + } + else + { + code += R"_( + {{dst}}[m0].v = fma(a[xk + m0].v, b[xk].v, {{dst}}[m0].v); +)_"; + } + + code += R"_( + }) + }) + } +)_"; + + if(_weight->dimension(height_idx) < 5) + { + code += R"_( + ) +)_"; + } + + if(_bias && _bias->has_valid_id()) + { + code += R"_( + TILE({{BIA_DATA_TYPE}}, 1, N0, {{bias}}); + + T_LOAD({{BIA_DATA_TYPE}}, 1, N0, BUFFER, {{bias}}, g_ind_0, 0, 0, 0, {{bias}}); + + T_ELTWISE_BROADCAST_ADD_X({{ACC_DATA_TYPE}}, M0, N0, {{dst}}, {{bias}}, {{dst}}); +)_"; + } + + code += R"_( + LOOP_UNROLLING(int, i, 0, 1, M0, + { + g_dst_indirect_y[i].v = (uint)min((int)(g_ind_1 + i), (int)({{arg_dst}}_w) - 1); + g_dst_indirect_y[i].v += (int)(g_ind_2 % {{arg_dst}}_h) * (int)({{arg_dst}}_w); + g_dst_indirect_y[i].v += (int)(g_ind_2 / {{arg_dst}}_h) * (int)({{arg_dst}}_w * {{arg_dst}}_h); + }) +} +//------------------ END KERNEL {{meta_kernel_id}} --------------------- +)_"; + + return code; +} + +void ClTemplateDepthwiseConv2d::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const +{ + const GpuKernelArgumentInfo::Type input_type = _settings.export_input_to_cl_image() ? + GpuKernelArgumentInfo::Type::Tensor_4D_t_Image : + GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer; + + vtable.declare_variable( + _src, + GpuKernelArgumentInfo(input_type), + comp_group.is_intermediate_tensor(_src), + "src"); + + const GpuKernelArgumentInfo::Type weight_type = _settings.export_weights_to_cl_image() ? 
+ GpuKernelArgumentInfo::Type::Tensor_4D_t_Image : + GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer; + + vtable.declare_variable( + _weight, + GpuKernelArgumentInfo(weight_type), + comp_group.is_intermediate_tensor(_weight), + "weight"); + + if(_bias != nullptr && _bias->has_valid_id()) // optional bias + { + vtable.declare_variable( + _bias, + GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Vector), + comp_group.is_intermediate_tensor(_bias), + "bias"); + } + vtable.declare_variable( + _dst, + GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + comp_group.is_intermediate_tensor(_dst), + "dst"); +} + +TagLUT ClTemplateDepthwiseConv2d::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const +{ + TagLUT lut{}; + + // Arguments and global shared variables + lut["src"] = vtable.get_variable(_src); + lut["weight"] = vtable.get_variable(_weight); + + if(_bias != nullptr && _bias->has_valid_id()) // optional bias + { + lut["bias"] = vtable.get_variable(_bias); + lut["BIA_DATA_TYPE"] = get_cl_type_from_data_type(_bias->data_type()); + } + lut["dst"] = vtable.get_variable(_dst); + + const auto dst_argument = vtable.get_variable(comp_group.get_dst_tensors()[0]); + lut["arg_dst"] = dst_argument.uniq_name; + + // Local build options + lut["meta_kernel_id"] = id(); + lut["ACC_DATA_TYPE"] = _src->data_type(); + lut["SRC_DATA_TYPE"] = _src->data_type(); + lut["WEI_DATA_TYPE"] = _weight->data_type(); + + switch(vtable.get_variable(_src).kernel_argument_info.type) + { + case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D: + case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D: + case GpuKernelArgumentInfo::Type::Tensor_4D_t_Image: + lut["SRC_TENSOR_TYPE"] = "IMAGE"; + break; + default: + lut["SRC_TENSOR_TYPE"] = "BUFFER"; + break; + } + + switch(vtable.get_variable(_weight).kernel_argument_info.type) + { + case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D: + case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D: + case GpuKernelArgumentInfo::Type::Tensor_4D_t_Image: + lut["WEI_TENSOR_TYPE"] = "IMAGE"; + break; + default: + lut["WEI_TENSOR_TYPE"] = "BUFFER"; + break; + } + + // Data Layout is NHWC + constexpr int width_idx = 1; + constexpr int height_idx = 2; + + lut["WEI_WIDTH"] = _weight->dimension(width_idx); + lut["WEI_HEIGHT"] = _weight->dimension(height_idx); + + lut["STRIDE_X"] = _attributes.stride().x(); + lut["STRIDE_Y"] = _attributes.stride().y(); + + lut["PAD_LEFT"] = _attributes.pad().left; + lut["PAD_TOP"] = _attributes.pad().top; + + lut["DILATION_X"] = _attributes.dilation().x(); + lut["DILATION_Y"] = _attributes.dilation().y(); + + lut["DEPTH_MULTIPLIER"] = _attributes.depth_multiplier(); + + return lut; +} + +CLBuildOptions ClTemplateDepthwiseConv2d::get_build_options(const ComponentGroup &comp_group) const +{ + ARM_COMPUTE_UNUSED(comp_group); + + constexpr unsigned int width_idx = 1; // Data Layout is NHWC + + const unsigned int n0 = _settings.n0(); + const unsigned int m0 = _settings.m0(); + const unsigned int m0_a = _weight->dimension(width_idx) + m0 - 1; + const unsigned int n0_a = _attributes.depth_multiplier() > 1 ? 
1 : n0; + const unsigned int partial_store_n0 = _dst->dimension(0) % n0; + + CLBuildOptions build_opts{}; + + if(_settings.fast_relaxed_math()) + { + build_opts.add_option("-cl-fast-relaxed-math"); + } + else + { + // -cl-fast-relaxed-math also sets -cl-finite-math-only and -cl-unsafe-math-optimizations + // to disable -cl-finite-math-only, we only include -cl-unsafe-math-optimizations + build_opts.add_option("-cl-unsafe-math-optimizations"); + } + + build_opts.add_option("-DN0=" + support::cpp11::to_string(n0)); + build_opts.add_option("-DM0=" + support::cpp11::to_string(m0)); + build_opts.add_option("-DN0_A=" + support::cpp11::to_string(n0_a)); + build_opts.add_option("-DM0_A=" + support::cpp11::to_string(m0_a)); + build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0)); + + return build_opts; +} + +std::string ClTemplateDepthwiseConv2d::get_config_id() const +{ + std::string config_id{}; + + config_id += support::cpp11::to_string(_src->dimension(0)); + config_id += "_"; + config_id += support::cpp11::to_string(_src->dimension(1)); + config_id += "_"; + config_id += support::cpp11::to_string(_src->dimension(2)); + config_id += "_"; + config_id += support::cpp11::to_string(_dst->dimension(0)); + config_id += "_"; + config_id += support::cpp11::to_string(_dst->dimension(1)); + config_id += "_"; + config_id += support::cpp11::to_string(_dst->dimension(2)); + config_id += "_"; + config_id += string_from_data_type(_src->data_type()); + + return config_id; +} + +std::set ClTemplateDepthwiseConv2d::get_headers_list() const +{ + return std::set{ "helpers.h", "tile_helpers.h" }; +} + +Window ClTemplateDepthwiseConv2d::get_window() const +{ + ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized"); + + Window win = calculate_max_window(*_dst, Steps(_settings.n0(), _settings.m0())); + return win.collapse(win, Window::DimZ); +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.h new file mode 100644 index 0000000000..84b689ef64 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.h @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEDEPTHWISECONV2D +#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEDEPTHWISECONV2D + +#include "arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h" +#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +class ClTemplateDepthwiseConv2d final : public IGpuTemplateComponentWriter +{ +public: + using Attributes = ClComponentDepthwiseConv2d::Attributes; + using Settings = ClComponentDepthwiseConv2d::Settings; + /** Constructor + * + * Similar to @ref ClComponentDepthwiseConv2d::validate() + * + * @param[in] id Component id + * @param[in] tensors Tensor arguments to the components + * @param[in] attributes Component attributes + * @param[in] settings Component settings + */ + ClTemplateDepthwiseConv2d(ComponentId id, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings); + /** Prevent instances of this class from being copy constructed */ + ClTemplateDepthwiseConv2d(const ClTemplateDepthwiseConv2d &depthwise_conv2d) = delete; + /** Prevent instances of this class from being copied */ + ClTemplateDepthwiseConv2d &operator=(const ClTemplateDepthwiseConv2d &depthwise_conv2d) = delete; + /** Allow instances of this class to be move constructed */ + ClTemplateDepthwiseConv2d(ClTemplateDepthwiseConv2d &&depthwise_conv2d) = default; + /** Allow instances of this class to be moved */ + ClTemplateDepthwiseConv2d &operator=(ClTemplateDepthwiseConv2d &&depthwise_conv2d) = default; + /** Generate kernel component name */ + std::string get_name() const override; + /** Generate kernel component code template + * + * @param[in] comp_group Component group of which the component is a part of + * + * @return std::string Component code + */ + std::string get_component_code(const ComponentGroup &comp_group) const override; + /** Declare all variables used by the component in the @p vtable + * + * @param[out] vtable Variable table + * @param[in] comp_group Component group of which the component is a part of + */ + void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override; + /** Generate the tag look-up table used to instantiate the component code. 
+ * + * @param[in] vtable Variable table + * @param[in] comp_group Component group of which the component is a part of + * + * @return TagLUT Tag lookup table + */ + TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override; + /** Generate the build options used in the component + * + * @param[in] comp_group Component group of which the component is a part of + * + * @return CLBuildOptions Build options + */ + CLBuildOptions get_build_options(const ComponentGroup &comp_group) const override; + /** Generate the component config id string used for tuning */ + std::string get_config_id() const override; + /** Generate the header list used in the component */ + std::set<std::string> get_headers_list() const override; + /** Generate the execution window for the component */ + Window get_window() const override; + +private: + const ITensorInfo *_src; + const ITensorInfo *_weight; + const ITensorInfo *_bias; + const ITensorInfo *_dst; + Attributes _attributes; + Settings _settings; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEDEPTHWISECONV2D */ diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp index 870de64eb8..7ad7dd69f0 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp @@ -86,11 +86,10 @@ std::string ClTemplateDirectConv2d::get_component_code(const ComponentGroup &com code += R"_( // OUT(dst, accum) {{dst}} -// Initialize the accumulators TILE({{ACC_DATA_TYPE}}, M0, N0, {{dst}}); +TILE(uint, M0, 1, g_dst_indirect_y); + { - // All the tensor dimensions are passed at compile time. - // In case of dynamic tensor support, the following dimensions should be passed as function argument.
#define _IWEI_WIDTH {{WEI_WIDTH}} #define _IWEI_HEIGHT {{WEI_HEIGHT}} #define _ISRC_WIDTH {{src}}_w @@ -101,8 +100,6 @@ TILE({{ACC_DATA_TYPE}}, M0, N0, {{dst}}); #define _IDST_CHANNELS {{arg_dst}}_c #define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT) - // .v = access the whole vector (OpenCL vector) - // .s[x] = access the vector element at position x (scalar access) TILE(int, M0, 1, xi); TILE(int, M0, 1, yi); @@ -132,7 +129,6 @@ TILE({{ACC_DATA_TYPE}}, M0, N0, {{dst}}); TILE({{SRC_DATA_TYPE}}, M0, K0, a); TILE({{WEI_DATA_TYPE}}, N0, K0, b); - // Initialize tiles LOOP_UNROLLING(int, i, 0, 1, M0, { a[i].v = {{ZERO_VALUE}}; @@ -143,32 +139,24 @@ TILE({{ACC_DATA_TYPE}}, M0, N0, {{dst}}); b[i].v = {{ZERO_VALUE}}; }) - // Load tile from the src tensor T_LOAD_NHWC_INDIRECT({{SRC_DATA_TYPE}}, M0, K0, {{SRC_TENSOR_TYPE}}, {{src}}, g_ind_2, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, {{src}}_stride_y, xi, yi, a); - // Load tile from the weights tensor T_LOAD({{WEI_DATA_TYPE}}, N0, K0, {{WEI_TENSOR_TYPE}}, {{weight}}, ck, g_ind_0 * _IY_MULTIPLIER + i, _IY_MULTIPLIER, {{weight}}_stride_y, b); - // Compute the matrix multiplication between two tiles T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, K0, NT, T, a, b, {{dst}}); ck += K0; } - - // We voluntarily use SRC_CHANNELS rather than _DSRC_CHANNELS - // This #if directive should be removed in case of dynamic tensor support )_"; if(leftover_loop) { code += R"_( - // Left-over accumulations for(; k < _ISRC_CHANNELS; ++k) { TILE({{SRC_DATA_TYPE}}, M0, 1, a); TILE({{WEI_DATA_TYPE}}, N0, 1, b); - // Initialize tiles LOOP_UNROLLING(int, i, 0, 1, M0, { a[i].v = {{ZERO_VALUE}}; @@ -179,14 +167,10 @@ TILE({{ACC_DATA_TYPE}}, M0, N0, {{dst}}); b[i].v = {{ZERO_VALUE}}; }) - // Load tile from the src tensor T_LOAD_NHWC_INDIRECT({{SRC_DATA_TYPE}}, M0, 1, {{SRC_TENSOR_TYPE}}, {{src}}, g_ind_2, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, {{src}}_stride_y, xi, yi, a); - // Load tile from the weights tensor - // The T_LOAD for the left-over elements can only use BUFFER because we load one element per iteration T_LOAD({{WEI_DATA_TYPE}}, N0, 1, BUFFER, {{weight}}, ck, g_ind_0 * _IY_MULTIPLIER + i, _IY_MULTIPLIER, {{weight}}_stride_y, b); - // Compute the matrix multiplication between two tiles T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, 1, NT, T, a, b, {{dst}}); ++ck; @@ -215,12 +199,16 @@ code += R"_( T_LOAD({{BIA_DATA_TYPE}}, 1, N0, BUFFER, {{bias}}, g_ind_0, 0, 1, 0, bias0); - // c = c + bias[broadcasted] T_ELTWISE_BROADCAST_ADD_X({{ACC_DATA_TYPE}}, M0, N0, {{dst}}, bias0, {{dst}}); )_"; } code += R"_( + LOOP_UNROLLING(int, i, 0, 1, M0, + { + g_dst_indirect_y[i].v = (uint)min(g_ind_1 + i, (int)({{arg_dst}}_w * {{arg_dst}}_h) - 1); + g_dst_indirect_y[i].v += g_ind_2 * (int)({{arg_dst}}_w * {{arg_dst}}_h); + }) } //------------------ END KERNEL {{meta_kernel_id}} --------------------- )_"; diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp index 6c4b8f52f2..bffb467ebb 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp @@ -46,31 +46,14 @@ std::string ClTemplateStore::get_name() const std::string ClTemplateStore::get_component_code(const ComponentGroup &comp_group) const { ARM_COMPUTE_UNUSED(comp_group); + return R"_( //------------------ START KERNEL {{meta_kernel_id}} STORE --------------------- { -// This also follows NHWC 
layout -// g_ind_0 maps to global_id(0) maps to Channel -// g_ind_1 maps to global_id(1) maps to Height and Weight (Collapsed Window) -// g_ind_2 maps to global_id(2) maps to N / Batch -#define _IDST_WIDTH {{dst}}_w -#define _IDST_HEIGHT {{dst}}_h - TILE(uint, M0, 1, dst_indirect_y); - - // Calculate the destination indirect Y - LOOP_UNROLLING(int, i, 0, 1, M0, - { - dst_indirect_y[i].v = (uint)min(g_ind_1 + i, (int)(_IDST_WIDTH * _IDST_HEIGHT) - 1); - dst_indirect_y[i].v += g_ind_2 * (int)(_IDST_WIDTH * _IDST_HEIGHT); - }) - bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0; - T_STORE_INDIRECT_WIDTH_SELECT({{DST_DATA_TYPE}}, M0, N0, PARTIAL_N0, {{DST_TENSOR_TYPE}}, {{dst}}, g_ind_0, {{dst}}_stride_y, x_cond, {{src}}, dst_indirect_y); - -#undef _IDST_WIDTH -#undef _IDST_HEIGHT - //------------------ END KERNEL {{meta_kernel_id}} STORE --------------------- + T_STORE_INDIRECT_WIDTH_SELECT({{DST_DATA_TYPE}}, M0, N0, PARTIAL_N0, {{DST_TENSOR_TYPE}}, {{dst}}, g_ind_0, {{dst}}_stride_y, x_cond, {{src}}, g_dst_indirect_y); +//------------------ END KERNEL {{meta_kernel_id}} STORE --------------------- } )_"; diff --git a/tests/validation/dynamic_fusion/gpu/cl/DepthwiseConv2d.cpp b/tests/validation/dynamic_fusion/gpu/cl/DepthwiseConv2d.cpp new file mode 100644 index 0000000000..f08cc60ea2 --- /dev/null +++ b/tests/validation/dynamic_fusion/gpu/cl/DepthwiseConv2d.cpp @@ -0,0 +1,477 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.h" + +#include "tests/CL/CLAccessor.h" +#include "tests/datasets/DepthwiseConvolutionLayerDataset.h" +#include "tests/datasets/DilatedDepthwiseConvolutionLayerDataset.h" +#include "tests/framework/Asserts.h" +#include "tests/framework/Fixture.h" +#include "tests/framework/Macros.h" +#include "tests/framework/datasets/Datasets.h" +#include "tests/validation/Validation.h" +#include "tests/validation/fixtures/dynamic_fusion/gpu/cl/DepthwiseConv2dFixture.h" + +namespace arm_compute +{ +namespace test +{ +namespace validation +{ +const auto depth_multipliers = framework::dataset::make("DepthMultiplier", { 1U, 4U }); +const auto large_depth_multipliers = framework::dataset::make("DepthMultiplier", { 1, 2, 5, 8 }); + +TEST_SUITE(CL) +TEST_SUITE(DYNAMIC_FUSION) +TEST_SUITE(DEPTHWISE_CONV2D) + +RelativeTolerance<float> tolerance_f32(0.01f); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F32 */ +RelativeTolerance<half_float::half> tolerance_f16(half_float::half(0.1)); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F16 */ +constexpr float tolerance_num = 0.02f; /**< Tolerance number */ + +// *INDENT-OFF* +// clang-format off +DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip(zip( + framework::dataset::make("InputInfo", { TensorInfo(TensorShape(2U, 27U, 13U), 1, DataType::F32, DataLayout::NHWC), // Mismatching data type input/weights + TensorInfo(TensorShape(3U, 27U, 13U), 1, DataType::F32, DataLayout::NHWC), // Mismatching input feature maps + TensorInfo(TensorShape(2U, 27U, 13U), 1, DataType::F32, DataLayout::NHWC), // Mismatching depth multiplier + TensorInfo(TensorShape(2U, 27U, 13U), 1, DataType::F32, DataLayout::NHWC), // Invalid biases size + TensorInfo(TensorShape(2U, 27U, 13U), 1, DataType::F32, DataLayout::NHWC), // Invalid biases dimensions + TensorInfo(TensorShape(2U, 27U, 13U), 1, DataType::F32, DataLayout::NHWC), // Invalid output size + TensorInfo(TensorShape(8U, 27U, 13U), 1, DataType::F32, DataLayout::NHWC), // patch size bigger than input width + TensorInfo(TensorShape(8U, 27U, 13U), 1, DataType::F32, DataLayout::NHWC), // dilation < 1 + TensorInfo(TensorShape(8U, 27U, 13U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(8U, 32U, 13U), 1, DataType::QASYMM8, DataLayout::NHWC), // Unsupported data type + TensorInfo(TensorShape(8U, 32U, 13U), 1, DataType::QASYMM8_SIGNED, DataLayout::NHWC), // Unsupported data type + TensorInfo(TensorShape(8U, 32U, 13U), 1, DataType::QSYMM16, DataLayout::NHWC), // Unsupported data type + TensorInfo(TensorShape(8U, 32U, 13U), 1, DataType::QSYMM8, DataLayout::NHWC), // Unsupported data type + TensorInfo(TensorShape(8U, 32U, 13U), 1, DataType::QSYMM8_PER_CHANNEL, DataLayout::NHWC), // Unsupported data type + TensorInfo(TensorShape(8U, 32U, 13U), 1, DataType::QASYMM16, DataLayout::NHWC), // Unsupported data type + TensorInfo(TensorShape(8U, 32U, 13U), 1, DataType::U8, DataLayout::NHWC), // Unsupported data type + TensorInfo(TensorShape(8U, 32U, 13U), 1, DataType::S8, DataLayout::NHWC), // Unsupported data type + TensorInfo(TensorShape(8U, 32U, 13U), 1, DataType::U16, DataLayout::NHWC), // Unsupported data type + TensorInfo(TensorShape(8U, 32U, 13U), 1, DataType::S16, DataLayout::NHWC), // Unsupported data type + TensorInfo(TensorShape(8U, 32U, 13U), 1, DataType::U32, DataLayout::NHWC), // Unsupported data type + TensorInfo(TensorShape(8U, 32U, 13U), 
1, DataType::S32, DataLayout::NHWC), // Unsupported data type + TensorInfo(TensorShape(32U, 13U, 8U), 1, DataType::F32, DataLayout::NCHW), // Unsupported data layout + TensorInfo(TensorShape(8U, 32U, 13U, 4U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(8U, 32U, 13U, 4U), 1, DataType::F32, DataLayout::NHWC), // weight dimension > 3 + TensorInfo(TensorShape(8U, 32U, 13U, 4U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(8U, 32U, 13U, 4U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(8U, 32U, 13U, 4U), 1, DataType::F32, DataLayout::NHWC), + }), + framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(2U, 3U, 3U, 2U), 1, DataType::F16, DataLayout::NHWC), + TensorInfo(TensorShape(2U, 3U, 3U, 2U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(2U, 3U, 3U, 2U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(2U, 3U, 3U, 2U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(2U, 3U, 3U, 2U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(2U, 3U, 3U, 2U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(16U, 3U, 3U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(16U, 3U, 3U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(16U, 3U, 3U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(24U, 3U, 3U), 1, DataType::QASYMM8, DataLayout::NHWC), + TensorInfo(TensorShape(24U, 3U, 3U), 1, DataType::QASYMM8_SIGNED, DataLayout::NHWC), + TensorInfo(TensorShape(24U, 3U, 3U), 1, DataType::QSYMM16, DataLayout::NHWC), + TensorInfo(TensorShape(24U, 3U, 3U), 1, DataType::QSYMM8, DataLayout::NHWC), + TensorInfo(TensorShape(24U, 3U, 3U), 1, DataType::QSYMM8_PER_CHANNEL, DataLayout::NHWC), + TensorInfo(TensorShape(24U, 3U, 3U), 1, DataType::QASYMM16, DataLayout::NHWC), + TensorInfo(TensorShape(24U, 3U, 3U), 1, DataType::U8, DataLayout::NHWC), + TensorInfo(TensorShape(24U, 3U, 3U), 1, DataType::S8, DataLayout::NHWC), + TensorInfo(TensorShape(24U, 3U, 3U), 1, DataType::U16, DataLayout::NHWC), + TensorInfo(TensorShape(24U, 3U, 3U), 1, DataType::S16, DataLayout::NHWC), + TensorInfo(TensorShape(24U, 3U, 3U), 1, DataType::U32, DataLayout::NHWC), + TensorInfo(TensorShape(24U, 3U, 3U), 1, DataType::S32, DataLayout::NHWC), + TensorInfo(TensorShape(3U, 3U, 24U), 1, DataType::F32, DataLayout::NCHW), + TensorInfo(TensorShape(24U, 3U, 3U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(24U, 3U, 3U, 5U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(24U, 3U, 3U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(24U, 3U, 3U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(24U, 4U, 3U), 1, DataType::F32, DataLayout::NHWC), + })), + framework::dataset::make("BiasesInfo", { TensorInfo(TensorShape(2U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(2U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(2U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(4U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(2U, 2U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(2U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(16U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(16U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(16U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(24U), 1, DataType::S32, DataLayout::NHWC), + TensorInfo(TensorShape(24U), 1, DataType::S32, DataLayout::NHWC), + 
TensorInfo(TensorShape(24U), 1, DataType::S32, DataLayout::NHWC), + TensorInfo(TensorShape(24U), 1, DataType::S32, DataLayout::NHWC), + TensorInfo(TensorShape(24U), 1, DataType::S32, DataLayout::NHWC), + TensorInfo(TensorShape(24U), 1, DataType::S32, DataLayout::NHWC), + TensorInfo(TensorShape(24U), 1, DataType::S32, DataLayout::NHWC), + TensorInfo(TensorShape(24U), 1, DataType::S32, DataLayout::NHWC), + TensorInfo(TensorShape(24U), 1, DataType::S32, DataLayout::NHWC), + TensorInfo(TensorShape(24U), 1, DataType::S32, DataLayout::NHWC), + TensorInfo(TensorShape(24U), 1, DataType::S32, DataLayout::NHWC), + TensorInfo(TensorShape(24U), 1, DataType::S32, DataLayout::NHWC), + TensorInfo(TensorShape(24U), 1, DataType::S32, DataLayout::NCHW), + TensorInfo(TensorShape(24U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(24U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(24U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(24U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(24U), 1, DataType::F32, DataLayout::NHWC), + })), + framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(2U, 25U, 11U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(2U, 25U, 11U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(2U, 25U, 11U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(2U, 25U, 11U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(2U, 25U, 11U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(2U, 27U, 13U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(16U, 25U, 11U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(16U, 25U, 11U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(16U, 25U, 11U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(24U, 32U, 11U), 1, DataType::QASYMM8, DataLayout::NHWC), + TensorInfo(TensorShape(24U, 32U, 11U), 1, DataType::QASYMM8_SIGNED, DataLayout::NHWC), + TensorInfo(TensorShape(24U, 32U, 11U), 1, DataType::QSYMM16, DataLayout::NHWC), + TensorInfo(TensorShape(24U, 32U, 11U), 1, DataType::QSYMM8, DataLayout::NHWC), + TensorInfo(TensorShape(24U, 32U, 11U), 1, DataType::QSYMM8_PER_CHANNEL, DataLayout::NHWC), + TensorInfo(TensorShape(24U, 32U, 11U), 1, DataType::QASYMM16, DataLayout::NHWC), + TensorInfo(TensorShape(24U, 32U, 11U), 1, DataType::U8, DataLayout::NHWC), + TensorInfo(TensorShape(24U, 32U, 11U), 1, DataType::S8, DataLayout::NHWC), + TensorInfo(TensorShape(24U, 32U, 11U), 1, DataType::U16, DataLayout::NHWC), + TensorInfo(TensorShape(24U, 32U, 11U), 1, DataType::S16, DataLayout::NHWC), + TensorInfo(TensorShape(24U, 32U, 11U), 1, DataType::U32, DataLayout::NHWC), + TensorInfo(TensorShape(24U, 32U, 11U), 1, DataType::S32, DataLayout::NHWC), + TensorInfo(TensorShape(32U, 11U, 24U), 1, DataType::F32, DataLayout::NCHW), + TensorInfo(TensorShape(24U, 32U, 11U, 4U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(24U, 32U, 11U, 4U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(24U, 33U, 14U, 4U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(24U, 17U, 5U, 4U), 1, DataType::F32, DataLayout::NHWC), + TensorInfo(TensorShape(24U, 15U, 4U, 4U), 1, DataType::F32, DataLayout::NHWC), + })), + framework::dataset::make("Padding", { Padding2D(0, 0, 0, 0), + Padding2D(0, 0, 0, 0), + Padding2D(0, 0, 0, 0), + Padding2D(0, 0, 0, 0), + Padding2D(0, 0, 0, 0), + Padding2D(0, 0, 0, 0), + Padding2D(0, 0, 0, 0), + Padding2D(0, 0, 0, 0), + Padding2D(0, 0, 
0, 0), + Padding2D(1, 1, 0, 0), + Padding2D(1, 1, 0, 0), + Padding2D(1, 1, 0, 0), + Padding2D(1, 1, 0, 0), + Padding2D(1, 1, 0, 0), + Padding2D(1, 1, 0, 0), + Padding2D(1, 1, 0, 0), + Padding2D(1, 1, 0, 0), + Padding2D(1, 1, 0, 0), + Padding2D(1, 1, 0, 0), + Padding2D(1, 1, 0, 0), + Padding2D(1, 1, 0, 0), + Padding2D(1, 1, 0, 0), + Padding2D(1, 1, 0, 0), + Padding2D(1, 1, 0, 0), + Padding2D(2, 1, 2, 1), + Padding2D(2, 1, 2, 1), + Padding2D(2, 1, 2, 1), + })), + framework::dataset::make("Stride", { Size2D(1, 1), + Size2D(1, 1), + Size2D(1, 1), + Size2D(1, 1), + Size2D(1, 1), + Size2D(1, 1), + Size2D(1, 1), + Size2D(1, 1), + Size2D(1, 1), + Size2D(1, 1), + Size2D(1, 1), + Size2D(1, 1), + Size2D(1, 1), + Size2D(1, 1), + Size2D(1, 1), + Size2D(1, 1), + Size2D(1, 1), + Size2D(1, 1), + Size2D(1, 1), + Size2D(1, 1), + Size2D(1, 1), + Size2D(1, 1), + Size2D(1, 1), + Size2D(1, 1), + Size2D(1, 1), + Size2D(2, 3), + Size2D(2, 3), + })), + framework::dataset::make("DepthMultiplier", { 1, + 1, + 3, + 1, + 1, + 1, + 2, + 2, + 2, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + 3, + })), + framework::dataset::make("Dilation", { Size2D(1U, 1U), + Size2D(1U, 1U), + Size2D(1U, 1U), + Size2D(1U, 1U), + Size2D(1U, 1U), + Size2D(1U, 1U), + Size2D(20U, 1U), + Size2D(0U, 1U), + Size2D(1U, 1U), + Size2D(1U, 1U), + Size2D(1U, 1U), + Size2D(1U, 1U), + Size2D(1U, 1U), + Size2D(1U, 1U), + Size2D(1U, 1U), + Size2D(1U, 1U), + Size2D(1U, 1U), + Size2D(1U, 1U), + Size2D(1U, 1U), + Size2D(1U, 1U), + Size2D(1U, 1U), + Size2D(1U, 1U), + Size2D(1U, 1U), + Size2D(1U, 1U), + Size2D(1U, 1U), + Size2D(1U, 1U), + Size2D(2U, 3U), + })), + framework::dataset::make("Expected", { false, false, false, false, false, false, false, false, true, false, + false, false, false, false, false, false, false, false, false, false, + false, false, true, false, true, true, true })), + input_info, weights_info, biases_info, output_info, padding, stride, depth_multiplier, dilation, expected) +{ + CLCompileContext cl_compile_ctx = CLKernelLibrary::get().get_compile_context(); + GpuWorkloadContext gpu_ctx = GpuWorkloadContext{ &cl_compile_ctx }; + GpuWorkloadSketch sketch{ &gpu_ctx }; + + const TensorInfo sketch_input_info = sketch.create_tensor_info(input_info); + const TensorInfo sketch_weights_info = sketch.create_tensor_info(weights_info); + const TensorInfo sketch_biases_info = sketch.create_tensor_info(biases_info); + const TensorInfo sketch_output_info = sketch.create_tensor_info(output_info); + + DepthwiseConv2dAttributes attributes {}; + attributes.pad(padding) + .stride(stride) + .dilation(dilation) + .depth_multiplier(depth_multiplier); + + const Status status = GpuDepthwiseConv2d::validate_op(sketch, &sketch_input_info, &sketch_weights_info, &sketch_biases_info, &sketch_output_info, attributes); + const bool res = bool(status); + ARM_COMPUTE_EXPECT(res == expected, framework::LogLevel::ERRORS); +} +// clang-format on +// *INDENT-ON* + +template <typename T> +using DynamicFusionGpuDepthwiseConv2dFixture = DynamicFusionGpuDepthwiseConv2dValidationFixture<CLTensor, CLAccessor, GpuDepthwiseConv2d, T>;
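Before the fixture-based runs, a condensed sketch of the call sequence those fixtures exercise (it mirrors compute_target() in the DepthwiseConv2dFixture.h added later in this patch; shapes and attributes are illustrative, and allocation/fill of the user tensors is elided):

    // Build a sketch containing a single depthwise conv2d and configure a runtime for it
    CLCompileContext   cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
    GpuWorkloadContext gpu_ctx{ &cl_compile_ctx };
    GpuWorkloadSketch  sketch{ &gpu_ctx };

    // NHWC tensors: 32 channels, 27x13 spatial, 3x3 kernel, depth multiplier 1
    TensorInfo input_info  = sketch.create_tensor_info(TensorInfo(TensorShape(32U, 27U, 13U), 1, DataType::F32, DataLayout::NHWC));
    TensorInfo weight_info = sketch.create_tensor_info(TensorInfo(TensorShape(32U, 3U, 3U), 1, DataType::F32, DataLayout::NHWC));
    TensorInfo bias_info   = sketch.create_tensor_info(TensorInfo(TensorShape(32U), 1, DataType::F32, DataLayout::NHWC));
    TensorInfo dst_info    = sketch.create_tensor_info(); // output shape is deduced by the operator

    DepthwiseConv2dAttributes attributes{};
    attributes.pad(Padding2D(1, 1, 1, 1)).stride(Size2D(1, 1)).dilation(Size2D(1, 1)).depth_multiplier(1);

    GpuDepthwiseConv2d::create_op(sketch, &input_info, &weight_info, &bias_info, &dst_info, attributes);

    ClWorkloadRuntime runtime;
    runtime.configure(sketch);
    // ... allocate/fill user tensors, then: runtime.run({ &t_input, &t_weight, &t_bias, &t_dst });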
+ +TEST_SUITE(Float) +TEST_SUITE(FP16) +TEST_SUITE(W3x3) +FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionGpuDepthwiseConv2dFixture<half>, framework::DatasetMode::ALL, + combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(), + depth_multipliers), + framework::dataset::make("DataType", DataType::F16)), + framework::dataset::make("DataLayout", DataLayout::NHWC))) +{ + validate(CLAccessor(_target), _reference, tolerance_f16); +} +FIXTURE_DATA_TEST_CASE(RunLarge, DynamicFusionGpuDepthwiseConv2dFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(), + large_depth_multipliers), + framework::dataset::make("DataType", DataType::F16)), + framework::dataset::make("DataLayout", DataLayout::NHWC))) +{ + validate(CLAccessor(_target), _reference, tolerance_f16); +} +TEST_SUITE(Dilation) +FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionGpuDepthwiseConv2dFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(), + depth_multipliers), + framework::dataset::make("DataType", DataType::F16)), + framework::dataset::make("DataLayout", { DataLayout::NHWC }))) +{ + validate(CLAccessor(_target), _reference, tolerance_f16); +} +FIXTURE_DATA_TEST_CASE(RunLarge, DynamicFusionGpuDepthwiseConv2dFixture<half>, framework::DatasetMode::NIGHTLY, + combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(), + large_depth_multipliers), + framework::dataset::make("DataType", DataType::F16)), + framework::dataset::make("DataLayout", { DataLayout::NHWC }))) +{ + validate(CLAccessor(_target), _reference, tolerance_f16); +} +TEST_SUITE_END() // Dilation +TEST_SUITE_END() // W3x3 + +TEST_SUITE(Generic) +FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionGpuDepthwiseConv2dFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(), + depth_multipliers), + framework::dataset::make("DataType", DataType::F16)), + framework::dataset::make("DataLayout", { DataLayout::NHWC }))) +{ + validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num); +} +FIXTURE_DATA_TEST_CASE(RunLarge, DynamicFusionGpuDepthwiseConv2dFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset(), + large_depth_multipliers), + framework::dataset::make("DataType", DataType::F16)), + framework::dataset::make("DataLayout", { DataLayout::NHWC }))) +{ + validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num); +} + +TEST_SUITE(Dilation) +FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionGpuDepthwiseConv2dFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(), + depth_multipliers), + framework::dataset::make("DataType", DataType::F16)), + framework::dataset::make("DataLayout", { DataLayout::NHWC }))) +{ + validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num); +} +FIXTURE_DATA_TEST_CASE(RunLarge, DynamicFusionGpuDepthwiseConv2dFixture<half>, framework::DatasetMode::NIGHTLY, + combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset(), + large_depth_multipliers), + framework::dataset::make("DataType", DataType::F16)), + framework::dataset::make("DataLayout", { DataLayout::NHWC }))) +{ + validate(CLAccessor(_target), _reference, tolerance_f16, tolerance_num); +} +TEST_SUITE_END() // Dilation +TEST_SUITE_END() // Generic +TEST_SUITE_END() // FP16
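The FP16 Generic runs above pass tolerance_num alongside the relative tolerance: the fraction of elements allowed to miss the per-element check (0.02f, i.e. 2%, here). Conceptually the pass criterion works along these lines (a simplified sketch of the framework's comparison, not its exact implementation; assumes equally sized, non-empty tensors):

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Each element is compared against a relative tolerance; the run passes if the
    // share of mismatches stays below the allowed ratio (e.g. tolerance_num above).
    bool run_passes(const std::vector<float> &target, const std::vector<float> &reference,
                    float rel_tol, float allowed_mismatch_ratio)
    {
        std::size_t mismatches = 0;
        for(std::size_t i = 0; i < target.size(); ++i)
        {
            if(std::fabs(target[i] - reference[i]) > rel_tol * std::fabs(reference[i]))
            {
                ++mismatches;
            }
        }
        return static_cast<float>(mismatches) / static_cast<float>(target.size()) <= allowed_mismatch_ratio;
    }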
+ +TEST_SUITE(FP32) +TEST_SUITE(W3x3) +FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionGpuDepthwiseConv2dFixture<float>, framework::DatasetMode::ALL, + combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset3x3(), + depth_multipliers), + framework::dataset::make("DataType", DataType::F32)), + framework::dataset::make("DataLayout", DataLayout::NHWC))) +{ + validate(CLAccessor(_target), _reference, tolerance_f32); +} +FIXTURE_DATA_TEST_CASE(RunLarge, DynamicFusionGpuDepthwiseConv2dFixture<float>, framework::DatasetMode::NIGHTLY, + combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(), + large_depth_multipliers), + framework::dataset::make("DataType", DataType::F32)), + framework::dataset::make("DataLayout", DataLayout::NHWC))) +{ + validate(CLAccessor(_target), _reference, tolerance_f32); +} + +TEST_SUITE(Dilation) + +FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionGpuDepthwiseConv2dFixture<float>, framework::DatasetMode::ALL, + combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset3x3(), + depth_multipliers), + framework::dataset::make("DataType", DataType::F32)), + framework::dataset::make("DataLayout", DataLayout::NHWC))) +{ + validate(CLAccessor(_target), _reference, tolerance_f32); +} +FIXTURE_DATA_TEST_CASE(RunLarge, DynamicFusionGpuDepthwiseConv2dFixture<float>, framework::DatasetMode::NIGHTLY, + combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(), + large_depth_multipliers), + framework::dataset::make("DataType", DataType::F32)), + framework::dataset::make("DataLayout", DataLayout::NHWC))) +{ + validate(CLAccessor(_target), _reference, tolerance_f32); +} +TEST_SUITE_END() // Dilation +TEST_SUITE_END() // W3x3 + +TEST_SUITE(Generic) +FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionGpuDepthwiseConv2dFixture<float>, framework::DatasetMode::ALL, + combine(combine(combine(datasets::SmallDepthwiseConvolutionLayerDataset(), + depth_multipliers), + framework::dataset::make("DataType", DataType::F32)), + framework::dataset::make("DataLayout", { DataLayout::NHWC }))) +{ + validate(CLAccessor(_target), _reference, tolerance_f32); +} + +FIXTURE_DATA_TEST_CASE(RunLarge, DynamicFusionGpuDepthwiseConv2dFixture<float>, framework::DatasetMode::NIGHTLY, + combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset(), + large_depth_multipliers), + framework::dataset::make("DataType", DataType::F32)), + framework::dataset::make("DataLayout", { DataLayout::NHWC }))) +{ + validate(CLAccessor(_target), _reference, tolerance_f32); +} + +FIXTURE_DATA_TEST_CASE(RunLargeKernelSize, DynamicFusionGpuDepthwiseConv2dFixture<float>, framework::DatasetMode::ALL, + combine(combine(combine(datasets::LargeKernelSizeDepthwiseConvolutionLayerNHWCDataset(), + framework::dataset::make("DepthMultiplier", { 1 })), + framework::dataset::make("DataType", DataType::F32)), + framework::dataset::make("DataLayout", { DataLayout::NHWC }))) +{ + validate(CLAccessor(_target), _reference, tolerance_f32); +} + +TEST_SUITE(Dilation) +FIXTURE_DATA_TEST_CASE(RunSmall, DynamicFusionGpuDepthwiseConv2dFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(), + depth_multipliers), + framework::dataset::make("DataType", DataType::F32)), + framework::dataset::make("DataLayout", { DataLayout::NHWC }))) +{ + validate(CLAccessor(_target), _reference, tolerance_f32); +} +FIXTURE_DATA_TEST_CASE(RunLarge, DynamicFusionGpuDepthwiseConv2dFixture<float>, framework::DatasetMode::NIGHTLY, + combine(combine(combine(datasets::LargeDepthwiseDilatedConvolutionLayerDataset3x3(), + large_depth_multipliers), + framework::dataset::make("DataType", DataType::F32)), + framework::dataset::make("DataLayout", { DataLayout::NHWC }))) +{ + validate(CLAccessor(_target), _reference, tolerance_f32); +} +TEST_SUITE_END() // Dilation +TEST_SUITE_END() // Generic +TEST_SUITE_END() // FP32 +TEST_SUITE_END() // Float +TEST_SUITE_END() // DEPTHWISE_CONV2D +TEST_SUITE_END() // DYNAMIC_FUSION +TEST_SUITE_END() // CL +} // namespace validation +} // namespace test +} // namespace arm_compute
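The fixture that follows selects its bias type with a std::conditional: quantized 8-bit input types accumulate bias in int32_t, while float types keep the input type. A minimal compile-time check of that rule (same trait logic as the TBias alias in the fixture below):

    #include <cstdint>
    #include <type_traits>

    // Integer (quantized) input types get an int32_t bias; floats keep T
    template <typename T>
    using TBias = typename std::conditional<std::is_same<typename std::decay<T>::type, uint8_t>::value
                                            || std::is_same<typename std::decay<T>::type, int8_t>::value,
                                            int32_t, T>::type;

    static_assert(std::is_same<TBias<uint8_t>, int32_t>::value, "quantized input -> int32_t bias");
    static_assert(std::is_same<TBias<float>, float>::value, "float input -> float bias");

    int main()
    {
        return 0;
    }

Only the float instantiations are exercised here, since this patch covers FP32/FP16; the quantized branch exists for symmetry with the other convolution fixtures.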
diff --git a/tests/validation/fixtures/dynamic_fusion/gpu/cl/DepthwiseConv2dFixture.h b/tests/validation/fixtures/dynamic_fusion/gpu/cl/DepthwiseConv2dFixture.h new file mode 100644 index 0000000000..c7600e082e --- /dev/null +++ b/tests/validation/fixtures/dynamic_fusion/gpu/cl/DepthwiseConv2dFixture.h @@ -0,0 +1,222 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_GPU_CL_DEPTHWISECONV2DFIXTURE +#define TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_GPU_CL_DEPTHWISECONV2DFIXTURE + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" + +#include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h" +#include "arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h" +#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h" +#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.h" + +#include "tests/CL/CLAccessor.h" + +#include "tests/framework/Asserts.h" +#include "tests/framework/Fixture.h" +#include "tests/framework/Macros.h" + +#include "tests/validation/Validation.h" +#include "tests/validation/reference/DepthwiseConvolutionLayer.h" + +using namespace arm_compute::experimental::dynamic_fusion; + +namespace arm_compute +{ +namespace test +{ +namespace validation +{ +template <typename TensorType, typename AccessorType, typename FunctionType, typename T> +class DynamicFusionGpuDepthwiseConv2dValidationGenericFixture : public framework::Fixture +{ +public: + using TBias = typename std::conditional < std::is_same<typename std::decay<T>::type, uint8_t>::value + || std::is_same<typename std::decay<T>::type, int8_t>::value, + int32_t, T >::type; // If T: uint8_t or int8_t then TBias: int32_t, otherwise TBias: T + + template <typename...> + void setup(TensorShape input_shape, Size2D kernel_size, const PadStrideInfo &pad_stride, const Size2D &dilation, + const unsigned int depth_multiplier, const DataType data_type, const DataLayout data_layout) + { + ARM_COMPUTE_ERROR_ON(data_layout != DataLayout::NHWC); // Dynamic fusion depthwise conv2d only supports NHWC layout + + DepthwiseConv2dAttributes dwc_conv2d_attr; + const Padding2D padding_2d(pad_stride.pad_left(), pad_stride.pad_right(), pad_stride.pad_top(), pad_stride.pad_bottom()); + dwc_conv2d_attr.pad(padding_2d) + .stride(Size2D(pad_stride.stride().first, 
pad_stride.stride().second)) + .dilation(dilation) + .depth_multiplier(depth_multiplier) + .dimension_rounding_type(pad_stride.round()); + + // Calculate Output and Weight Shapes + TensorShape weights_shape = TensorShape(kernel_size.width, kernel_size.height); + + const TensorInfo in_info(input_shape, 1, data_type); + const TensorInfo we_info(weights_shape, 1, data_type); + + const ConvolutionInfo info{ pad_stride, depth_multiplier, ActivationLayerInfo(), dilation }; + const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(in_info, we_info, info); + + weights_shape.set(2, output_shape.z()); + const TensorShape bias_shape = TensorShape(weights_shape[2]); + + _data_type = data_type; + _data_layout = data_layout; + _target = compute_target(input_shape, weights_shape, bias_shape, dwc_conv2d_attr); + _reference = compute_reference(input_shape, weights_shape, bias_shape, output_shape, dwc_conv2d_attr); + } + +protected: + template <typename U> + void fill(U &&tensor, int i) + { + switch(tensor.data_type()) + { + case DataType::F16: + { + arm_compute::utils::uniform_real_distribution_16bit<half> distribution{ -1.0f, 1.0f }; + library->fill(tensor, distribution, i); + break; + } + case DataType::F32: + { + std::uniform_real_distribution<float> distribution(-1.0f, 1.0f); + library->fill(tensor, distribution, i); + break; + } + default: + library->fill_tensor_uniform(tensor, i); + } + } + + // Given input is in nchw format + TensorType compute_target(TensorShape input_shape, TensorShape weights_shape, const TensorShape &bias_shape, const DepthwiseConv2dAttributes dwc_conv2d_attr) + { + ARM_COMPUTE_ERROR_ON(_data_layout != DataLayout::NHWC); + + // Our test shapes are assumed in NCHW data layout, thus the permutation + permute(input_shape, PermutationVector(2U, 0U, 1U)); + permute(weights_shape, PermutationVector(2U, 0U, 1U)); + + // Create a new workload sketch + auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context(); + auto gpu_ctx = GpuWorkloadContext{ &cl_compile_ctx }; + GpuWorkloadSketch sketch{ &gpu_ctx }; + + // Create sketch tensors + auto input_info = sketch.create_tensor_info(TensorInfo(input_shape, 1, _data_type, _data_layout)); + auto weight_info = sketch.create_tensor_info(TensorInfo(weights_shape, 1, _data_type, _data_layout)); + auto bias_info = sketch.create_tensor_info(TensorInfo(bias_shape, 1, _data_type, _data_layout)); + auto dst_info = sketch.create_tensor_info(); + FunctionType::create_op(sketch, &input_info, &weight_info, &bias_info, &dst_info, dwc_conv2d_attr); + + // Configure runtime + ClWorkloadRuntime runtime; + runtime.configure(sketch); + + // (Important) Allocate auxiliary tensor memory if there are any + for(auto &data : runtime.get_auxiliary_tensors()) + { + auto tensor = data.first; + const auto aux_mem_req = data.second; + tensor->allocator()->init(*data.first->info(), aux_mem_req.alignment); + tensor->allocator()->allocate(); + } + + // Construct user tensors + TensorType t_input{}; + TensorType t_weight{}; + TensorType t_bias{}; + TensorType t_dst{}; + + // Initialize user tensors + t_input.allocator()->init(input_info); + t_weight.allocator()->init(weight_info); + t_bias.allocator()->init(bias_info); + t_dst.allocator()->init(dst_info); + + // Allocate and fill user tensors + t_input.allocator()->allocate(); + t_weight.allocator()->allocate(); + t_bias.allocator()->allocate(); + t_dst.allocator()->allocate(); + + fill(AccessorType(t_input), 0); + fill(AccessorType(t_weight), 1); + fill(AccessorType(t_bias), 2); + + // Run runtime + 
runtime.run({ &t_input, &t_weight, &t_bias, &t_dst }); + return t_dst; + } + + SimpleTensor<T> compute_reference(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, + const TensorShape &output_shape, DepthwiseConv2dAttributes dwc_conv2d_attr) + { + // Create reference + SimpleTensor<T> src{ input_shape, _data_type, 1 }; + SimpleTensor<T> weight{ weights_shape, _data_type, 1 }; + SimpleTensor<TBias> bias{ bias_shape, _data_type, 1 }; + + fill(src, 0); + fill(weight, 1); + fill(bias, 2); + + auto src_nchw = src; + auto weights_nchw = weight; + auto bias_nchw = bias; + auto output_shape_nchw = output_shape; + + PadStrideInfo legacy_pad_stride(dwc_conv2d_attr.stride().x(), dwc_conv2d_attr.stride().y(), dwc_conv2d_attr.pad().left, dwc_conv2d_attr.pad().right, dwc_conv2d_attr.pad().top, + dwc_conv2d_attr.pad().bottom, + DimensionRoundingType{}); + auto dst_nchw = reference::depthwise_convolution(src_nchw, weights_nchw, bias_nchw, output_shape_nchw, legacy_pad_stride, dwc_conv2d_attr.depth_multiplier(), dwc_conv2d_attr.dilation()); + return dst_nchw; + } + + TensorType _target{}; + SimpleTensor<T> _reference{}; + DataType _data_type{}; + DataLayout _data_layout{}; +}; + +template <typename TensorType, typename AccessorType, typename FunctionType, typename T> +class DynamicFusionGpuDepthwiseConv2dValidationFixture : public DynamicFusionGpuDepthwiseConv2dValidationGenericFixture<TensorType, AccessorType, FunctionType, T> +{ +public: + template <typename...> + void setup(TensorShape input_shape, Size2D kernel_size, const PadStrideInfo &info, const Size2D &dilation, const unsigned int depth_multiplier, DataType data_type, DataLayout data_layout) + { + DynamicFusionGpuDepthwiseConv2dValidationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(input_shape, kernel_size, info, dilation, + depth_multiplier, data_type, data_layout); + } +}; +} // namespace validation +} // namespace test +} // namespace arm_compute +#endif /* TESTS_VALIDATION_FIXTURES_DYNAMIC_FUSION_GPU_CL_DEPTHWISECONV2DFIXTURE */
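A note on the shape handling above: the test shapes arrive in NCHW order, and the fixture permutes them with PermutationVector(2U, 0U, 1U) before building NHWC tensors. A minimal stand-in for that call, assuming ACL's dst[i] = src[perm[i]] permute semantics:

    #include <array>
    #include <cstddef>
    #include <cstdio>

    int main()
    {
        // An NCHW TensorShape stores (W, H, C); e.g. W=13, H=27, C=2
        const std::array<unsigned int, 3> src{ 13U, 27U, 2U };
        const std::array<unsigned int, 3> perm{ 2U, 0U, 1U }; // PermutationVector(2U, 0U, 1U)
        std::array<unsigned int, 3>       dst{};
        for(std::size_t i = 0; i < dst.size(); ++i)
        {
            dst[i] = src[perm[i]];
        }
        // Prints (2, 13, 27): channels become the innermost dimension, i.e. NHWC
        std::printf("(%u, %u, %u)\n", dst[0], dst[1], dst[2]);
        return 0;
    }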
diff --git a/utils/TypePrinter.h b/utils/TypePrinter.h index 8b50e9d1ef..2ca7ab9b7f 100644 --- a/utils/TypePrinter.h +++ b/utils/TypePrinter.h @@ -39,6 +39,7 @@ #include "arm_compute/core/experimental/IPostOp.h" #include "arm_compute/core/experimental/PostOps.h" #include "arm_compute/dynamic_fusion/sketch/OperatorAttributes.h" +#include "arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h" #include "arm_compute/runtime/CL/CLTunerTypes.h" #include "arm_compute/runtime/CL/CLTypes.h" #include "arm_compute/runtime/FunctionDescriptors.h" @@ -3412,7 +3413,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const experimental::dynami << "[" << "Padding=" << conv2d_attr.pad() << ", " << "Size2D=" << conv2d_attr.stride() << ", " - << "Dialation=" << conv2d_attr.dilation() << "]"; + << "Dilation=" << conv2d_attr.dilation() << "]"; return os; } @@ -3428,6 +3429,39 @@ inline std::string to_string(const experimental::dynamic_fusion::Conv2dAttribute str << conv2d_attr; return str.str(); } + +/** Formatted output of the arm_compute::experimental::dynamic_fusion::DepthwiseConv2dAttributes type. + * + * @param[out] os Output stream. + * @param[in] dw_conv2d_attr arm_compute::experimental::dynamic_fusion::DepthwiseConv2dAttributes type to output. + * + * @return Modified output stream. + */ +inline ::std::ostream &operator<<(::std::ostream &os, const experimental::dynamic_fusion::DepthwiseConv2dAttributes &dw_conv2d_attr) +{ + os << "DepthwiseConv2dAttributes=" + << "[" + << "Padding=" << dw_conv2d_attr.pad() << ", " + << "Size2D=" << dw_conv2d_attr.stride() << ", " + << "Depth Multiplier=" << dw_conv2d_attr.depth_multiplier() << ", " + << "Dilation=" << dw_conv2d_attr.dilation() << ", " + << "DimensionRoundingType=" << dw_conv2d_attr.dimension_rounding_type() << "]"; + + return os; +} +/** Formatted output of the arm_compute::experimental::dynamic_fusion::DepthwiseConv2dAttributes type. + * + * @param[in] dw_conv2d_attr arm_compute::experimental::dynamic_fusion::DepthwiseConv2dAttributes type to output. + * + * @return Formatted string. + */ +inline std::string to_string(const experimental::dynamic_fusion::DepthwiseConv2dAttributes &dw_conv2d_attr) +{ + std::stringstream str; + str << dw_conv2d_attr; + return str.str(); +} + } // namespace arm_compute #endif /* __ARM_COMPUTE_TYPE_PRINTER_H__ */ -- cgit v1.2.1