From 7dc0234331f2150a6b4ac5c2b49de419870f7cf5 Mon Sep 17 00:00:00 2001
From: Gunes Bayir
Date: Mon, 21 Nov 2022 21:46:50 +0000
Subject: Implement FP32/16 Depthwise Conv2d operator in dynamic fusion

This patch adds the Depthwise Conv2d operator to the dynamic fusion
interface, together with the associated tests.

Resolves: COMPMID-5517
Change-Id: I385c94dff7fd40c72b8337ef797e508df4499a82
Signed-off-by: Gunes Bayir
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8678
Tested-by: Arm Jenkins
Reviewed-by: SiCong Li
Reviewed-by: Gian Marco Iodice
Benchmark: Arm Jenkins
---
 src/dynamic_fusion/sketch/OperatorAttributes.cpp   |   1 -
 .../attributes/DepthwiseConv2dAttributes.cpp       |  85 +++++
 .../components/cl/ClComponentDepthwiseConv2d.cpp   | 220 ++++++++++++
 .../gpu/components/cl/ClComponentDepthwiseConv2d.h | 171 ++++++++++
 .../gpu/components/cl/ClComponentDirectConv2d.cpp  |  10 +-
 .../sketch/gpu/operators/GpuDepthwiseConv2d.cpp    | 346 +++++++++++++++++++
 .../cl/ClTemplateDepthwiseConv2d.cpp               | 378 +++++++++++++++++++
 .../template_writer/cl/ClTemplateDepthwiseConv2d.h | 111 ++++++
 .../template_writer/cl/ClTemplateDirectConv2d.cpp  |  26 +-
 .../gpu/template_writer/cl/ClTemplateStore.cpp     |  23 +-
 10 files changed, 1325 insertions(+), 46 deletions(-)
 create mode 100644 src/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.cpp
 create mode 100644 src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp
 create mode 100644 src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h
 create mode 100644 src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp
 create mode 100644 src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp
 create mode 100644 src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.h
(limited to 'src/dynamic_fusion/sketch')

diff --git a/src/dynamic_fusion/sketch/OperatorAttributes.cpp b/src/dynamic_fusion/sketch/OperatorAttributes.cpp
index 51ec444587..205ce687a3 100644
--- a/src/dynamic_fusion/sketch/OperatorAttributes.cpp
+++ b/src/dynamic_fusion/sketch/OperatorAttributes.cpp
@@ -57,7 +57,6 @@ Size2D Conv2dAttributes::dilation() const
 {
     return _dilation;
 }
-
 } // namespace dynamic_fusion
 } // namespace experimental
 } // namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.cpp b/src/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.cpp
new file mode 100644
index 0000000000..3a5657e07b
--- /dev/null
+++ b/src/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.cpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +DepthwiseConv2dAttributes &DepthwiseConv2dAttributes::pad(const Padding2D &pad) +{ + _pad = pad; + return *this; +} +Padding2D DepthwiseConv2dAttributes::pad() const +{ + return _pad; +} +DepthwiseConv2dAttributes &DepthwiseConv2dAttributes::stride(const Size2D &stride) +{ + _stride = stride; + return *this; +} +Size2D DepthwiseConv2dAttributes::stride() const +{ + return _stride; +} +DepthwiseConv2dAttributes &DepthwiseConv2dAttributes::dilation(const Size2D &dilation) +{ + _dilation = dilation; + return *this; +} +Size2D DepthwiseConv2dAttributes::dilation() const +{ + return _dilation; +} + +DepthwiseConv2dAttributes &DepthwiseConv2dAttributes::depth_multiplier(const uint32_t &depth_multiplier) +{ + _depth_multiplier = depth_multiplier; + return *this; +} + +uint32_t DepthwiseConv2dAttributes::depth_multiplier() const +{ + return _depth_multiplier; +} + +DepthwiseConv2dAttributes &DepthwiseConv2dAttributes::dimension_rounding_type(const DimensionRoundingType &dimension_rounding_type) +{ + _dimension_rounding_type = dimension_rounding_type; + return *this; +} + +DimensionRoundingType DepthwiseConv2dAttributes::dimension_rounding_type() const +{ + return _dimension_rounding_type; +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp new file mode 100644 index 0000000000..5626093079 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp @@ -0,0 +1,220 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "ClComponentDepthwiseConv2d.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h" +#include "src/core/CL/CLValidate.h" +#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +using Settings = ClComponentDepthwiseConv2dSettings; + +Settings &Settings::export_input_to_cl_image(bool cl_image) +{ + _export_input_to_cl_image = cl_image; + return *this; +} + +bool Settings::export_input_to_cl_image() const +{ + return _export_input_to_cl_image; +} + +Settings &Settings::export_weights_to_cl_image(bool cl_image) +{ + _export_weights_to_cl_image = cl_image; + return *this; +} + +bool Settings::export_weights_to_cl_image() const +{ + return _export_weights_to_cl_image; +} + +Settings &Settings::fast_relaxed_math(bool fast_relaxed_math) +{ + _fast_relaxed_math = fast_relaxed_math; + return *this; +} + +bool Settings::fast_relaxed_math() const +{ + return _fast_relaxed_math; +} + +Settings &Settings::is_fma_available(bool is_fma_available) +{ + _is_fma_available = is_fma_available; + return *this; +} + +bool Settings::is_fma_available() const +{ + return _is_fma_available; +} + +Settings &Settings::n0(unsigned int n0) +{ + _n0 = n0; + return *this; +} + +unsigned int Settings::n0() const +{ + return _n0; +} + +Settings &Settings::m0(unsigned int m0) +{ + _m0 = m0; + return *this; +} + +unsigned int Settings::m0() const +{ + return _m0; +} + +Status ClComponentDepthwiseConv2d::validate( + const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings) +{ + ARM_COMPUTE_UNUSED(properties, settings); + const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); + const auto wei = tensors.get_const_tensor(TensorType::ACL_SRC_1); + const auto bia = tensors.get_const_tensor(TensorType::ACL_SRC_2); + const auto dst = tensors.get_const_tensor(TensorType::ACL_DST_0); + + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, wei, dst); + + // 1. 
Check validity + // Matching data type + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, wei); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + if(bia != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bia); + } + + // Matching data layout + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, wei); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst); + if(bia != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, bia); + } + + // All tensor infos are initialized + ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON(wei->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0); + if(bia != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON(bia->tensor_shape().total_size() == 0); + } + // Device requirements are met + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); + // wei shape is correct + const DataLayout data_layout = src->data_layout(); + const size_t channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + + ARM_COMPUTE_RETURN_ERROR_ON(wei->dimension(channel_idx) != (src->dimension(channel_idx) * attributes.depth_multiplier())); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(wei->num_dimensions() > 3, "Weights can be at most 3 dimensional"); + + // dst shape is correct + const PadStrideInfo pad_stride_info = PadStrideInfo(attributes.stride().x(), attributes.stride().y(), + attributes.pad().left, attributes.pad().right, + attributes.pad().top, attributes.pad().bottom, + attributes.dimension_rounding_type()); + const ConvolutionInfo conv_info{ pad_stride_info, attributes.depth_multiplier(), ActivationLayerInfo(), attributes.dilation() }; + const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *wei, conv_info); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), output_shape); + + // Check strides and dilation + ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().first < 1); + ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().second < 1); + ARM_COMPUTE_RETURN_ERROR_ON((conv_info.dilation.x() < 1) || (conv_info.dilation.y() < 1)); + ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().first > 1 && settings.m0() != 1); + ARM_COMPUTE_RETURN_ERROR_ON(conv_info.dilation.x() > 1 && settings.m0() != 1); + + if(conv_info.depth_multiplier > 1 && settings.n0() > 1) + { + ARM_COMPUTE_RETURN_ERROR_ON((conv_info.depth_multiplier % settings.n0()) != 0); + } + + // Check export weights to cl image + ARM_COMPUTE_RETURN_ERROR_ON_MSG((settings.export_weights_to_cl_image() == true) && (export_to_cl_image(wei) == false), "Weights cannot be exported to cl_image!"); + ARM_COMPUTE_RETURN_ERROR_ON((settings.export_weights_to_cl_image() == true) && ((settings.n0() % 4) != 0)); + + ARM_COMPUTE_RETURN_ERROR_ON(wei->dimension(channel_idx) != (src->dimension(channel_idx) * conv_info.depth_multiplier)); + + // bia shape is correct + if(bia != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(bia->dimension(0) != output_shape[channel_idx], + "Biases size and number of dst feature maps should match"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(bia->num_dimensions() > 1, "Biases should be one dimensional"); + } + + // 2. 
Check support level + // Data type + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); + // Data layout + ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC); + // Texture in the input tensor + ARM_COMPUTE_RETURN_ERROR_ON((settings.export_input_to_cl_image() == true)); + + return Status{}; +} + +ClComponentDepthwiseConv2d::ClComponentDepthwiseConv2d( + ComponentId id, + const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings) + : IGpuKernelComponent{ id, properties, tensors }, + _component_writer{ std::make_unique(id, tensors, attributes, settings) } +{ +} +ClComponentDepthwiseConv2d::~ClComponentDepthwiseConv2d() +{ +} +const IGpuTemplateComponentWriter *ClComponentDepthwiseConv2d::template_writer() const +{ + return _component_writer.get(); +} +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h new file mode 100644 index 0000000000..0e2b5f14cb --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDEPTHWISECONV2D
+#define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDEPTHWISECONV2D
+
+#include "arm_compute/core/Error.h"
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+#include <memory>
+
+namespace arm_compute
+{
+/** Forward declaration */
+class ITensorInfo;
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+template <typename T>
+class ArgumentPack;
+class DepthwiseConv2dAttributes;
+
+/** Component specific settings
+ */
+class ClComponentDepthwiseConv2dSettings
+{
+public:
+    /** Set export_input_to_cl_image flag */
+    ClComponentDepthwiseConv2dSettings &export_input_to_cl_image(bool cl_image);
+    /** Get export_input_to_cl_image flag */
+    bool export_input_to_cl_image() const;
+
+    /** Set export_weights_to_cl_image flag */
+    ClComponentDepthwiseConv2dSettings &export_weights_to_cl_image(bool cl_image);
+    /** Get export_weights_to_cl_image flag */
+    bool export_weights_to_cl_image() const;
+
+    /** Set fast_relaxed_math flag */
+    ClComponentDepthwiseConv2dSettings &fast_relaxed_math(bool fast_relaxed_math);
+    /** Get fast_relaxed_math flag */
+    bool fast_relaxed_math() const;
+
+    /** Set is_fma_available flag */
+    ClComponentDepthwiseConv2dSettings &is_fma_available(bool is_fma_available);
+    /** Get is_fma_available flag */
+    bool is_fma_available() const;
+
+    /** Set N0: number of columns processed by each thread */
+    ClComponentDepthwiseConv2dSettings &n0(unsigned int n0);
+    /** Get N0: number of columns processed by each thread */
+    unsigned int n0() const;
+
+    /** Set M0: number of rows processed by each thread */
+    ClComponentDepthwiseConv2dSettings &m0(unsigned int m0);
+    /** Get M0: number of rows processed by each thread */
+    unsigned int m0() const;
+
+private:
+    bool         _export_input_to_cl_image{ false };   /**< Export input to cl_image */
+    bool         _export_weights_to_cl_image{ false }; /**< Export the weights to cl_image */
+    bool         _fast_relaxed_math{ true };           /**< Enable/disable -cl-fast-relaxed-math flag */
+    bool         _is_fma_available{ false };           /**< Is fma instruction available */
+    unsigned int _n0{ 0 };                             /**< Number of columns processed by each thread */
+    unsigned int _m0{ 0 };                             /**< Number of rows processed by each thread */
+};
+
+/** Forward declaration */
+class ClTemplateDepthwiseConv2d;
+
+class ClComponentDepthwiseConv2d final : public IGpuKernelComponent
+{
+public:
+    /** Attributes are a set of backend-agnostic parameters that define what a component does */
+    using Attributes = DepthwiseConv2dAttributes;
+    /** Settings are a set of backend-specific parameters that influence the implementation of a component */
+    using Settings = ClComponentDepthwiseConv2dSettings;
+
+public:
+    /** Validate the component
+     *
+     * @param[in]     properties Component properties @ref Properties
+     * @param[in,out] tensors    Tensor arguments to the component
+     * @param[in]     attributes Component attributes @ref Attributes
+     * @param[in]     settings   Component settings @ref Settings
+     *
+     * @return Status  Validation results
+     *
+     * Tensor argument names:
+     * - ACL_SRC_0: Input
+     * - ACL_SRC_1: Weight
+     * - ACL_SRC_2: Bias (Optional)
+     * - ACL_DST_0: Output
+     *
+     * Tensor argument constness:
+     * - ACL_SRC_0: Const
+     * - ACL_SRC_1: Const
+     * - ACL_SRC_2: Const
+     * - ACL_DST_0: Const
+     *
+     * Valid data layouts:
+     * - NHWC
+     *
+     * Valid data type configurations:
+     * |ACL_SRC_0      |ACL_SRC_1      |ACL_SRC_2      |ACL_DST_0      |
+     * |:--------------|:--------------|:--------------|:--------------|
+     * |F16            |F16
|F16 |F16 | + * |F32 |F32 |F32 |F32 | + */ + static Status validate( + const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings); + + /** Constructor + * + * Similar to @ref ClComponentDepthwiseConv2d::validate() + */ + ClComponentDepthwiseConv2d( + ComponentId id, + const Properties &properties, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings); + + /** Destructor */ + ~ClComponentDepthwiseConv2d() override; + /** Prevent instances of this class from being copy constructed */ + ClComponentDepthwiseConv2d(const ClComponentDepthwiseConv2d &component) = delete; + /** Prevent instances of this class from being copied */ + ClComponentDepthwiseConv2d &operator=(const ClComponentDepthwiseConv2d &component) = delete; + /** Allow instances of this class to be move constructed */ + ClComponentDepthwiseConv2d(ClComponentDepthwiseConv2d &&component) = default; + /** Allow instances of this class to be moved */ + ClComponentDepthwiseConv2d &operator=(ClComponentDepthwiseConv2d &&component) = default; + /** Get template writer for the component */ + const IGpuTemplateComponentWriter *template_writer() const override; + /** Get component type */ + GpuComponentType type() const override + { + return GpuComponentType::Complex; + } + +private: + std::unique_ptr _component_writer; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDEPTHWISECONV2D */ diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp index e94cfd1581..dc05825500 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp @@ -35,26 +35,24 @@ namespace experimental { namespace dynamic_fusion { -using Settings = ClComponentDirectConv2dSettings; - -Settings &Settings::export_to_cl_image(bool cl_image) +ClComponentDirectConv2dSettings &ClComponentDirectConv2dSettings::export_to_cl_image(bool cl_image) { _export_to_cl_image = cl_image; return *this; } -bool Settings::export_to_cl_image() const +bool ClComponentDirectConv2dSettings::export_to_cl_image() const { return _export_to_cl_image; } -Settings &Settings::fast_relaxed_math(bool fast_relaxed_math) +ClComponentDirectConv2dSettings &ClComponentDirectConv2dSettings::fast_relaxed_math(bool fast_relaxed_math) { _fast_relaxed_math = fast_relaxed_math; return *this; } -bool Settings::fast_relaxed_math() const +bool ClComponentDirectConv2dSettings::fast_relaxed_math() const { return _fast_relaxed_math; } diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp new file mode 100644 index 0000000000..89f1e999b8 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp @@ -0,0 +1,346 @@ +/* + * Copyright (c) 2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.h" + +#include "arm_compute/core/utils/misc/ShapeCalculator.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/dynamic_fusion/sketch/ArgumentPack.h" +#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h" +#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +namespace +{ +bool export_weights_to_cl_image_heuristic(const ITensorInfo *weights, unsigned int depth_multiplier, GPUTarget gpu_target) +{ + if(!export_to_cl_image(weights)) + { + return false; + } + + const size_t idx_w = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH); + const size_t idx_h = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT); + const size_t kernel_w = weights->tensor_shape()[idx_w]; + const size_t kernel_h = weights->tensor_shape()[idx_h]; + + if(gpu_target == GPUTarget::G71 || get_arch_from_target(gpu_target) == GPUTarget::MIDGARD) + { + return false; + } + + if((kernel_w == 1) && (kernel_h == 1)) + { + return false; + } + + if(depth_multiplier > 1) + { + if((depth_multiplier % 4) != 0) + { + return false; + } + } + + return true; +} + +void initialize_dwc_native_compute_info(DWCComputeKernelInfo &dwc_compute_info, const ITensorInfo *input, const ITensorInfo *weights, + const DepthwiseConv2dAttributes &attributes, const GPUTarget gpu_target) +{ + const unsigned int depth_multiplier = attributes.depth_multiplier(); + + // Floating point path + // First check if we can export to cl_image. 
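+    // Note: export of the *input* tensor to cl_image is not plumbed through yet;
+    // ClComponentDepthwiseConv2d::validate() rejects export_input_to_cl_image == true,
+    // so only the weights are considered for cl_image export below.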
+ dwc_compute_info.export_input_to_cl_image = false; + dwc_compute_info.export_weights_to_cl_image = export_weights_to_cl_image_heuristic(weights, depth_multiplier, gpu_target); + + // Set n0 + if(depth_multiplier == 1) + { + if(dwc_compute_info.export_weights_to_cl_image == false && weights->data_type() == DataType::F16) + { + dwc_compute_info.n0 = 8; + } + else + { + dwc_compute_info.n0 = 4; + } + } + else + { + if((depth_multiplier % 4) == 0) + { + dwc_compute_info.n0 = 4; + } + else if((depth_multiplier % 2) == 0) + { + dwc_compute_info.n0 = 2; + } + else + { + dwc_compute_info.n0 = 1; + } + } + + dwc_compute_info.n0 = adjust_vec_size(dwc_compute_info.n0, weights->dimension(0)); + + // Set m0 only if stride_x == 1 and dilation_x == 1 + if(attributes.stride().x() == 1 && attributes.dilation().x() == 1) + { + const size_t idx_w = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH); + const size_t kernel_w = weights->tensor_shape()[idx_w]; + + if((kernel_w >= 9) || (kernel_w == 1)) + { + dwc_compute_info.m0 = 1; + } + else + { + if(weights->data_type() == DataType::F16) + { + if((input->dimension(1) % 5) == 0) + { + dwc_compute_info.m0 = 5; + } + else + { + dwc_compute_info.m0 = 4; + } + } + else + { + dwc_compute_info.m0 = 2; + } + } + } + else + { + dwc_compute_info.m0 = 1; + } + return; +} + +void calculate_and_init_dst_if_empty(ITensorInfo *dst, const ITensorInfo *src, const ITensorInfo *wei, const DepthwiseConv2dAttributes &attributes) +{ + if(dst->total_size() == 0U) + { + const PadStrideInfo pad_stride_info(attributes.stride().x(), + attributes.stride().y(), + attributes.pad().left, + attributes.pad().right, + attributes.pad().top, + attributes.pad().bottom, + attributes.dimension_rounding_type()); + + const ConvolutionInfo conv_info{ pad_stride_info, attributes.depth_multiplier(), ActivationLayerInfo(), attributes.dilation() }; + const TensorShape shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *wei, conv_info); + + auto_init_if_empty(*dst, src->clone()->set_tensor_shape(shape)); + } +} + +constexpr GpuOperatorType operator_type = GpuOperatorType::Complex; +} // namespace + +Status GpuDepthwiseConv2d::is_supported_op(const GpuWorkloadContext &context, + const ITensorInfo *src, + const ITensorInfo *wei, + const ITensorInfo *bia, + const ITensorInfo *dst, + const DepthwiseConv2dAttributes &attributes) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, wei, dst); + + // Auto initialize dst tensor info + TensorInfo dst_info_to_validate = *dst; + calculate_and_init_dst_if_empty(&dst_info_to_validate, src, wei, attributes); + + // Check support level + // Data type + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); + // Data layout + ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC); + + const GpuTarget gpu_target = context.gpu_target(); + + if(context.gpu_language() == GpuLanguage::OpenCL) + { + const CLCompileContext *cl_compile_ctx = context.cl_compile_context(); + ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr); + + // Validate Depthwise Conv2d Component + { + const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + auto settings = ClComponentDepthwiseConv2d::Settings(); + + DWCComputeKernelInfo dwc_info; + initialize_dwc_native_compute_info(dwc_info, src, wei, attributes, gpu_target); + + settings.fast_relaxed_math( + (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == 
GPUTarget::BIFROST)
+                                 && (dst_info_to_validate.data_type() == DataType::F32 || dst_info_to_validate.data_type() == DataType::F16));
+
+            settings.is_fma_available(get_arch_from_target(gpu_target) != GPUTarget::MIDGARD)
+            .m0(dwc_info.m0)
+            .n0(dwc_info.n0)
+            .export_input_to_cl_image(dwc_info.export_input_to_cl_image)
+            .export_weights_to_cl_image(dwc_info.export_weights_to_cl_image);
+
+            ArgumentPack<ITensorInfo> arguments;
+            arguments.add_const_tensor(ACL_SRC_0, src);
+            arguments.add_const_tensor(ACL_SRC_1, wei);
+            arguments.add_const_tensor(ACL_SRC_2, bia);
+            arguments.add_const_tensor(ACL_DST_0, &dst_info_to_validate);
+            ARM_COMPUTE_RETURN_ON_ERROR(ClComponentDepthwiseConv2d::validate(properties, arguments, attributes, settings));
+        }
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ERROR_MSG("Unimplemented Gpu language");
+    }
+
+    return Status{};
+}
+
+Status GpuDepthwiseConv2d::validate_op(const GpuWorkloadSketch         &sketch,
+                                       const ITensorInfo               *src,
+                                       const ITensorInfo               *wei,
+                                       const ITensorInfo               *bia,
+                                       const ITensorInfo               *dst,
+                                       const DepthwiseConv2dAttributes &attributes)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, wei, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id() || !wei->has_valid_id() || !dst->has_valid_id());
+
+    if(bia != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(!bia->has_valid_id());
+    }
+
+    // Auto initialize dst tensor info
+    TensorInfo dst_info_to_validate = *dst;
+    calculate_and_init_dst_if_empty(&dst_info_to_validate, src, wei, attributes);
+
+    // Perform fusion test
+    // Pack tensor infos
+    ArgumentPack<ITensorInfo> tensors;
+    tensors.add_const_tensor(ACL_SRC_0, src);
+    tensors.add_const_tensor(ACL_SRC_1, wei);
+    tensors.add_const_tensor(ACL_SRC_2, bia);
+    tensors.add_const_tensor(ACL_DST_0, &dst_info_to_validate);
+    const Operator op = sketch.implementation().operator_group().new_operator(operator_type, tensors);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!sketch.implementation().operator_group().try_add_operator(op),
+                                    "Operator fusion test failed.
This operator cannot be fused into the workload"); + + // Check if configuration is supported + return is_supported_op(*sketch.gpu_context(), src, wei, bia, &dst_info_to_validate, attributes); +} + +void GpuDepthwiseConv2d::create_op(GpuWorkloadSketch &sketch, + ITensorInfo *src, + ITensorInfo *wei, + ITensorInfo *bia, + ITensorInfo *dst, + const DepthwiseConv2dAttributes &attributes) +{ + // Assert validation + ARM_COMPUTE_ERROR_THROW_ON(GpuDepthwiseConv2d::validate_op(sketch, src, wei, bia, dst, attributes)); + ARM_COMPUTE_ERROR_ON_NULLPTR(src, wei, dst); + ARM_COMPUTE_LOG_PARAMS(src, wei, bia, dst, attributes); + + calculate_and_init_dst_if_empty(dst, src, wei, attributes); + + // Translate into components and add to component graph + GpuKernelComponentGraph &comp_graph = sketch.implementation().component_graph(); + const auto *sketch_ctx = sketch.implementation().context(); + const GpuTarget gpu_target = sketch_ctx->gpu_target(); + + if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL) + { + const auto cl_compile_ctx = sketch_ctx->cl_compile_context(); + ARM_COMPUTE_ERROR_ON(cl_compile_ctx == nullptr); + + // Add Depthwise Conv2d Component + { + const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); + auto settings = ClComponentDepthwiseConv2d::Settings(); + + DWCComputeKernelInfo dwc_info; + initialize_dwc_native_compute_info(dwc_info, src, wei, attributes, gpu_target); + + settings.is_fma_available(get_arch_from_target(gpu_target) != GPUTarget::MIDGARD) + .m0(dwc_info.m0) + .n0(dwc_info.n0) + .export_input_to_cl_image(dwc_info.export_input_to_cl_image) + .export_weights_to_cl_image(dwc_info.export_weights_to_cl_image); + + if(settings.export_input_to_cl_image()) + { + arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(src); + } + + if(settings.export_weights_to_cl_image()) + { + arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(wei); + } + + ArgumentPack arguments; + arguments.add_const_tensor(ACL_SRC_0, src); + arguments.add_const_tensor(ACL_SRC_1, wei); + arguments.add_const_tensor(ACL_SRC_2, bia); + arguments.add_const_tensor(ACL_DST_0, dst); + comp_graph.add_new_component(properties, arguments, attributes, settings); + } + } + else + { + ARM_COMPUTE_ERROR("Unimplemented Gpu language"); + } + + // Set up fusion test by adding to the Operator Group + // Note this has to be performed after all the components have been successfully added to the component graph + + // Pack tensor infos + ArgumentPack tensors; + tensors.add_const_tensor(ACL_SRC_0, src); + tensors.add_const_tensor(ACL_SRC_1, wei); + tensors.add_const_tensor(ACL_SRC_2, bia); + tensors.add_const_tensor(ACL_DST_0, dst); + + const Operator op = sketch.implementation().operator_group().new_operator(operator_type, tensors); + sketch.implementation().operator_group().add_operator(op); +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp new file mode 100644 index 0000000000..389bd5c65f --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp @@ -0,0 +1,378 @@ +/* + * Copyright (c) 2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "ClTemplateDepthwiseConv2d.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +ClTemplateDepthwiseConv2d::ClTemplateDepthwiseConv2d(ComponentId id, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings) + : IGpuTemplateComponentWriter{ id, tensors }, + _src{}, + _weight{}, + _bias{}, + _dst{}, + _attributes{ attributes }, + _settings{ settings } +{ + _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0); + _weight = this->tensors().get_const_tensor(TensorType::ACL_SRC_1); + if(this->tensors().get_const_tensor(TensorType::ACL_SRC_2)) + { + _bias = this->tensors().get_const_tensor(TensorType::ACL_SRC_2); + } + _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0); + ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _weight, _dst); +} + +std::string ClTemplateDepthwiseConv2d::get_name() const +{ + return "depthwise_conv2d"; +} + +std::string ClTemplateDepthwiseConv2d::get_component_code(const ComponentGroup &comp_group) const +{ + ARM_COMPUTE_UNUSED(comp_group); + + constexpr int height_idx = 2; // Data Layout is NHWC + + std::string code = R"_( +//------------------ START KERNEL {{meta_kernel_id}} --------------------- +// IN_0(src) {{src}} +// IN_1(wei) {{weight}} +)_"; + + if(_bias != nullptr && _bias->has_valid_id()) + { + code += R"_( +// IN_1(bia) {{bias}} +)_"; + } + + code += R"_( +// OUT(dst, accum) {{dst}} + +TILE({{ACC_DATA_TYPE}}, M0, N0, {{dst}}); +TILE(uint, M0, 1, g_dst_indirect_y); + +{ +#define _IWEI_WIDTH {{WEI_WIDTH}} +#define _IWEI_HEIGHT {{WEI_HEIGHT}} +#define _IDST_WIDTH {{arg_dst}}_w +#define _IDST_HEIGHT {{arg_dst}}_h +#define _IM0_A M0_A +#define _IN0_A N0_A +#define _IM0_B _IWEI_WIDTH +#define _IN0_B N0 +#define _IBOUNDARY_CHECK (!((_IWEI_WIDTH == 1 && _IWEI_HEIGHT == 1 && {{PAD_LEFT}} == 0 && {{PAD_TOP}} == 0 && M0 == 1))) +)_"; + + code += R"_( + const int yo = g_ind_2 % {{arg_dst}}_h; + const int bout = g_ind_2 / {{arg_dst}}_h; +)_"; + + code += R"_( + + int xi = g_ind_1 * {{STRIDE_X}}; + int yi = yo * {{STRIDE_Y}}; + xi -= {{PAD_LEFT}}; + yi -= {{PAD_TOP}}; + + LOOP_UNROLLING(int, i, 0, 1, M0, + { + {{dst}}[i].v = 0; + }) +)_"; + + if(_weight->dimension(height_idx) < 5) + { + code += R"_( + LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT, +)_"; + } + else + 
{ + code += R"_( + for(int yk = 0; yk < _IWEI_HEIGHT; ++yk) +)_"; + } + + code += R"_( + { + TILE({{SRC_DATA_TYPE}}, _IM0_A, _IN0_A, a); + + LOOP_UNROLLING(int, i, 0, 1, _IM0_A, + { + a[i].v = 0; + }) + + T_LOAD_NHWC_WITH_DILATION({{SRC_DATA_TYPE}}, 1, _IM0_A, _IN0_A, {{SRC_TENSOR_TYPE}}, {{src}}, bout, yi + yk * {{DILATION_Y}}, xi, (g_ind_0 / {{DEPTH_MULTIPLIER}}), {{src}}_w, {{src}}_h, {{DILATION_X}}, 1, _IBOUNDARY_CHECK, a); + + TILE({{WEI_DATA_TYPE}}, _IM0_B, _IN0_B, b); + + T_LOAD({{WEI_DATA_TYPE}}, _IM0_B, _IN0_B, {{WEI_TENSOR_TYPE}}, {{weight}}, g_ind_0, yk * _IM0_B, 1, {{weight}}_stride_y, b); + + LOOP_UNROLLING(int, m0, 0, 1, M0, + { + LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH, + { +)_"; + + if(!_settings.is_fma_available()) + { + code += R"_( + {{dst}}[m0].v += a[xk + m0].v * b[xk].v; +)_"; + } + else + { + code += R"_( + {{dst}}[m0].v = fma(a[xk + m0].v, b[xk].v, {{dst}}[m0].v); +)_"; + } + + code += R"_( + }) + }) + } +)_"; + + if(_weight->dimension(height_idx) < 5) + { + code += R"_( + ) +)_"; + } + + if(_bias && _bias->has_valid_id()) + { + code += R"_( + TILE({{BIA_DATA_TYPE}}, 1, N0, {{bias}}); + + T_LOAD({{BIA_DATA_TYPE}}, 1, N0, BUFFER, {{bias}}, g_ind_0, 0, 0, 0, {{bias}}); + + T_ELTWISE_BROADCAST_ADD_X({{ACC_DATA_TYPE}}, M0, N0, {{dst}}, {{bias}}, {{dst}}); +)_"; + } + + code += R"_( + LOOP_UNROLLING(int, i, 0, 1, M0, + { + g_dst_indirect_y[i].v = (uint)min((int)(g_ind_1 + i), (int)({{arg_dst}}_w) - 1); + g_dst_indirect_y[i].v += (int)(g_ind_2 % {{arg_dst}}_h) * (int)({{arg_dst}}_w); + g_dst_indirect_y[i].v += (int)(g_ind_2 / {{arg_dst}}_h) * (int)({{arg_dst}}_w * {{arg_dst}}_h); + }) +} +//------------------ END KERNEL {{meta_kernel_id}} --------------------- +)_"; + + return code; +} + +void ClTemplateDepthwiseConv2d::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const +{ + const GpuKernelArgumentInfo::Type input_type = _settings.export_input_to_cl_image() ? + GpuKernelArgumentInfo::Type::Tensor_4D_t_Image : + GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer; + + vtable.declare_variable( + _src, + GpuKernelArgumentInfo(input_type), + comp_group.is_intermediate_tensor(_src), + "src"); + + const GpuKernelArgumentInfo::Type weight_type = _settings.export_weights_to_cl_image() ? 
+ GpuKernelArgumentInfo::Type::Tensor_4D_t_Image : + GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer; + + vtable.declare_variable( + _weight, + GpuKernelArgumentInfo(weight_type), + comp_group.is_intermediate_tensor(_weight), + "weight"); + + if(_bias != nullptr && _bias->has_valid_id()) // optional bias + { + vtable.declare_variable( + _bias, + GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Vector), + comp_group.is_intermediate_tensor(_bias), + "bias"); + } + vtable.declare_variable( + _dst, + GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer), + comp_group.is_intermediate_tensor(_dst), + "dst"); +} + +TagLUT ClTemplateDepthwiseConv2d::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const +{ + TagLUT lut{}; + + // Arguments and global shared variables + lut["src"] = vtable.get_variable(_src); + lut["weight"] = vtable.get_variable(_weight); + + if(_bias != nullptr && _bias->has_valid_id()) // optional bias + { + lut["bias"] = vtable.get_variable(_bias); + lut["BIA_DATA_TYPE"] = get_cl_type_from_data_type(_bias->data_type()); + } + lut["dst"] = vtable.get_variable(_dst); + + const auto dst_argument = vtable.get_variable(comp_group.get_dst_tensors()[0]); + lut["arg_dst"] = dst_argument.uniq_name; + + // Local build options + lut["meta_kernel_id"] = id(); + lut["ACC_DATA_TYPE"] = _src->data_type(); + lut["SRC_DATA_TYPE"] = _src->data_type(); + lut["WEI_DATA_TYPE"] = _weight->data_type(); + + switch(vtable.get_variable(_src).kernel_argument_info.type) + { + case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D: + case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D: + case GpuKernelArgumentInfo::Type::Tensor_4D_t_Image: + lut["SRC_TENSOR_TYPE"] = "IMAGE"; + break; + default: + lut["SRC_TENSOR_TYPE"] = "BUFFER"; + break; + } + + switch(vtable.get_variable(_weight).kernel_argument_info.type) + { + case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D: + case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D: + case GpuKernelArgumentInfo::Type::Tensor_4D_t_Image: + lut["WEI_TENSOR_TYPE"] = "IMAGE"; + break; + default: + lut["WEI_TENSOR_TYPE"] = "BUFFER"; + break; + } + + // Data Layout is NHWC + constexpr int width_idx = 1; + constexpr int height_idx = 2; + + lut["WEI_WIDTH"] = _weight->dimension(width_idx); + lut["WEI_HEIGHT"] = _weight->dimension(height_idx); + + lut["STRIDE_X"] = _attributes.stride().x(); + lut["STRIDE_Y"] = _attributes.stride().y(); + + lut["PAD_LEFT"] = _attributes.pad().left; + lut["PAD_TOP"] = _attributes.pad().top; + + lut["DILATION_X"] = _attributes.dilation().x(); + lut["DILATION_Y"] = _attributes.dilation().y(); + + lut["DEPTH_MULTIPLIER"] = _attributes.depth_multiplier(); + + return lut; +} + +CLBuildOptions ClTemplateDepthwiseConv2d::get_build_options(const ComponentGroup &comp_group) const +{ + ARM_COMPUTE_UNUSED(comp_group); + + constexpr unsigned int width_idx = 1; // Data Layout is NHWC + + const unsigned int n0 = _settings.n0(); + const unsigned int m0 = _settings.m0(); + const unsigned int m0_a = _weight->dimension(width_idx) + m0 - 1; + const unsigned int n0_a = _attributes.depth_multiplier() > 1 ? 
1 : n0; + const unsigned int partial_store_n0 = _dst->dimension(0) % n0; + + CLBuildOptions build_opts{}; + + if(_settings.fast_relaxed_math()) + { + build_opts.add_option("-cl-fast-relaxed-math"); + } + else + { + // -cl-fast-relaxed-math also sets -cl-finite-math-only and -cl-unsafe-math-optimizations + // to disable -cl-finite-math-only, we only include -cl-unsafe-math-optimizations + build_opts.add_option("-cl-unsafe-math-optimizations"); + } + + build_opts.add_option("-DN0=" + support::cpp11::to_string(n0)); + build_opts.add_option("-DM0=" + support::cpp11::to_string(m0)); + build_opts.add_option("-DN0_A=" + support::cpp11::to_string(n0_a)); + build_opts.add_option("-DM0_A=" + support::cpp11::to_string(m0_a)); + build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0)); + + return build_opts; +} + +std::string ClTemplateDepthwiseConv2d::get_config_id() const +{ + std::string config_id{}; + + config_id += support::cpp11::to_string(_src->dimension(0)); + config_id += "_"; + config_id += support::cpp11::to_string(_src->dimension(1)); + config_id += "_"; + config_id += support::cpp11::to_string(_src->dimension(2)); + config_id += "_"; + config_id += support::cpp11::to_string(_dst->dimension(0)); + config_id += "_"; + config_id += support::cpp11::to_string(_dst->dimension(1)); + config_id += "_"; + config_id += support::cpp11::to_string(_dst->dimension(2)); + config_id += "_"; + config_id += string_from_data_type(_src->data_type()); + + return config_id; +} + +std::set ClTemplateDepthwiseConv2d::get_headers_list() const +{ + return std::set{ "helpers.h", "tile_helpers.h" }; +} + +Window ClTemplateDepthwiseConv2d::get_window() const +{ + ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized"); + + Window win = calculate_max_window(*_dst, Steps(_settings.n0(), _settings.m0())); + return win.collapse(win, Window::DimZ); +} + +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.h new file mode 100644 index 0000000000..84b689ef64 --- /dev/null +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.h @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEDEPTHWISECONV2D +#define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEDEPTHWISECONV2D + +#include "arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h" +#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h" +#include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace dynamic_fusion +{ +class ClTemplateDepthwiseConv2d final : public IGpuTemplateComponentWriter +{ +public: + using Attributes = ClComponentDepthwiseConv2d::Attributes; + using Settings = ClComponentDepthwiseConv2d::Settings; + /** Constructor + * + * Similar to @ref ClComponentDepthwiseConv2d::validate() + * + * @param[in] id Component id + * @param[in] tensors Tensor arguments to the components + * @param[in] attributes Component attributes + * @param[in] settings Component settings + */ + ClTemplateDepthwiseConv2d(ComponentId id, + const ArgumentPack &tensors, + const Attributes &attributes, + const Settings &settings); + /** Prevent instances of this class from being copy constructed */ + ClTemplateDepthwiseConv2d(const ClTemplateDepthwiseConv2d &depthwise_conv2d) = delete; + /** Prevent instances of this class from being copied */ + ClTemplateDepthwiseConv2d &operator=(const ClTemplateDepthwiseConv2d &depthwise_conv2d) = delete; + /** Allow instances of this class to be move constructed */ + ClTemplateDepthwiseConv2d(ClTemplateDepthwiseConv2d &&depthwise_conv2d) = default; + /** Allow instances of this class to be moved */ + ClTemplateDepthwiseConv2d &operator=(ClTemplateDepthwiseConv2d &&depthwise_conv2d) = default; + /** Generate kernel component name */ + std::string get_name() const override; + /** Generate kernel component code template + * + * @param[in] comp_group Component group of which the component is a part of + * + * @return std::string Component code + */ + std::string get_component_code(const ComponentGroup &comp_group) const override; + /** Declare all variables used by the component in the @p vtable + * + * @param[out] vtable Variable table + * @param[in] comp_group Component group of which the component is a part of + */ + void declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override; + /** Generate the tag look-up table used to instantiate the component code. 
+ * + * @param[in] vtable Variable table + * @param[in] comp_group Component group of which the component is a part of + * + * @return TagLUT Tag lookup table + */ + TagLUT get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const override; + /** Generate the build options used in the component + * + * @param[in] comp_group Component group of which the component is a part of + * + * @return CLBuildOptions Build options + */ + CLBuildOptions get_build_options(const ComponentGroup &comp_group) const override; + /** Generate the component config id string used for tuning */ + std::string get_config_id() const override; + /** Generate the header list used in the component */ + std::set get_headers_list() const override; + /** Generate the execution window for the component */ + Window get_window() const override; + +private: + const ITensorInfo *_src; + const ITensorInfo *_weight; + const ITensorInfo *_bias; + const ITensorInfo *_dst; + Attributes _attributes; + Settings _settings; +}; +} // namespace dynamic_fusion +} // namespace experimental +} // namespace arm_compute +#endif /* SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEDEPTHWISECONV2D */ diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp index 870de64eb8..7ad7dd69f0 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp @@ -86,11 +86,10 @@ std::string ClTemplateDirectConv2d::get_component_code(const ComponentGroup &com code += R"_( // OUT(dst, accum) {{dst}} -// Initialize the accumulators TILE({{ACC_DATA_TYPE}}, M0, N0, {{dst}}); +TILE(uint, M0, 1, g_dst_indirect_y); + { - // All the tensor dimensions are passed at compile time. - // In case of dynamic tensor support, the following dimensions should be passed as function argument. 
#define _IWEI_WIDTH {{WEI_WIDTH}} #define _IWEI_HEIGHT {{WEI_HEIGHT}} #define _ISRC_WIDTH {{src}}_w @@ -101,8 +100,6 @@ TILE({{ACC_DATA_TYPE}}, M0, N0, {{dst}}); #define _IDST_CHANNELS {{arg_dst}}_c #define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT) - // .v = access the whole vector (OpenCL vector) - // .s[x] = access the vector element at position x (scalar access) TILE(int, M0, 1, xi); TILE(int, M0, 1, yi); @@ -132,7 +129,6 @@ TILE({{ACC_DATA_TYPE}}, M0, N0, {{dst}}); TILE({{SRC_DATA_TYPE}}, M0, K0, a); TILE({{WEI_DATA_TYPE}}, N0, K0, b); - // Initialize tiles LOOP_UNROLLING(int, i, 0, 1, M0, { a[i].v = {{ZERO_VALUE}}; @@ -143,32 +139,24 @@ TILE({{ACC_DATA_TYPE}}, M0, N0, {{dst}}); b[i].v = {{ZERO_VALUE}}; }) - // Load tile from the src tensor T_LOAD_NHWC_INDIRECT({{SRC_DATA_TYPE}}, M0, K0, {{SRC_TENSOR_TYPE}}, {{src}}, g_ind_2, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, {{src}}_stride_y, xi, yi, a); - // Load tile from the weights tensor T_LOAD({{WEI_DATA_TYPE}}, N0, K0, {{WEI_TENSOR_TYPE}}, {{weight}}, ck, g_ind_0 * _IY_MULTIPLIER + i, _IY_MULTIPLIER, {{weight}}_stride_y, b); - // Compute the matrix multiplication between two tiles T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, K0, NT, T, a, b, {{dst}}); ck += K0; } - - // We voluntarily use SRC_CHANNELS rather than _DSRC_CHANNELS - // This #if directive should be removed in case of dynamic tensor support )_"; if(leftover_loop) { code += R"_( - // Left-over accumulations for(; k < _ISRC_CHANNELS; ++k) { TILE({{SRC_DATA_TYPE}}, M0, 1, a); TILE({{WEI_DATA_TYPE}}, N0, 1, b); - // Initialize tiles LOOP_UNROLLING(int, i, 0, 1, M0, { a[i].v = {{ZERO_VALUE}}; @@ -179,14 +167,10 @@ TILE({{ACC_DATA_TYPE}}, M0, N0, {{dst}}); b[i].v = {{ZERO_VALUE}}; }) - // Load tile from the src tensor T_LOAD_NHWC_INDIRECT({{SRC_DATA_TYPE}}, M0, 1, {{SRC_TENSOR_TYPE}}, {{src}}, g_ind_2, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, {{src}}_stride_y, xi, yi, a); - // Load tile from the weights tensor - // The T_LOAD for the left-over elements can only use BUFFER because we load one element per iteration T_LOAD({{WEI_DATA_TYPE}}, N0, 1, BUFFER, {{weight}}, ck, g_ind_0 * _IY_MULTIPLIER + i, _IY_MULTIPLIER, {{weight}}_stride_y, b); - // Compute the matrix multiplication between two tiles T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, 1, NT, T, a, b, {{dst}}); ++ck; @@ -215,12 +199,16 @@ code += R"_( T_LOAD({{BIA_DATA_TYPE}}, 1, N0, BUFFER, {{bias}}, g_ind_0, 0, 1, 0, bias0); - // c = c + bias[broadcasted] T_ELTWISE_BROADCAST_ADD_X({{ACC_DATA_TYPE}}, M0, N0, {{dst}}, bias0, {{dst}}); )_"; } code += R"_( + LOOP_UNROLLING(int, i, 0, 1, M0, + { + g_dst_indirect_y[i].v = (uint)min(g_ind_1 + i, (int)({{arg_dst}}_w * {{arg_dst}}_h) - 1); + g_dst_indirect_y[i].v += g_ind_2 * (int)({{arg_dst}}_w * {{arg_dst}}_h); + }) } //------------------ END KERNEL {{meta_kernel_id}} --------------------- )_"; diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp index 6c4b8f52f2..bffb467ebb 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp @@ -46,31 +46,14 @@ std::string ClTemplateStore::get_name() const std::string ClTemplateStore::get_component_code(const ComponentGroup &comp_group) const { ARM_COMPUTE_UNUSED(comp_group); + return R"_( //------------------ START KERNEL {{meta_kernel_id}} STORE --------------------- { -// This also follows NHWC 
layout -// g_ind_0 maps to global_id(0) maps to Channel -// g_ind_1 maps to global_id(1) maps to Height and Weight (Collapsed Window) -// g_ind_2 maps to global_id(2) maps to N / Batch -#define _IDST_WIDTH {{dst}}_w -#define _IDST_HEIGHT {{dst}}_h - TILE(uint, M0, 1, dst_indirect_y); - - // Calculate the destination indirect Y - LOOP_UNROLLING(int, i, 0, 1, M0, - { - dst_indirect_y[i].v = (uint)min(g_ind_1 + i, (int)(_IDST_WIDTH * _IDST_HEIGHT) - 1); - dst_indirect_y[i].v += g_ind_2 * (int)(_IDST_WIDTH * _IDST_HEIGHT); - }) - bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0; - T_STORE_INDIRECT_WIDTH_SELECT({{DST_DATA_TYPE}}, M0, N0, PARTIAL_N0, {{DST_TENSOR_TYPE}}, {{dst}}, g_ind_0, {{dst}}_stride_y, x_cond, {{src}}, dst_indirect_y); - -#undef _IDST_WIDTH -#undef _IDST_HEIGHT - //------------------ END KERNEL {{meta_kernel_id}} STORE --------------------- + T_STORE_INDIRECT_WIDTH_SELECT({{DST_DATA_TYPE}}, M0, N0, PARTIAL_N0, {{DST_TENSOR_TYPE}}, {{dst}}, g_ind_0, {{dst}}_stride_y, x_cond, {{src}}, g_dst_indirect_y); +//------------------ END KERNEL {{meta_kernel_id}} STORE --------------------- } )_"; -- cgit v1.2.1
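
For readers unfamiliar with the dynamic fusion API, the fragment below sketches how the operator added by this patch is driven from a workload sketch. It is a minimal illustration, not part of the patch: the context/sketch/tensor-info setup mirrors the pattern used by the dynamic fusion tests (the create_tensor_info helper and runtime plumbing are assumed from that pattern), and all shapes, attribute values, and variable names are arbitrary.

    // Illustrative usage of GpuDepthwiseConv2d. Assumed headers:
    //   arm_compute/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.h
    //   arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h
    using namespace arm_compute;
    using namespace arm_compute::experimental::dynamic_fusion;

    // Assumed setup: wrap the OpenCL compile context in a workload context and sketch
    CLCompileContext   cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
    GpuWorkloadContext gpu_ctx{ &cl_compile_ctx };
    GpuWorkloadSketch  sketch{ &gpu_ctx };

    // NHWC tensor infos (TensorShape order: C, W, H, N); create_tensor_info()
    // assigns the valid tensor ids that GpuDepthwiseConv2d::validate_op() checks for
    TensorInfo src_info = sketch.create_tensor_info(TensorInfo(TensorShape(32U, 25U, 25U, 1U), 1, DataType::F32, DataLayout::NHWC));
    TensorInfo wei_info = sketch.create_tensor_info(TensorInfo(TensorShape(32U, 3U, 3U), 1, DataType::F32, DataLayout::NHWC));
    TensorInfo bia_info = sketch.create_tensor_info(TensorInfo(TensorShape(32U), 1, DataType::F32, DataLayout::NHWC));
    TensorInfo dst_info = sketch.create_tensor_info(); // left empty: auto-initialized by create_op()

    // 3x3 depthwise convolution: SAME padding, unit stride/dilation, depth multiplier 1
    DepthwiseConv2dAttributes dwc_attr{};
    dwc_attr.pad(Padding2D(1, 1, 1, 1))
        .stride(Size2D(1, 1))
        .dilation(Size2D(1, 1))
        .depth_multiplier(1);

    // Validate, then record the operator into the sketch; compiling and executing
    // the sketch goes through the dynamic fusion runtime (not shown here)
    ARM_COMPUTE_ERROR_THROW_ON(GpuDepthwiseConv2d::validate_op(sketch, &src_info, &wei_info, &bia_info, &dst_info, dwc_attr));
    GpuDepthwiseConv2d::create_op(sketch, &src_info, &wei_info, &bia_info, &dst_info, dwc_attr);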