diff options
author | Ramy Elgammal <ramy.elgammal@arm.com> | 2022-11-30 16:23:10 +0000 |
---|---|---|
committer | Ramy Elgammal <ramy.elgammal@arm.com> | 2022-12-09 13:57:49 +0000 |
commit | df6a3b05842a98702437347ca269138ccd55f852 (patch) | |
tree | d38b3cc83acfa0aa492b953b6a3c06104e0d76fc /src/dynamic_fusion/sketch/gpu | |
parent | 86689cdd95f634fb374f3875f62a4cb3408e1699 (diff) | |
download | ComputeLibrary-df6a3b05842a98702437347ca269138ccd55f852.tar.gz |
Use heuristics for setting dynamic fusion direct conv2d tile sizes
Resolves: COMPMID-5735
Change-Id: I9958413b69c5052cfa205dd0e9457cc4953aaf35
Signed-off-by: Ramy Elgammal <ramy.elgammal@arm.com>
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/c/VisualCompute/ComputeLibrary/+/474818
Tested-by: bsgcomp <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Comments-Addressed: bsgcomp <bsgcomp@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8724
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/dynamic_fusion/sketch/gpu')
4 files changed, 63 insertions, 18 deletions
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp index dc05825500..1fbcb41028 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp @@ -57,13 +57,24 @@ bool ClComponentDirectConv2dSettings::fast_relaxed_math() const return _fast_relaxed_math; } +ClComponentDirectConv2dSettings &ClComponentDirectConv2dSettings::direct_conv_descriptor(const DirectConvComputeKernelInfo &desc) +{ + _desc = desc; + return *this; +} + +DirectConvComputeKernelInfo ClComponentDirectConv2dSettings::direct_conv_descriptor() const +{ + return _desc; +} + Status ClComponentDirectConv2d::validate( const Properties &properties, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes, const Settings &settings) { - ARM_COMPUTE_UNUSED(properties, settings); + ARM_COMPUTE_UNUSED(properties); const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0); const auto wei = tensors.get_const_tensor(TensorType::ACL_SRC_1); const auto bia = tensors.get_const_tensor(TensorType::ACL_SRC_2); @@ -125,6 +136,11 @@ Status ClComponentDirectConv2d::validate( // Data layout ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC); + const auto desc = settings.direct_conv_descriptor(); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.n0 != 1 && desc.n0 != 2 && desc.n0 != 3 && desc.n0 != 4 && desc.n0 != 8 && desc.n0 != 16, + "N0 can only be: 1, 2, 3, 4, 8, and 16"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 1 && desc.k0 != 2 && desc.k0 != 3 && desc.k0 != 4 && desc.k0 != 8 && desc.k0 != 16, + "K0 can only be: 1, 2, 3, 4, 8, and 16"); return Status{}; } diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h index fec22b84a5..c3a70ef3ae 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h @@ -25,6 +25,7 @@ #define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDIRECTCONV2D #include "arm_compute/core/Error.h" +#include "arm_compute/core/KernelDescriptors.h" #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h" #include <memory> @@ -56,9 +57,15 @@ public: /** Get fast_relaxed_math flag */ bool fast_relaxed_math() const; + /** Set direct convolution descriptor */ + ClComponentDirectConv2dSettings &direct_conv_descriptor(const DirectConvComputeKernelInfo &desc); + /** Get direct convolution descriptor */ + DirectConvComputeKernelInfo direct_conv_descriptor() const; + private: - bool _export_to_cl_image{ false }; - bool _fast_relaxed_math{ true }; + bool _export_to_cl_image{ false }; + bool _fast_relaxed_math{ true }; + DirectConvComputeKernelInfo _desc{}; // Direct convolution descriptor }; /** Forward declaration */ diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp index 9cb4ee7815..048ee01f35 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp @@ -23,16 +23,19 @@ */ #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h" +#include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/dynamic_fusion/sketch/ArgumentPack.h" #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h" #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h" #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" - -#include "src/common/utils/Log.h" +#include "src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h" +#include "src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h" namespace arm_compute { @@ -85,6 +88,16 @@ bool export_to_cl_image_support(const ITensorInfo *tensor, GPUTarget gpu_target, return true; } +DirectConvComputeKernelInfo config_direct_convolution_nhwc(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info) +{ + // Get GPU target + GPUTarget gpu_target = CLScheduler::get().target(); + + std::unique_ptr<arm_compute::cl_direct_conv::IClDirectConvKernelConfig> t = arm_compute::cl_direct_conv::ClDirectConvKernelConfigurationFactory::create(gpu_target); + + return t->configure(src, weights, conv_info); +} + constexpr GpuOperatorType operator_type = GpuOperatorType::Complex; } // namespace @@ -112,6 +125,11 @@ Status GpuConv2d::validate_op(const GpuWorkloadSketch &sketch, attributes.pad().right, attributes.pad().top, attributes.pad().bottom, DimensionRoundingType::FLOOR)); // use the default DimensionRoundingType + // Checks performed when dst is configured + if(dst->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), shape); + } auto_init_if_empty(dst_info_to_validate, src->clone()->set_tensor_shape(shape)); } @@ -175,6 +193,12 @@ void GpuConv2d::create_op(GpuWorkloadSketch &sketch, const Conv2dAttributes &attributes) { ARM_COMPUTE_LOG_PARAMS(src, wei, bia, dst, attributes); + PadStrideInfo conv_info(attributes.stride().x(), attributes.stride().y(), attributes.pad().left, + attributes.pad().right, + attributes.pad().top, attributes.pad().bottom, DimensionRoundingType::FLOOR); + // Initialize the direct convolution descriptor + const DirectConvComputeKernelInfo desc = config_direct_convolution_nhwc(src, wei, conv_info); + // Assert validation ARM_COMPUTE_ERROR_THROW_ON(GpuConv2d::validate_op(sketch, src, wei, bia, dst, attributes)); ARM_COMPUTE_ERROR_ON_NULLPTR(src, wei, dst); @@ -182,10 +206,7 @@ void GpuConv2d::create_op(GpuWorkloadSketch &sketch, // Auto initialize dst tensor { - auto shape = misc::shape_calculator::compute_deep_convolution_shape(src->tensor_shape(), data_layout, wei->tensor_shape(), - PadStrideInfo(attributes.stride().x(), attributes.stride().y(), attributes.pad().left, - attributes.pad().right, - attributes.pad().top, attributes.pad().bottom, DimensionRoundingType::FLOOR)); // use the default DimensionRoundingType + auto shape = misc::shape_calculator::compute_deep_convolution_shape(src->tensor_shape(), data_layout, wei->tensor_shape(), conv_info); // use the default DimensionRoundingType auto_init_if_empty(*dst, src->clone()->set_tensor_shape(shape)); } @@ -221,6 +242,8 @@ void GpuConv2d::create_op(GpuWorkloadSketch &sketch, arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(wei); } + settings.direct_conv_descriptor(desc); + ArgumentPack<ITensorInfo> arguments; arguments.add_const_tensor(ACL_SRC_0, src); arguments.add_const_tensor(ACL_SRC_1, wei); diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp index 75e812af9f..6f7bf72df8 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp @@ -69,7 +69,7 @@ std::string ClTemplateDirectConv2d::get_component_code(const ComponentGroup &com ARM_COMPUTE_UNUSED(comp_group); const auto channel_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::CHANNEL); - const auto k0 = adjust_vec_size(is_data_type_quantized(_src->data_type()) ? 16u : 8u, _src->dimension(channel_idx)); + const auto k0 = adjust_vec_size(_settings.direct_conv_descriptor().k0, _src->dimension(channel_idx)); const bool leftover_loop = (_src->dimension(channel_idx) % k0) != 0; std::string code = R"_( @@ -303,13 +303,11 @@ TagLUT ClTemplateDirectConv2d::get_tag_lut(const GpuKernelVariableTable &vtable, CLBuildOptions ClTemplateDirectConv2d::get_build_options(const ComponentGroup &comp_group) const { const unsigned int channel_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::CHANNEL); - const DataType data_type = _src->data_type(); - /// NOTE: For now tile sizes (n0, m0, k0) are set by the execution window. This may change in the future const auto root_window = comp_group.get_root_component()->template_writer()->get_window(); const unsigned int n0 = root_window.x().step(); const unsigned int m0 = root_window.y().step(); - const unsigned int k0 = adjust_vec_size(is_data_type_quantized(data_type) ? 16u : 8u, _src->dimension(channel_idx)); + const unsigned int k0 = adjust_vec_size(_settings.direct_conv_descriptor().k0, _src->dimension(channel_idx)); const unsigned int partial_store_n0 = _dst->dimension(0) % n0; CLBuildOptions build_opts{}; @@ -369,15 +367,16 @@ Window ClTemplateDirectConv2d::get_window() const ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized"); const auto output_shape = _dst->tensor_shape(); + const auto desc = _settings.direct_conv_descriptor(); - const unsigned int vec_size = std::min(static_cast<unsigned int>(output_shape[0]), 4u); - const unsigned int num_rows = (_dst->tensor_shape()[0] > 16) ? ((_src->data_type() == DataType::F32) ? 2U : 4U) : 1U; + const unsigned int n0 = adjust_vec_size(desc.n0, output_shape[0]); + const unsigned int m0 = adjust_vec_size(desc.m0, output_shape[1] * output_shape[2]); // Create and configure kernel window - Window win = calculate_max_window(output_shape, Steps(vec_size, num_rows)); + Window win = calculate_max_window(output_shape, Steps(n0, m0)); - const size_t dim_y_collapsed = ceil_to_multiple(output_shape[1] * output_shape[2], num_rows); - win.set(Window::DimY, Window::Dimension(0, dim_y_collapsed, num_rows)); + const size_t dim_y_collapsed = ceil_to_multiple(output_shape[1] * output_shape[2], m0); + win.set(Window::DimY, Window::Dimension(0, dim_y_collapsed, m0)); win.set(Window::DimZ, Window::Dimension(0, output_shape.total_size_upper(3), 1)); return win; |