From e2e6d745c940cdfd8c3340fd1227dbef1badfb3c Mon Sep 17 00:00:00 2001 From: Viet-Hoa Do Date: Wed, 1 Mar 2023 15:46:10 +0000 Subject: Fix direct conv2d in dynamic fusion * Put input and output tensor shape value directly to the CL code. * Use texture for weights when it is possible. Resolves: COMPMID-5938 Signed-off-by: Viet-Hoa Do Change-Id: Ib53b310a80ce857eac36564b352136fdde55b131 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9249 Reviewed-by: SiCong Li Reviewed-by: Jakub Sujak Benchmark: Arm Jenkins Tested-by: Arm Jenkins Comments-Addressed: Arm Jenkins --- .../gpu/components/cl/ClComponentDirectConv2d.cpp | 8 +--- .../gpu/components/cl/ClComponentDirectConv2d.h | 5 +- .../sketch/gpu/operators/GpuConv2d.cpp | 55 +--------------------- .../template_writer/cl/ClTemplateDirectConv2d.cpp | 26 ++++++---- 4 files changed, 22 insertions(+), 72 deletions(-) diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp index c8e682f34a..3965deced1 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp @@ -35,15 +35,9 @@ namespace experimental { namespace dynamic_fusion { -ClComponentDirectConv2dSettings &ClComponentDirectConv2dSettings::export_to_cl_image(bool cl_image) -{ - _export_to_cl_image = cl_image; - return *this; -} - bool ClComponentDirectConv2dSettings::export_to_cl_image() const { - return _export_to_cl_image; + return _desc.export_weights_to_cl_image; } ClComponentDirectConv2dSettings &ClComponentDirectConv2dSettings::fast_relaxed_math(bool fast_relaxed_math) diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h index c3a70ef3ae..8e555dce57 100644 --- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h +++ 
b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -47,8 +47,6 @@ class Conv2dAttributes; class ClComponentDirectConv2dSettings { public: - /** Set export_to_cl_image flag */ - ClComponentDirectConv2dSettings &export_to_cl_image(bool cl_image); /** Get export_to_cl_image flag */ bool export_to_cl_image() const; @@ -63,7 +61,6 @@ public: DirectConvComputeKernelInfo direct_conv_descriptor() const; private: - bool _export_to_cl_image{ false }; bool _fast_relaxed_math{ true }; DirectConvComputeKernelInfo _desc{}; // Direct convolution descriptor }; diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp index 690371f910..e00f09563f 100644 --- a/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp @@ -45,49 +45,6 @@ namespace dynamic_fusion { namespace { -bool export_to_cl_image_support(const ITensorInfo *tensor, GPUTarget gpu_target, const cl::Device &device, DataLayout data_layout) -{ - if(tensor->tensor_shape()[0] % 4 || (data_layout != DataLayout::NHWC)) - { - return false; - } - - // If not floating point - if(!is_data_type_float(tensor->data_type())) - { - return false; - } - - if(gpu_target == GPUTarget::G71 || get_arch_from_target(gpu_target) == GPUTarget::MIDGARD) - { - return false; - } - - // Check if the cl_khr_image2d_from_buffer extension is supported on the target platform - if(!image2d_from_buffer_supported(device)) - { - return false; - } - - // Check cl image pitch alignment - if(get_cl_image_pitch_alignment(device) == 0) - { - return false; - } - - const size_t image_w = tensor->tensor_shape()[0] / 4; - const size_t image_h = tensor->tensor_shape()[1] * tensor->tensor_shape()[2] * tensor->tensor_shape()[3]; - const size_t max_image_w = device.getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>(); - const size_t 
max_image_h = device.getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>(); - - if(image_w > max_image_w || image_h > max_image_h) - { - return false; - } - - return true; -} - DirectConvComputeKernelInfo config_direct_convolution_nhwc(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info) { // Get GPU target @@ -126,7 +83,6 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, TensorInfo dst_info_to_validate; const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate; - const DataLayout data_layout = src->data_layout(); if(dst != nullptr) { dst_info_to_validate_ptr = dst; @@ -151,9 +107,6 @@ Status is_supported_op_helper(const GpuWorkloadContext &context, const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run }); auto settings = ClComponentDirectConv2d::Settings(); - settings.export_to_cl_image( - export_to_cl_image_support(src, gpu_target, cl_compile_ctx->get_device(), data_layout)); - settings.fast_relaxed_math( (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) && (dst_info_to_validate_ptr->data_type() == DataType::F32 || dst_info_to_validate_ptr->data_type() == DataType::F16)); @@ -251,7 +204,6 @@ ITensorInfo *GpuConv2d::create_op(GpuWorkloadSketch &sketch, const auto sketch_ctx = sketch.implementation().context(); - const auto data_layout = src->data_layout(); const auto gpu_target = sketch_ctx->gpu_target(); if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL) @@ -266,20 +218,17 @@ ITensorInfo *GpuConv2d::create_op(GpuWorkloadSketch &sketch, auto settings = ClComponentDirectConv2d::Settings(); - settings.export_to_cl_image( - export_to_cl_image_support(src, gpu_target, cl_compile_ctx->get_device(), data_layout)); - settings.fast_relaxed_math( (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) && (dst->data_type() == DataType::F32 || dst->data_type() == DataType::F16)); + 
settings.direct_conv_descriptor(desc); + if(settings.export_to_cl_image()) { arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(wei); } - settings.direct_conv_descriptor(desc); - ArgumentPack arguments; arguments.add_const_tensor(ACL_SRC_0, src); arguments.add_const_tensor(ACL_SRC_1, wei); diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp index e69103e263..ca531fe28e 100644 --- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp +++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp @@ -91,12 +91,12 @@ TILE(uint, M0, 1, g_dst_indirect_y); { #define _IWEI_WIDTH {{WEI_WIDTH}} #define _IWEI_HEIGHT {{WEI_HEIGHT}} -#define _ISRC_WIDTH {{src}}_w -#define _ISRC_HEIGHT {{src}}_h -#define _ISRC_CHANNELS {{src}}_c -#define _IDST_WIDTH {{arg_dst}}_w -#define _IDST_HEIGHT {{arg_dst}}_h -#define _IDST_CHANNELS {{arg_dst}}_c +#define _ISRC_WIDTH {{SRC_WIDTH}} +#define _ISRC_HEIGHT {{SRC_HEIGHT}} +#define _ISRC_CHANNELS {{SRC_CHANNELS}} +#define _IDST_WIDTH {{DST_WIDTH}} +#define _IDST_HEIGHT {{DST_HEIGHT}} +#define _IDST_CHANNELS {{DST_CHANNELS}} #define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT) TILE(int, M0, 1, xi); @@ -214,8 +214,8 @@ code += R"_( code += R"_( LOOP_UNROLLING(int, i, 0, 1, M0, { - g_dst_indirect_y[i].v = (uint)min(g_ind_1 + i, (int)({{arg_dst}}_w * {{arg_dst}}_h) - 1); - g_dst_indirect_y[i].v += g_ind_2 * (int)({{arg_dst}}_w * {{arg_dst}}_h); + g_dst_indirect_y[i].v = (uint)min(g_ind_1 + i, (int)({{DST_WIDTH}} * {{DST_HEIGHT}}) - 1); + g_dst_indirect_y[i].v += g_ind_2 * (int)({{DST_WIDTH}} * {{DST_HEIGHT}}); }) } //------------------ END KERNEL {{meta_kernel_id}} --------------------- @@ -294,9 +294,19 @@ TagLUT ClTemplateDirectConv2d::get_tag_lut(const GpuKernelVariableTable &vtable, } const auto width_idx = 1; const auto height_idx = 2; + const auto channel_idx = 0; + + 
lut["SRC_WIDTH"] = _src->dimension(width_idx); + lut["SRC_HEIGHT"] = _src->dimension(height_idx); + lut["SRC_CHANNELS"] = _src->dimension(channel_idx); + lut["WEI_WIDTH"] = _weight->dimension(width_idx); lut["WEI_HEIGHT"] = _weight->dimension(height_idx); + lut["DST_WIDTH"] = _dst->dimension(width_idx); + lut["DST_HEIGHT"] = _dst->dimension(height_idx); + lut["DST_CHANNELS"] = _dst->dimension(channel_idx); + lut["STRIDE_X"] = _attributes.stride().x(); lut["STRIDE_Y"] = _attributes.stride().y(); -- cgit v1.2.1