Fix direct conv2d in dynamic fusion

* Put the input and output tensor shape values directly into the CL code.
* Use a texture for the weights whenever possible.

Resolves: COMPMID-5938
Signed-off-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Change-Id: Ib53b310a80ce857eac36564b352136fdde55b131
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9249
Reviewed-by: SiCong Li <sicong.li@arm.com>
Reviewed-by: Jakub Sujak <jakub.sujak@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
author: Viet-Hoa Do <viet-hoa.do@arm.com> 2023-03-01 15:46:10 +0000
committer: Viet-Hoa Do <viet-hoa.do@arm.com> 2023-03-02 09:46:28 +0000
commit: e2e6d745c940cdfd8c3340fd1227dbef1badfb3c (patch)
tree: 9af6bc878aa397025524d2eca303e0d4a5a1693a
parent: bbf2e7477be984702e1a51f2a23910ee8349b867 (diff)
download: ComputeLibrary-e2e6d745c940cdfd8c3340fd1227dbef1badfb3c.tar.gz
4 files changed, 22 insertions, 72 deletions
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp
index c8e682f34a..3965deced1 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp
@@ -35,15 +35,9 @@ namespace experimental
 {
 namespace dynamic_fusion
 {
-ClComponentDirectConv2dSettings &ClComponentDirectConv2dSettings::export_to_cl_image(bool cl_image)
-{
-    _export_to_cl_image = cl_image;
-    return *this;
-}
-
 bool ClComponentDirectConv2dSettings::export_to_cl_image() const
 {
-    return _export_to_cl_image;
+    return _desc.export_weights_to_cl_image;
 }
 
 ClComponentDirectConv2dSettings &ClComponentDirectConv2dSettings::fast_relaxed_math(bool fast_relaxed_math)
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h
index c3a70ef3ae..8e555dce57 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -47,8 +47,6 @@ class Conv2dAttributes;
 class ClComponentDirectConv2dSettings
 {
 public:
-    /** Set export_to_cl_image flag */
-    ClComponentDirectConv2dSettings &export_to_cl_image(bool cl_image);
     /** Get export_to_cl_image flag */
     bool export_to_cl_image() const;
 
@@ -63,7 +61,6 @@ public:
     DirectConvComputeKernelInfo direct_conv_descriptor() const;
 
 private:
-    bool                        _export_to_cl_image{ false };
     bool                        _fast_relaxed_math{ true };
     DirectConvComputeKernelInfo _desc{}; // Direct convolution descriptor
 };
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp
index 690371f910..e00f09563f 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp
@@ -45,49 +45,6 @@ namespace dynamic_fusion
 {
 namespace
 {
-bool export_to_cl_image_support(const ITensorInfo *tensor, GPUTarget gpu_target, const cl::Device &device, DataLayout data_layout)
-{
-    if(tensor->tensor_shape()[0] % 4 || (data_layout != DataLayout::NHWC))
-    {
-        return false;
-    }
-
-    // If not floating point
-    if(!is_data_type_float(tensor->data_type()))
-    {
-        return false;
-    }
-
-    if(gpu_target == GPUTarget::G71 || get_arch_from_target(gpu_target) == GPUTarget::MIDGARD)
-    {
-        return false;
-    }
-
-    // Check if the cl_khr_image2d_from_buffer extension is supported on the target platform
-    if(!image2d_from_buffer_supported(device))
-    {
-        return false;
-    }
-
-    // Check cl image pitch alignment
-    if(get_cl_image_pitch_alignment(device) == 0)
-    {
-        return false;
-    }
-
-    const size_t image_w     = tensor->tensor_shape()[0] / 4;
-    const size_t image_h     = tensor->tensor_shape()[1] * tensor->tensor_shape()[2] * tensor->tensor_shape()[3];
-    const size_t max_image_w = device.getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>();
-    const size_t max_image_h = device.getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>();
-
-    if(image_w > max_image_w || image_h > max_image_h)
-    {
-        return false;
-    }
-
-    return true;
-}
-
 DirectConvComputeKernelInfo config_direct_convolution_nhwc(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info)
 {
     // Get GPU target
@@ -126,7 +83,6 @@ Status is_supported_op_helper(const GpuWorkloadContext &context,
     TensorInfo         dst_info_to_validate;
     const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate;
 
-    const DataLayout data_layout = src->data_layout();
     if(dst != nullptr)
     {
         dst_info_to_validate_ptr = dst;
@@ -151,9 +107,6 @@ Status is_supported_op_helper(const GpuWorkloadContext &context,
             const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
             auto       settings   = ClComponentDirectConv2d::Settings();
 
-            settings.export_to_cl_image(
-                export_to_cl_image_support(src, gpu_target, cl_compile_ctx->get_device(), data_layout));
-
             settings.fast_relaxed_math(
                 (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST)
                 && (dst_info_to_validate_ptr->data_type() == DataType::F32 || dst_info_to_validate_ptr->data_type() == DataType::F16));
@@ -251,7 +204,6 @@ ITensorInfo *GpuConv2d::create_op(GpuWorkloadSketch      &sketch,
 
     const auto sketch_ctx = sketch.implementation().context();
 
-    const auto data_layout = src->data_layout();
     const auto gpu_target  = sketch_ctx->gpu_target();
 
     if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
@@ -266,20 +218,17 @@ ITensorInfo *GpuConv2d::create_op(GpuWorkloadSketch      &sketch,
 
             auto settings = ClComponentDirectConv2d::Settings();
 
-            settings.export_to_cl_image(
-                export_to_cl_image_support(src, gpu_target, cl_compile_ctx->get_device(), data_layout));
-
             settings.fast_relaxed_math(
                 (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST)
                 && (dst->data_type() == DataType::F32 || dst->data_type() == DataType::F16));
 
+            settings.direct_conv_descriptor(desc);
+
             if(settings.export_to_cl_image())
             {
                 arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(wei);
             }
 
-            settings.direct_conv_descriptor(desc);
-
             ArgumentPack<ITensorInfo> arguments;
             arguments.add_const_tensor(ACL_SRC_0, src);
             arguments.add_const_tensor(ACL_SRC_1, wei);
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp
index e69103e263..ca531fe28e 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp
@@ -91,12 +91,12 @@ TILE(uint, M0, 1, g_dst_indirect_y);
 {
 #define _IWEI_WIDTH {{WEI_WIDTH}}
 #define _IWEI_HEIGHT {{WEI_HEIGHT}}
-#define _ISRC_WIDTH {{src}}_w
-#define _ISRC_HEIGHT {{src}}_h
-#define _ISRC_CHANNELS {{src}}_c
-#define _IDST_WIDTH {{arg_dst}}_w
-#define _IDST_HEIGHT {{arg_dst}}_h
-#define _IDST_CHANNELS {{arg_dst}}_c
+#define _ISRC_WIDTH {{SRC_WIDTH}}
+#define _ISRC_HEIGHT {{SRC_HEIGHT}}
+#define _ISRC_CHANNELS {{SRC_CHANNELS}}
+#define _IDST_WIDTH {{DST_WIDTH}}
+#define _IDST_HEIGHT {{DST_HEIGHT}}
+#define _IDST_CHANNELS {{DST_CHANNELS}}
 #define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT)
 
     TILE(int, M0, 1, xi);
@@ -214,8 +214,8 @@ code += R"_(
 code += R"_(
     LOOP_UNROLLING(int, i, 0, 1, M0,
     {
-        g_dst_indirect_y[i].v = (uint)min(g_ind_1 + i, (int)({{arg_dst}}_w * {{arg_dst}}_h) - 1);
-        g_dst_indirect_y[i].v += g_ind_2 * (int)({{arg_dst}}_w * {{arg_dst}}_h);
+        g_dst_indirect_y[i].v = (uint)min(g_ind_1 + i, (int)({{DST_WIDTH}} * {{DST_HEIGHT}}) - 1);
+        g_dst_indirect_y[i].v += g_ind_2 * (int)({{DST_WIDTH}} * {{DST_HEIGHT}});
     })
 }
 //------------------ END KERNEL {{meta_kernel_id}} ---------------------
@@ -294,9 +294,19 @@ TagLUT ClTemplateDirectConv2d::get_tag_lut(const GpuKernelVariableTable &vtable,
     }
     const auto width_idx  = 1;
     const auto height_idx = 2;
+    const auto channel_idx = 0;
+
+    lut["SRC_WIDTH"] = _src->dimension(width_idx);
+    lut["SRC_HEIGHT"] = _src->dimension(height_idx);
+    lut["SRC_CHANNELS"] = _src->dimension(channel_idx);
+
     lut["WEI_WIDTH"]      = _weight->dimension(width_idx);
     lut["WEI_HEIGHT"]     = _weight->dimension(height_idx);
 
+    lut["DST_WIDTH"] = _dst->dimension(width_idx);
+    lut["DST_HEIGHT"] = _dst->dimension(height_idx);
+    lut["DST_CHANNELS"] = _dst->dimension(channel_idx);
+
     lut["STRIDE_X"] = _attributes.stride().x();
     lut["STRIDE_Y"] = _attributes.stride().y();
author	Viet-Hoa Do <viet-hoa.do@arm.com>	2023-03-01 15:46:10 +0000
committer	Viet-Hoa Do <viet-hoa.do@arm.com>	2023-03-02 09:46:28 +0000
commit	e2e6d745c940cdfd8c3340fd1227dbef1badfb3c (patch)
tree	9af6bc878aa397025524d2eca303e0d4a5a1693a
parent	bbf2e7477be984702e1a51f2a23910ee8349b867 (diff)
download	ComputeLibrary-e2e6d745c940cdfd8c3340fd1227dbef1badfb3c.tar.gz