1 files changed, 40 insertions, 74 deletions
diff --git a/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp b/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp
index be4c8ef5b7..c4b70ca82b 100644
--- a/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp
+++ b/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp
@@ -23,11 +23,11 @@
  */
 #include "src/gpu/cl/kernels/ClDirectConv2dKernel.h"
 
-#include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
@@ -40,6 +40,7 @@
 #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
 #include "support/Cast.h"
 #include "support/StringSupport.h"
+
 namespace arm_compute
 {
 namespace opencl
@@ -49,7 +50,7 @@ namespace kernels
 namespace
 {
 Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
-                          const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+                          const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32);
@@ -83,6 +84,21 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co
         }
     }
 
+    if(data_layout == DataLayout::NHWC)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.n0 != 1 && desc.n0 != 2 && desc.n0 != 3 && desc.n0 != 4 && desc.n0 != 8 && desc.n0 != 16,
+                                        "N0 can only be: 1, 2, 3, 4, 8, and 16");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 1 && desc.k0 != 2 && desc.k0 != 3 && desc.k0 != 4 && desc.k0 != 8 && desc.k0 != 16,
+                                        "K0 can only be: 1, 2, 3, 4, 8, and 16");
+        if(desc.export_weights_to_cl_image)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 4 && desc.k0 != 8 && desc.k0 != 16,
+                                            "K0 can only be: 4, 8, and 16");
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(!export_weights_to_cl_image(weights),
+                                            "Export to CLImage is not supported for this weight configuration");
+        }
+    }
+
     if(biases != nullptr)
     {
         if(is_data_type_quantized_asymmetric(src->data_type()))
@@ -121,50 +137,6 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co
     }
     return Status{};
 }
-
-bool export_to_cl_image_support(ITensorInfo *tensor, GPUTarget gpu_target, DataLayout data_layout)
-{
-    if(tensor->tensor_shape()[0] % 4 || (data_layout != DataLayout::NHWC))
-    {
-        return false;
-    }
-
-    // If not floating point
-    if(!is_data_type_float(tensor->data_type()))
-    {
-        return false;
-    }
-
-    if(gpu_target == GPUTarget::G71 || get_arch_from_target(gpu_target) == GPUTarget::MIDGARD)
-    {
-        return false;
-    }
-
-    // Check if the cl_khr_image2d_from_buffer extension is supported on the target platform
-    if(!image2d_from_buffer_supported(CLKernelLibrary::get().get_device()))
-    {
-        return false;
-    }
-
-    // Check cl image pitch alignment
-    if(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0)
-    {
-        return false;
-    }
-
-    const size_t image_w     = tensor->tensor_shape()[0] / 4;
-    const size_t image_h     = tensor->tensor_shape()[1] * tensor->tensor_shape()[2] * tensor->tensor_shape()[3];
-    const size_t max_image_w = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>();
-    const size_t max_image_h = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>();
-
-    if(image_w > max_image_w || image_h > max_image_h)
-    {
-        return false;
-    }
-
-    return true;
-}
-
 } // namespace
 
 ClDirectConv2dKernel::ClDirectConv2dKernel()
@@ -173,12 +145,12 @@ ClDirectConv2dKernel::ClDirectConv2dKernel()
 }
 
 void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
-                                     const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+                                     const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
 
     // Perform validation
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, biases, dst, conv_info, act_info));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, biases, dst, conv_info, act_info, desc));
 
     const int conv_stride_x = std::get<0>(conv_info.stride());
     const int conv_stride_y = std::get<1>(conv_info.stride());
@@ -208,15 +180,12 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT
     Window win;
     if(_data_layout == DataLayout::NHWC)
     {
-        const unsigned int vec_size = std::min(static_cast<unsigned int>(dst->tensor_shape()[0]), 4u);
-        unsigned int       num_rows = 1U;
-        if(dst->tensor_shape()[0] > 16)
-        {
-            num_rows = src->data_type() == DataType::F32 ? 2U : 4U;
-        }
+        output_shape.collapse(2U, 1U);
+        const unsigned int n0 = adjust_vec_size(desc.n0, output_shape[0]);
+        const unsigned int m0 = adjust_vec_size(desc.m0, output_shape[1]);
 
         // Create window and update padding
-        win = calculate_max_window(output_shape, Steps(vec_size, num_rows));
+        win = calculate_max_window(output_shape, Steps(n0, m0));
     }
     else if(_data_layout == DataLayout::NCHW)
     {
@@ -233,16 +202,17 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT
     {
         kernel_name << "direct_convolution_nhwc";
 
-        const unsigned int n0                 = win.x().step();
-        const unsigned int m0                 = win.y().step();
-        const unsigned int k0                 = adjust_vec_size(is_data_type_quantized(data_type) ? 16u : 8u, src->dimension(channel_idx));
-        const unsigned int partial_store_n0   = dst->dimension(channel_idx) % n0;
-        const unsigned int pad_left           = conv_info.pad_left();
-        const unsigned int pad_top            = conv_info.pad_top();
-        const bool         export_to_cl_image = export_to_cl_image_support(weights, gpu_target, _data_layout);
+        const unsigned int n0               = win.x().step();
+        const unsigned int m0               = win.y().step();
+        const unsigned int k0               = adjust_vec_size(desc.k0, src->dimension(channel_idx));
+        const unsigned int partial_store_n0 = dst->dimension(channel_idx) % n0;
+        const unsigned int pad_left         = conv_info.pad_left();
+        const unsigned int pad_top          = conv_info.pad_top();
+
+        _export_to_cl_image = desc.export_weights_to_cl_image;
 
         // Update the padding for the weights tensor if we can export to cl_image
-        if(export_to_cl_image)
+        if(_export_to_cl_image)
         {
             gemm::update_padding_for_cl_image(weights);
         }
@@ -274,7 +244,7 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT
         build_options.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
         build_options.add_option("-DDST_TENSOR_TYPE=BUFFER");
         build_options.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(dst_data_type));
-        build_options.add_option_if_else(export_to_cl_image, "-DWEI_TENSOR_TYPE=IMAGE", "-DWEI_TENSOR_TYPE=BUFFER");
+        build_options.add_option_if_else(_export_to_cl_image, "-DWEI_TENSOR_TYPE=IMAGE", "-DWEI_TENSOR_TYPE=BUFFER");
         build_options.add_option("-DWEI_WIDTH=" + support::cpp11::to_string(weights->dimension(width_idx)));
         build_options.add_option("-DWEI_HEIGHT=" + support::cpp11::to_string(weights->dimension(height_idx)));
         build_options.add_option("-DWEI_DATA_TYPE=" + get_cl_type_from_data_type(weights->data_type()));
@@ -325,6 +295,8 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT
     }
     else
     {
+        _export_to_cl_image = false;
+
         kernel_name << "direct_convolution_nchw";
         build_options.add_option_if(biases != nullptr, std::string("-DHAS_BIAS"));
         build_options.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(width_idx)));
@@ -393,9 +365,9 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT
 }
 
 Status ClDirectConv2dKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
-                                      const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+                                      const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info, act_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info, act_info, desc));
     return Status{};
 }
 
@@ -416,13 +388,7 @@ void ClDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, cl
     {
         cl::Image2D weights_cl_image;
 
-        const size_t dim_y_collapsed    = ceil_to_multiple(dst->info()->dimension(1) * dst->info()->dimension(2), slice.y().step());
-        const bool   export_to_cl_image = export_to_cl_image_support(weights->info(), get_target(), _data_layout);
-
-        slice.set(Window::DimY, Window::Dimension(0, dim_y_collapsed, slice.y().step()));
-        slice.set(Window::DimZ, Window::Dimension(0, dst->info()->dimension(3), 1));
-
-        if(export_to_cl_image)
+        if(_export_to_cl_image)
         {
             const size_t      image_w = weights->info()->dimension(0) / 4;
             const size_t      image_h = weights->info()->dimension(1) * weights->info()->dimension(2) * weights->info()->dimension(3);
@@ -436,7 +402,7 @@ void ClDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, cl
         unsigned int idx = 0;
         add_4d_tensor_nhwc_argument(idx, src);
         add_4d_tensor_nhwc_argument(idx, dst);
-        if(export_to_cl_image)
+        if(_export_to_cl_image)
         {
             _kernel.setArg(idx++, weights_cl_image);
         }