From 45091736a9276919ececee0cba106228246341f8 Mon Sep 17 00:00:00 2001
From: Michalis Spyrou <michalis.spyrou@arm.com>
Date: Mon, 13 May 2019 17:41:01 +0100
Subject: COMPMID-2184: Implement direct convolution 9x9 (NHWC) on OpenCL

Change-Id: I8aa929e7e72d2d1ccee07ee2ed9618c15084ae9d
Signed-off-by: giuros01 <giuseppe.rossini@arm.com>
Reviewed-on: https://review.mlplatform.org/c/1274
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
---
 .../CL/kernels/CLDirectConvolutionLayerKernel.cpp  | 54 +++++++++++++++++++---
 1 file changed, 48 insertions(+), 6 deletions(-)

(limited to 'src/core/CL/kernels')

diff --git a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
index 3e158a52ff..b878a2121f 100644
--- a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
@@ -54,14 +54,15 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
     const int        channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
 
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != weights->dimension(height_idx), "Weights should have same width and height");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 && weights->dimension(width_idx) != 5,
-                                    "Kernel sizes other than 1x1, 3x3 or 5x5 are not supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 && weights->dimension(width_idx) != 5 && weights->dimension(width_idx) != 9,
+                                    "Kernel sizes other than 1x1, 3x3, 5x5 or 9x9 are not supported");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != input->dimension(channel_idx),
                                     "Weights feature map dimension should match the respective input's one");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 1) && std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported for 1x1 convolution.");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 3 || weights->dimension(width_idx) == 5) && std::get<0>(conv_info.stride()) > 2,
                                     "Strides larger than 2 not supported for 3x3 convolution.");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 9) && data_layout == DataLayout::NCHW, "Only NHWC layout is supported for 9x9 convolution.");
 
     if(biases != nullptr)
     {
@@ -103,6 +104,19 @@ inline bool can_run_optimized_kernel_for_bifrost(GPUTarget gpu_target, unsigned
            && (data_layout == DataLayout::NCHW);
 }
 
+/** Tells whether the Bifrost NHWC direct convolution variant applies: true only for a listed GPU target
+ *  combined with a 9x9 kernel, unit stride in X and Y, F32 data and NHWC layout. */
+inline bool can_run_optimized_kernel_for_bifrost_nhwc(GPUTarget gpu_target, unsigned int conv_stride_x, unsigned int conv_stride_y, unsigned int kernel_size,
+                                                      DataType data_type, DataLayout data_layout)
+{
+    // GPU targets accepted by this check.
+    const bool is_listed_target = gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G76, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::G52, GPUTarget::G52LIT);
+    const bool has_unit_strides = (conv_stride_x == 1) && (conv_stride_y == 1);
+    const bool is_f32_nhwc      = (data_type == DataType::F32) && (data_layout == DataLayout::NHWC);
+
+    return is_listed_target && (kernel_size == 9) && has_unit_strides && is_f32_nhwc;
+}
+
 inline void setup_num_elems(unsigned int &num_elems_read_per_iteration_x, unsigned int &num_elems_read_per_iteration_y,
                             unsigned int &num_elems_written_per_iteration_x, unsigned int &num_elems_written_per_iteration_y,
                             unsigned int kernel_size, const PadStrideInfo &conv_info, const GPUTarget target, ITensorInfo *input)
@@ -149,7 +163,7 @@ inline void setup_num_elems(unsigned int &num_elems_read_per_iteration_x, unsign
             }
         }
     }
-    else
+    else if(data_layout == DataLayout::NCHW)
     {
         num_elems_read_per_iteration_y    = kernel_size;
         num_elems_written_per_iteration_x = 8;
@@ -215,11 +229,17 @@ inline void setup_num_elems(unsigned int &num_elems_read_per_iteration_x, unsign
                 ARM_COMPUTE_ERROR("Invalid direct convolution size");
         }
     }
-
-    if(data_layout == DataLayout::NHWC)
+    else // data_layout == NHWC
     {
+        const bool run_optimized_bifrost_nhwc = can_run_optimized_kernel_for_bifrost_nhwc(target, conv_stride_x, conv_stride_y, kernel_size, data_type, data_layout);
+
         num_elems_written_per_iteration_x = 1;
-        num_elems_read_per_iteration_x    = 1;
+
+        if(run_optimized_bifrost_nhwc)
+        {
+            num_elems_read_per_iteration_x = 4;
+        }
+
         switch(kernel_size)
         {
             case 1:
@@ -267,6 +287,21 @@ inline void setup_num_elems(unsigned int &num_elems_read_per_iteration_x, unsign
                         ARM_COMPUTE_ERROR("Invalid convolution stride X");
                 }
                 break;
+            case 9:
+                switch(conv_stride_x)
+                {
+                    case 1:
+                        num_elems_read_per_iteration_y    = 16;
+                        num_elems_written_per_iteration_y = 8;
+                        break;
+                    case 2:
+                        num_elems_read_per_iteration_y    = 24;
+                        num_elems_written_per_iteration_y = 8;
+                        break;
+                    default:
+                        ARM_COMPUTE_ERROR("Invalid convolution stride X");
+                }
+                break;
             default:
                 ARM_COMPUTE_ERROR("Not implemented.");
                 break;
@@ -429,6 +464,7 @@ void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICL
         build_options.add_option(std::string("-DSTRIDE_X=" + support::cpp11::to_string(_conv_stride_x)));
         if(data_layout == DataLayout::NHWC)
         {
+            const bool run_optimized_for_bifrost_nhwc = can_run_optimized_kernel_for_bifrost_nhwc(gpu_target, _conv_stride_x, _conv_stride_y, kernel_size, data_type, data_layout);
             build_options.add_option(std::string("-DDATA_LAYOUT_NHWC=1"));
             build_options.add_option(std::string("-DDST_HEIGHT=" + support::cpp11::to_string(_output->info()->dimension(height_idx))));
             build_options.add_option(std::string("-DDST_WIDTH=" + support::cpp11::to_string(_output->info()->dimension(width_idx))));
@@ -437,6 +473,12 @@ void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICL
             build_options.add_option(std::string("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left())));
             build_options.add_option(std::string("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_top())));
             build_options.add_option(std::string("-DSTRIDE_Y=" + support::cpp11::to_string(_conv_stride_y)));
+            if(run_optimized_for_bifrost_nhwc)
+            {
+                const unsigned int num_elems_read_per_iteration_x = 4;
+                _border_size.right                                = num_elems_read_per_iteration_x;
+                build_options.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_read_per_iteration_x));
+            }
         }
         build_options.add_option(std::string("-DDATA_TYPE_PROMOTED=" + get_cl_type_from_data_type(data_type)));
         // Create kernel
-- 
cgit v1.2.1