COMPMID-935 - Implementing Convolution with Winograd on OpenCL (Part 1)

This patch enables GEMM to execute multiple batches in parallel https://confluence.arm.com/display/MLENG/Winograd%3A+batched+GEMM Change-Id: I66222db041dd35e82af11fbb262fd1ebd3ca4b2f Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/120866 Tested-by: Jenkins <bsgcomp@arm.com> Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
author: Gian Marco <gianmarco.iodice@arm.com> 2018-02-15 12:35:44 +0000
committer: Anthony Barbier <anthony.barbier@arm.com> 2018-11-02 16:48:33 +0000
commit: ae2af74ae4368004221a41e6891e0173453996ac (patch)
tree: a9d16fd683ee45e1caf071c0175c9d61cb99fdc3 /src/core/CL/kernels
parent: d56e770e7c394d13706a21ee350e7dafe4278987 (diff)
download: ComputeLibrary-ae2af74ae4368004221a41e6891e0173453996ac.tar.gz
3 files changed, 49 insertions, 17 deletions
diff --git a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
index 241dd8549d..d12255ff24 100644
--- a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
+++ b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
@@ -80,8 +80,12 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
         output_access.set_valid_region(win, input->valid_region());
     }
 
+    // Collapse along the Z direction
+    // This collapse needs to be here in order to tune the Z dimension of LWS
+    Window collapsed = win.collapse(win, Window::DimZ);
+
     Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
+    return std::make_pair(err, collapsed);
 }
 } // namespace
 
@@ -136,6 +140,10 @@ void CLGEMMInterleave4x4Kernel::configure(const ICLTensor *input, ICLTensor *out
     _config_id += support::cpp11::to_string(output->info()->dimension(0));
     _config_id += "_";
     _config_id += support::cpp11::to_string(output->info()->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(2));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(3));
 }
 
 Status CLGEMMInterleave4x4Kernel::validate(const ITensorInfo *input, const ITensorInfo *output, int mult_interleave4x4_height)
@@ -160,15 +168,14 @@ void CLGEMMInterleave4x4Kernel::run(const Window &window, cl::CommandQueue &queu
      *
      * After this operation, the output matrix will have the following shape: [ height * 4, width / 4 ]
      */
-    Window in_slice  = window.first_slice_window_2D();
-    Window out_slice = window.first_slice_window_2D();
+    Window slice = window.first_slice_window_3D();
 
     do
     {
         unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input, in_slice);
-        add_2D_tensor_argument(idx, _output, out_slice);
-        enqueue(queue, *this, in_slice, _lws_hint);
+        add_3D_tensor_argument(idx, _input, slice);
+        add_3D_tensor_argument(idx, _output, slice);
+        enqueue(queue, *this, slice, _lws_hint);
     }
-    while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
+    while(window.slide_window_slice_3D(slice));
 }
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
index 3143075a9d..6655d12d7e 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
@@ -158,8 +158,17 @@ inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *inpu
         output_access.set_valid_region(win, ValidRegion(coord, output->tensor_shape()));
     }
 
+    // Collapse along the Z direction
+    // This collapse needs to be here in order to tune the Z dimension of LWS
+    Window collapsed = win;
+    if(input1->num_dimensions() > 1)
+    {
+        const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(input1->num_dimensions() - 1), 2u);
+        collapsed                                = win.collapse(win, dimension_to_collapse);
+    }
+
     Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
+    return std::make_pair(err, collapsed);
 }
 } // namespace
 
@@ -286,6 +295,10 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen
     _config_id += "_";
     _config_id += support::cpp11::to_string(output->info()->dimension(0));
     _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(2));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(3));
+    _config_id += "_";
     _config_id += (is_interleaved_transposed ? support::cpp11::to_string(input1->info()->dimension(0)) : support::cpp11::to_string(input1->info()->dimension(1)));
 }
 
@@ -312,7 +325,13 @@ void CLGEMMMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &que
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    Window slice          = window.first_slice_window_2D();
+    if(_input1->info()->num_dimensions() < 3)
+    {
+        // The stride_z for matrix B must be zero if we do not slice
+        ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
+    }
+
+    Window slice          = window.first_slice_window_3D();
     Window slice_matrix_b = slice;
 
     slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
@@ -322,7 +341,7 @@ void CLGEMMMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &que
     {
         Window slice_b = slice;
         // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
-        // This scenario can happen when the the matrix multiplication is used to perform a convolution operation
+        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
         if(_input1->info()->num_dimensions() < 3)
         {
             slice_b = slice_matrix_b;
@@ -332,7 +351,10 @@ void CLGEMMMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &que
         add_2D_tensor_argument(idx, _input0, slice);
         add_2D_tensor_argument(idx, _input1, slice_b);
         add_2D_tensor_argument(idx, _output, slice);
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[3]));
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[3]));
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[3]));
         enqueue(queue, *this, slice, _lws_hint);
     }
-    while(window.slide_window_slice_2D(slice));
+    while(window.slide_window_slice_3D(slice));
 }
diff --git a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
index 24d218760e..5489fde818 100644
--- a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
@@ -86,8 +86,11 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
         output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), input->tensor_shape()));
     }
 
+    // Collapse along the Z direction
+    Window collapsed = win.collapse(win, Window::DimZ);
+
     Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
+    return std::make_pair(err, collapsed);
 }
 } // namespace
 
@@ -151,15 +154,15 @@ void CLGEMMTranspose1xWKernel::run(const Window &window, cl::CommandQueue &queue
     out_window.set(Window::DimX, window.y());
     out_window.set(Window::DimY, window.x());
 
-    Window in_slice  = window.first_slice_window_2D();
-    Window out_slice = out_window.first_slice_window_2D();
+    Window in_slice  = window.first_slice_window_3D();
+    Window out_slice = out_window.first_slice_window_3D();
 
     do
     {
         unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _input, in_slice);
-        add_2D_tensor_argument(idx, _output, out_slice);
+        add_3D_tensor_argument(idx, _input, in_slice);
+        add_3D_tensor_argument(idx, _output, out_slice);
         enqueue(queue, *this, in_slice, _lws_hint);
     }
-    while(window.slide_window_slice_2D(in_slice) && out_window.slide_window_slice_2D(out_slice));
+    while(window.slide_window_slice_3D(in_slice) && out_window.slide_window_slice_3D(out_slice));
 }
author	Gian Marco <gianmarco.iodice@arm.com>	2018-02-15 12:35:44 +0000
committer	Anthony Barbier <anthony.barbier@arm.com>	2018-11-02 16:48:33 +0000
commit	ae2af74ae4368004221a41e6891e0173453996ac (patch)
tree	a9d16fd683ee45e1caf071c0175c9d61cb99fdc3 /src/core/CL/kernels
parent	d56e770e7c394d13706a21ee350e7dafe4278987 (diff)
download	ComputeLibrary-ae2af74ae4368004221a41e6891e0173453996ac.tar.gz