diff options
Diffstat (limited to 'src/core/CL/kernels')
-rw-r--r-- | src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp | 21 | ||||
-rw-r--r-- | src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp | 30 | ||||
-rw-r--r-- | src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp | 15 |
3 files changed, 49 insertions, 17 deletions
diff --git a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp index 241dd8549d..d12255ff24 100644 --- a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp +++ b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp @@ -80,8 +80,12 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen output_access.set_valid_region(win, input->valid_region()); } + // Collapse along the Z direction + // This collapse needs to be here in order to tune the Z dimension of LWS + Window collapsed = win.collapse(win, Window::DimZ); + Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); + return std::make_pair(err, collapsed); } } // namespace @@ -136,6 +140,10 @@ void CLGEMMInterleave4x4Kernel::configure(const ICLTensor *input, ICLTensor *out _config_id += support::cpp11::to_string(output->info()->dimension(0)); _config_id += "_"; _config_id += support::cpp11::to_string(output->info()->dimension(1)); + _config_id += "_"; + _config_id += support::cpp11::to_string(output->info()->dimension(2)); + _config_id += "_"; + _config_id += support::cpp11::to_string(output->info()->dimension(3)); } Status CLGEMMInterleave4x4Kernel::validate(const ITensorInfo *input, const ITensorInfo *output, int mult_interleave4x4_height) @@ -160,15 +168,14 @@ void CLGEMMInterleave4x4Kernel::run(const Window &window, cl::CommandQueue &queu * * After this operation, the output matrix will have the following shape: [ height * 4, width / 4 ] */ - Window in_slice = window.first_slice_window_2D(); - Window out_slice = window.first_slice_window_2D(); + Window slice = window.first_slice_window_3D(); do { unsigned int idx = 0; - add_2D_tensor_argument(idx, _input, in_slice); - add_2D_tensor_argument(idx, _output, out_slice); - enqueue(queue, *this, in_slice, _lws_hint); + add_3D_tensor_argument(idx, _input, slice); + add_3D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice, _lws_hint); } - while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice)); + while(window.slide_window_slice_3D(slice)); } diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp index 3143075a9d..6655d12d7e 100644 --- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp +++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp @@ -158,8 +158,17 @@ inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *inpu output_access.set_valid_region(win, ValidRegion(coord, output->tensor_shape())); } + // Collapse along the Z direction + // This collapse needs to be here in order to tune the Z dimension of LWS + Window collapsed = win; + if(input1->num_dimensions() > 1) + { + const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(input1->num_dimensions() - 1), 2u); + collapsed = win.collapse(win, dimension_to_collapse); + } + Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); + return std::make_pair(err, collapsed); } } // namespace @@ -286,6 +295,10 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen _config_id += "_"; _config_id += support::cpp11::to_string(output->info()->dimension(0)); _config_id += "_"; + _config_id += support::cpp11::to_string(output->info()->dimension(2)); + _config_id += "_"; + _config_id += support::cpp11::to_string(output->info()->dimension(3)); + _config_id += "_"; _config_id += (is_interleaved_transposed ? support::cpp11::to_string(input1->info()->dimension(0)) : support::cpp11::to_string(input1->info()->dimension(1))); } @@ -312,7 +325,13 @@ void CLGEMMMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &que ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - Window slice = window.first_slice_window_2D(); + if(_input1->info()->num_dimensions() < 3) + { + // The stride_z for matrix B must be zero if we do not slice + ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0); + } + + Window slice = window.first_slice_window_3D(); Window slice_matrix_b = slice; slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); @@ -322,7 +341,7 @@ void CLGEMMMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &que { Window slice_b = slice; // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 - // This scenario can happen when the the matrix multiplication is used to perform a convolution operation + // This scenario can happen when the matrix multiplication is used to perform a convolution operation if(_input1->info()->num_dimensions() < 3) { slice_b = slice_matrix_b; @@ -332,7 +351,10 @@ void CLGEMMMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &que add_2D_tensor_argument(idx, _input0, slice); add_2D_tensor_argument(idx, _input1, slice_b); add_2D_tensor_argument(idx, _output, slice); + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[3])); + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[3])); + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[3])); enqueue(queue, *this, slice, _lws_hint); } - while(window.slide_window_slice_2D(slice)); + while(window.slide_window_slice_3D(slice)); } diff --git a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp index 24d218760e..5489fde818 100644 --- a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp +++ b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp @@ -86,8 +86,11 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), input->tensor_shape())); } + // Collapse along the Z direction + Window collapsed = win.collapse(win, Window::DimZ); + Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); + return std::make_pair(err, collapsed); } } // namespace @@ -151,15 +154,15 @@ void CLGEMMTranspose1xWKernel::run(const Window &window, cl::CommandQueue &queue out_window.set(Window::DimX, window.y()); out_window.set(Window::DimY, window.x()); - Window in_slice = window.first_slice_window_2D(); - Window out_slice = out_window.first_slice_window_2D(); + Window in_slice = window.first_slice_window_3D(); + Window out_slice = out_window.first_slice_window_3D(); do { unsigned int idx = 0; - add_2D_tensor_argument(idx, _input, in_slice); - add_2D_tensor_argument(idx, _output, out_slice); + add_3D_tensor_argument(idx, _input, in_slice); + add_3D_tensor_argument(idx, _output, out_slice); enqueue(queue, *this, in_slice, _lws_hint); } - while(window.slide_window_slice_2D(in_slice) && out_window.slide_window_slice_2D(out_slice)); + while(window.slide_window_slice_3D(in_slice) && out_window.slide_window_slice_3D(out_slice)); } |