From 3704464b68ddd22739b38354de1721a3db4267b5 Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Tue, 30 Oct 2018 14:53:25 +0000 Subject: COMPMID-1703: Collapse the 4th dimensions in CLDepthWiseConvolutionLayer3x3Kernel Change-Id: Ie274da79b15c03f86dfedc85bb721b3de34a0bb4 --- .../CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp') diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp index 93d96dad1b..d76f5495f1 100644 --- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp +++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp @@ -245,6 +245,8 @@ void CLDepthwiseConvolutionLayer3x3NHWCKernel::configure(const ICLTensor *input, build_opts.add_option("-DCONV_STRIDE_X=" + support::cpp11::to_string(conv_stride_x)); build_opts.add_option("-DCONV_STRIDE_Y=" + support::cpp11::to_string(_conv_stride_y)); } + build_opts.add_option_if(_input->info()->tensor_shape().total_size_upper(3) > 1, + "-DDST_DEPTH=" + support::cpp11::to_string(static_cast<int>(std::ceil(_output->info()->dimension(2) / static_cast<float>(_num_planes_processed_per_iteration))))); // Create kernel std::string kernel_name = std::string("depthwise_convolution_3x3") + (is_qasymm ? 
std::string("_quantized") + ((is_dot8_supported @@ -291,8 +293,12 @@ void CLDepthwiseConvolutionLayer3x3NHWCKernel::run(const Window &window, cl::Com ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - Window win = window; - win.set(Window::DimZ, Window::Dimension(0, std::ceil(_output->info()->dimension(2) / static_cast<float>(_num_planes_processed_per_iteration)), 1)); + // Collapse window + Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + const size_t total_batches = _input->info()->tensor_shape().total_size_upper(3); + + Window win = window_collapsed; + win.set(Window::DimZ, Window::Dimension(0, std::ceil(_output->info()->dimension(2) / static_cast<float>(_num_planes_processed_per_iteration)) * total_batches, 1)); // Create input window and adjust Window win_in = win; @@ -301,10 +307,10 @@ void CLDepthwiseConvolutionLayer3x3NHWCKernel::run(const Window &window, cl::Com ARM_COMPUTE_ERROR_ON((win_in.y().step() < window.y().step()) || (win_in.z().step() < window.z().step())); - Window slice_in = win_in.first_slice_window_3D(); - Window slice_out = win.first_slice_window_3D(); + Window slice_in = win_in.first_slice_window_4D(); + Window slice_out = win.first_slice_window_4D(); - unsigned int idx = 3 * num_arguments_per_3D_tensor(); + unsigned int idx = 2 * num_arguments_per_4D_tensor() + num_arguments_per_3D_tensor(); if(_biases != nullptr) { @@ -321,11 +327,11 @@ void CLDepthwiseConvolutionLayer3x3NHWCKernel::run(const Window &window, cl::Com do { unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, slice_in); - add_3D_tensor_argument(idx, _output, slice_out); + add_4D_tensor_argument(idx, _input, slice_in); + add_4D_tensor_argument(idx, _output, slice_out); add_3D_tensor_argument(idx, _weights, slice_out); enqueue(queue, *this, slice_out, lws_hint()); } - while(window.slide_window_slice_3D(slice_out) && win_in.slide_window_slice_3D(slice_in)); + 
while(win.slide_window_slice_4D(slice_out) && win_in.slide_window_slice_4D(slice_in)); } -- cgit v1.2.1