aboutsummaryrefslogtreecommitdiff
path: root/src/core/CL/kernels/CLReductionOperationKernel.cpp
diff options
context:
space:
mode:
author Viet-Hoa Do <viet-hoa.do@arm.com> 2023-10-09 10:58:35 +0100
committer Viet-Hoa Do <viet-hoa.do@arm.com> 2023-10-11 10:01:49 +0000
commit c210c85548c7f627690ed9259622d3ab342fe612 (patch)
tree 6385edb5083a805bac8ddd83567a1e1dac0715ce /src/core/CL/kernels/CLReductionOperationKernel.cpp
parent fb9c25d27791d934300581596cce7c5875a79a80 (diff)
download ComputeLibrary-c210c85548c7f627690ed9259622d3ab342fe612.tar.gz
Optimize CL reduction operation
* Batch dimension is added to reduction operation.
  - All the dimensions higher than the batch dimension are collapsed so that the input and output tensors are always 3-4D.
  - CL kernel is called once instead of being repeatedly called to process each sliding window.

Resolves: COMPMID-6443
Signed-off-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Change-Id: Icd99939d52d3bb648f08537e5f52ef27e894061b
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10456
Reviewed-by: Jakub Sujak <jakub.sujak@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core/CL/kernels/CLReductionOperationKernel.cpp')
-rw-r--r-- src/core/CL/kernels/CLReductionOperationKernel.cpp 124
1 files changed, 81 insertions, 43 deletions
diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp
index 70875a2d40..c8665f8fbd 100644
--- a/src/core/CL/kernels/CLReductionOperationKernel.cpp
+++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp
@@ -204,9 +204,10 @@ void CLReductionOperationKernel::configure(const CLCompileContext &compile_conte
_kernel = create_kernel(compile_context, "reduction_operation_" + kernel_axis_name, build_opts.options());
// Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps(vec_size));
- win.set(Window::DimX,
- Window::Dimension(win.x().start(), win.x().end() * _input->info()->num_channels(), win.x().step()));
+ TensorShape actual_input_shape = input->info()->tensor_shape();
+ actual_input_shape[0] = width;
+
+ Window win = calculate_max_window(actual_input_shape, Steps(vec_size));
ICLKernel::configure_internal(win);
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
@@ -272,55 +273,92 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que
break;
case 1:
{
- // Get first input and output slices
- Window window_in{window};
- window_in.set(Window::DimY,
- Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1)));
- Window in_slice = window_in.first_slice_window_2D();
- Window out_slice = window.first_slice_window_2D();
-
- do
- {
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input, in_slice);
- add_2D_tensor_argument(idx, _output, out_slice);
- enqueue(queue, *this, in_slice);
- } while (window_in.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
+ bool has_collapsed = true;
+ Window actual_window = window.collapse_if_possible(window, 2, &has_collapsed);
+ ARM_COMPUTE_ERROR_ON(!has_collapsed);
+
+ actual_window = actual_window.shift_dimensions(1, Window::DimY);
+
+ const ITensorInfo *input_info = _input->info();
+ const Strides &input_strides = input_info->strides_in_bytes();
+
+ const ITensorInfo *output_info = _output->info();
+ const Strides &output_strides = output_info->strides_in_bytes();
+
+ unsigned int idx = 0;
+
+ _kernel.setArg(idx++, _input->cl_buffer());
+ _kernel.setArg<cl_uint>(idx++, input_strides[1]);
+ _kernel.setArg<cl_uint>(idx++, input_strides[2]);
+ _kernel.setArg<cl_uint>(idx++, input_info->offset_first_element_in_bytes());
+
+ _kernel.setArg(idx++, _output->cl_buffer());
+ _kernel.setArg<cl_uint>(idx++, output_strides[2]);
+ _kernel.setArg<cl_uint>(idx++, output_info->offset_first_element_in_bytes());
+
+ enqueue(queue, *this, actual_window);
}
break;
case 2:
{
- // Get first input and output slices
- Window window_in{window};
- window_in.set(Window::DimZ,
- Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2)));
- Window in_slice = window_in.first_slice_window_3D();
- Window out_slice = window.first_slice_window_3D();
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, in_slice);
- add_3D_tensor_argument(idx, _output, out_slice);
- enqueue(queue, *this, in_slice);
- } while (window_in.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(out_slice));
+ bool has_collapsed = true;
+ Window actual_window = window.collapse_if_possible(window, 3, &has_collapsed);
+ ARM_COMPUTE_ERROR_ON(!has_collapsed);
+
+ actual_window = actual_window.shift_dimensions(1, Window::DimZ);
+
+ const ITensorInfo *input_info = _input->info();
+ const Strides &input_strides = input_info->strides_in_bytes();
+
+ const ITensorInfo *output_info = _output->info();
+ const Strides &output_strides = output_info->strides_in_bytes();
+
+ unsigned int idx = 0;
+
+ _kernel.setArg(idx++, _input->cl_buffer());
+ _kernel.setArg<cl_uint>(idx++, input_strides[1]);
+ _kernel.setArg<cl_uint>(idx++, input_strides[2]);
+ _kernel.setArg<cl_uint>(idx++, input_strides[3]);
+ _kernel.setArg<cl_uint>(idx++, input_info->offset_first_element_in_bytes());
+
+ _kernel.setArg(idx++, _output->cl_buffer());
+ _kernel.setArg<cl_uint>(idx++, output_strides[1]);
+ _kernel.setArg<cl_uint>(idx++, output_strides[3]);
+ _kernel.setArg<cl_uint>(idx++, output_info->offset_first_element_in_bytes());
+
+ enqueue(queue, *this, actual_window);
}
break;
case 3:
{
- // Get first input and output slices
- Window window_in{window};
- window_in.set(3, Window::Dimension(0, 1, 1));
- Window in_slice = window_in.first_slice_window_4D();
- Window out_slice = window.first_slice_window_4D();
+ bool has_collapsed = true;
+ Window actual_window = window.shift_dimensions(1, Window::DimW);
- do
- {
- unsigned int idx = 0;
- add_4D_tensor_argument(idx, _input, in_slice);
- add_4D_tensor_argument(idx, _output, out_slice);
- enqueue(queue, *this, in_slice);
- } while (window_in.slide_window_slice_4D(in_slice) && window.slide_window_slice_4D(out_slice));
+ actual_window = actual_window.collapse_if_possible(actual_window, 2, &has_collapsed);
+ ARM_COMPUTE_ERROR_ON(!has_collapsed);
+
+ const ITensorInfo *input_info = _input->info();
+ const Strides &input_strides = input_info->strides_in_bytes();
+
+ const ITensorInfo *output_info = _output->info();
+ const Strides &output_strides = output_info->strides_in_bytes();
+
+ unsigned int idx = 0;
+
+ _kernel.setArg(idx++, _input->cl_buffer());
+ _kernel.setArg<cl_uint>(idx++, input_strides[1]);
+ _kernel.setArg<cl_uint>(idx++, input_strides[2]);
+ _kernel.setArg<cl_uint>(idx++, input_strides[3]);
+ _kernel.setArg<cl_uint>(idx++, input_strides[4]);
+ _kernel.setArg<cl_uint>(idx++, input_info->offset_first_element_in_bytes());
+
+ _kernel.setArg(idx++, _output->cl_buffer());
+ _kernel.setArg<cl_uint>(idx++, output_strides[1]);
+ _kernel.setArg<cl_uint>(idx++, output_strides[2]);
+ _kernel.setArg<cl_uint>(idx++, output_strides[4]);
+ _kernel.setArg<cl_uint>(idx++, output_info->offset_first_element_in_bytes());
+
+ enqueue(queue, *this, actual_window);
}
break;
default: