From c210c85548c7f627690ed9259622d3ab342fe612 Mon Sep 17 00:00:00 2001
From: Viet-Hoa Do
Date: Mon, 9 Oct 2023 10:58:35 +0100
Subject: Optimize CL reduction operation

* Batch dimension is added to reduction operation.
  - All the dimensions higher than the batch dimension are collapsed so that
    the input and output tensors are always 3-4D.
  - CL kernel is called once instead of being repeatedly called to process
    each sliding window.

Resolves: COMPMID-6443
Signed-off-by: Viet-Hoa Do
Change-Id: Icd99939d52d3bb648f08537e5f52ef27e894061b
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10456
Reviewed-by: Jakub Sujak
Tested-by: Arm Jenkins
Benchmark: Arm Jenkins
Comments-Addressed: Arm Jenkins
---
 arm_compute/core/Window.h                          |  14 ++-
 arm_compute/core/Window.inl                        |  20 +++-
 docs/user_guide/release_version_and_change_log.dox |   1 +
 .../CL/cl_kernels/common/reduction_operation.cl    |  92 ++++++++-------
 src/core/CL/kernels/CLReductionOperationKernel.cpp | 124 ++++++++++++++-------
 5 files changed, 156 insertions(+), 95 deletions(-)

diff --git a/arm_compute/core/Window.h b/arm_compute/core/Window.h
index 4863b95045..e93d2863c9 100644
--- a/arm_compute/core/Window.h
+++ b/arm_compute/core/Window.h
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_WINDOW_H
-#define ARM_COMPUTE_WINDOW_H
+#ifndef ACL_ARM_COMPUTE_CORE_WINDOW_H
+#define ACL_ARM_COMPUTE_CORE_WINDOW_H
 
 #include "arm_compute/core/Coordinates.h"
 #include "arm_compute/core/Error.h"
@@ -213,15 +213,17 @@ public:
      */
     void shift(size_t dimension, int shift_value);
 
-    /** Shift down all the dimensions of a window
+    /** Shift down all the dimensions of a window starting from the specified dimension.
      *
-     * i.e new_dims[n] = old_dims[n+shift_value].
+     * new_dims[i] = old_dims[i] for all i < start_dim.
+     * new_dims[i] = old_dims[i+shift_value] for all i >= start_dim.
      *
      * @param[in] shift_value Number of dimensions to shift the window by.
+     * @param[in] start_dim   The dimension from which shifting starts.
      *
      * @return The window with the shifted dimensions.
      */
-    Window shift_dimensions(unsigned int shift_value) const;
+    Window shift_dimensions(unsigned int shift_value, unsigned int start_dim = 0) const;
 
    /** Adjust the start or end of a given dimension by the given value
     *
@@ -460,4 +462,4 @@ private:
 };
 } // namespace arm_compute
 #include "Window.inl"
-#endif /*ARM_COMPUTE_WINDOW_H */
+#endif // ACL_ARM_COMPUTE_CORE_WINDOW_H
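[Editorial sketch, not part of the patch: a standalone illustration of the index mapping described by the new shift_dimensions() documentation above. The plain array stands in for the window's dimension list.]

    #include <array>
    #include <cstdio>

    int main()
    {
        // new_dims[i] = old_dims[i]               for i <  start_dim
        // new_dims[i] = old_dims[i + shift_value] for i >= start_dim
        constexpr std::size_t num_dims = 6;
        const std::array<int, num_dims> old_dims{10, 20, 30, 40, 50, 60};
        std::array<int, num_dims> new_dims{}; // dims shifted in from beyond the end stay 0

        const unsigned int shift_value = 1;
        const unsigned int start_dim   = 1; // e.g. Window::DimY

        for (std::size_t i = 0; i < start_dim; ++i)
            new_dims[i] = old_dims[i];
        for (std::size_t i = start_dim; i < num_dims - shift_value; ++i)
            new_dims[i] = old_dims[i + shift_value];

        for (int d : new_dims)
            std::printf("%d ", d); // prints: 10 30 40 50 60 0
        return 0;
    }

With start_dim = 0 (the default) this reduces to the old behaviour, so existing callers are unaffected.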
diff --git a/arm_compute/core/Window.inl b/arm_compute/core/Window.inl
index d935507b1d..0f7c4fbdd7 100644
--- a/arm_compute/core/Window.inl
+++ b/arm_compute/core/Window.inl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020, 2022 Arm Limited.
+ * Copyright (c) 2016-2020, 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,6 +21,10 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+
+#ifndef ACL_ARM_COMPUTE_CORE_WINDOW_INL
+#define ACL_ARM_COMPUTE_CORE_WINDOW_INL
+
 namespace arm_compute
 {
 inline Window::Window(const Window &src)
@@ -100,13 +104,21 @@ inline Window Window::collapse_if_possible(const Window &full_window,
     return collapsed;
 }
 
-inline Window Window::shift_dimensions(unsigned int shift_value) const
+inline Window Window::shift_dimensions(unsigned int shift_value, unsigned int start_dim) const
 {
     Window shifted_window;
-    for (size_t n = 0; n < (Coordinates::num_max_dimensions - shift_value); n++)
+    size_t n = 0;
+
+    for (; n < start_dim; ++n)
+    {
+        shifted_window.set(n, _dims[n]);
+    }
+
+    for (; n < (Coordinates::num_max_dimensions - shift_value); n++)
     {
         shifted_window.set(n, _dims[n + shift_value]);
     }
+
     return shifted_window;
 }
 
@@ -313,3 +325,5 @@ inline bool operator==(const Window &lhs, const Window &rhs)
     return (lhs._dims == rhs._dims) && (lhs._is_broadcasted == rhs._is_broadcasted);
 }
 } // namespace arm_compute
+
+#endif // ACL_ARM_COMPUTE_CORE_WINDOW_INL
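[Editorial sketch, not part of the change: how the two Window helpers combine. This mirrors the pattern CLReductionOperationKernel::run() adopts later in this patch and assumes the arm_compute headers are available.]

    #include "arm_compute/core/Window.h"

    using namespace arm_compute;

    // For a reduction along Y: fold every dimension from index 2 upwards into
    // one, then drop the reduced Y dimension so the kernel is enqueued over
    // (x, z) only. start_dim keeps dimension 0 (the vectorized X) in place.
    Window make_y_reduction_window(const Window &full)
    {
        bool   has_collapsed = true;
        Window win           = full.collapse_if_possible(full, 2, &has_collapsed);
        win                  = win.shift_dimensions(1, Window::DimY);
        return win;
    }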
diff --git a/docs/user_guide/release_version_and_change_log.dox b/docs/user_guide/release_version_and_change_log.dox
index d1429b61d7..b2500944ca 100644
--- a/docs/user_guide/release_version_and_change_log.dox
+++ b/docs/user_guide/release_version_and_change_log.dox
@@ -56,6 +56,7 @@ v23.11 Public major release
   - Optimize @ref cpu::CpuReshape
   - Optimize @ref opencl::ClTranspose
   - Optimize @ref NEStackLayer
+  - Optimize @ref CLReductionOperation
 - Add new OpenCL™ kernels:
   - @ref opencl::kernels::ClMatMulLowpNativeMMULKernel support for QASYMM8 and QASYMM8_SIGNED, with batch support
 - Deprecate support for Bfloat16 in @ref cpu::CpuCast.
diff --git a/src/core/CL/cl_kernels/common/reduction_operation.cl b/src/core/CL/cl_kernels/common/reduction_operation.cl
index 1cb6664078..99369be19a 100644
--- a/src/core/CL/cl_kernels/common/reduction_operation.cl
+++ b/src/core/CL/cl_kernels/common/reduction_operation.cl
@@ -186,27 +186,28 @@ __kernel void reduction_operation_non_parallel_x(
  * @note The height size must be passed at compile time using -DHEIGHT e.g. -DHEIGHT=128
  *
  * @param[in] input_ptr                             Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
- * @param[in] input_stride_x                        Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] input_stride_y                         Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y                          input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z                        Stride of the source tensor in Z dimension (in bytes)
 * @param[in] input_offset_first_element_in_bytes    The offset of the first element in the source tensor
 * @param[in] output_ptr                             The local buffer to hold summed values. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x                       Stride of the output tensor in X dimension (in bytes)
- * @param[in] output_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y                       Stride of the output tensor in Y dimension (in bytes)
- * @param[in] output_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z                       Stride of the output tensor in Z dimension (in bytes)
 * @param[in] output_offset_first_element_in_bytes   The offset of the first element in the destination tensor
 */
 __kernel void reduction_operation_y(
-    IMAGE_DECLARATION(input),
-    IMAGE_DECLARATION(output))
+    __global uchar *input_ptr,
+    uint input_stride_y,
+    uint input_stride_z,
+    uint input_offset_first_element_in_bytes,
+
+    __global uchar *output_ptr,
+    uint output_stride_z,
+    uint output_offset_first_element_in_bytes)
 {
     int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
-    int y = get_global_id(1);
+    int z = get_global_id(1);
 
-    __global uchar *input_addr  = input_ptr + input_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * input_stride_y;
-    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * output_stride_y;
+    __global uchar *input_addr  = input_ptr + input_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + z * input_stride_z;
+    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + z * output_stride_z;
 
     VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)
     res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE));
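[Editorial note: the x-coordinate clamp shared by these kernels makes the final, partially filled vector overlap the previous one instead of reading past the end of the row. A standalone check of that arithmetic, with illustrative sizes, not patch code:]

    #include <algorithm>
    #include <cstdio>

    // Mirrors: x = max((int)(gid0 * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0)
    // With a row of 20 elements and VEC_SIZE = 16, VEC_SIZE_LEFTOVER = 20 % 16 = 4,
    // so the second workitem starts at x = 4 and its 16-wide load overlaps the
    // first one rather than running out of bounds.
    int main()
    {
        const int VEC_SIZE          = 16;
        const int VEC_SIZE_LEFTOVER = 4;

        for (int gid0 = 0; gid0 < 2; ++gid0)
        {
            const int x = std::max(gid0 * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE, 0);
            std::printf("workitem %d starts at x = %d\n", gid0, x); // 0, then 4
        }
        return 0;
    }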
@@ -275,32 +276,33 @@ __kernel void reduction_operation_y(
  * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
  *
  * @param[in] input_ptr                             Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
- * @param[in] input_stride_x                        Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] input_stride_y                         Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y                          input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] input_stride_z                         Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z                          input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_stride_w                        Stride of the source tensor in W dimension (in bytes)
 * @param[in] input_offset_first_element_in_bytes    The offset of the first element in the source tensor
 * @param[in] output_ptr                             The local buffer to hold summed values. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x                       Stride of the output tensor in X dimension (in bytes)
- * @param[in] output_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] output_stride_y                        Stride of the output tensor in Y dimension (in bytes)
- * @param[in] output_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z                       Stride of the output tensor in Z dimension (in bytes)
- * @param[in] output_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w                       Stride of the output tensor in W dimension (in bytes)
 * @param[in] output_offset_first_element_in_bytes   The offset of the first element in the destination tensor
 */
 __kernel void reduction_operation_z(
-    TENSOR3D_DECLARATION(input),
-    TENSOR3D_DECLARATION(output))
+    __global uchar *input_ptr,
+    uint input_stride_y,
+    uint input_stride_z,
+    uint input_stride_w,
+    uint input_offset_first_element_in_bytes,
+
+    __global uchar *output_ptr,
+    uint output_stride_y,
+    uint output_stride_w,
+    uint output_offset_first_element_in_bytes)
 {
     int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
     int y = get_global_id(1);
-    int z = get_global_id(2);
+    int w = get_global_id(2);
 
-    __global uchar *input_addr  = input_ptr + input_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * input_stride_y + z * input_stride_z;
-    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * output_stride_y + z * output_stride_z;
+    __global uchar *input_addr  = input_ptr + input_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * input_stride_y + w * input_stride_w;
+    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * output_stride_y + w * output_stride_w;
 
     VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)
     res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE));
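[Editorial note: each variant loads the input and immediately CONVERTs it to DATA_TYPE_PROMOTED before accumulating. A standalone illustration of why the wider accumulator matters; the uchar-to-uint mapping below is an assumption for the quantized types, chosen only for the example:]

    #include <cstdint>
    #include <cstdio>

    int main()
    {
        // Summing 200 values of 200 each: an 8-bit accumulator wraps modulo 256,
        // a 32-bit accumulator holds the exact result.
        std::uint8_t  narrow = 0;
        std::uint32_t wide   = 0;

        for (int i = 0; i < 200; ++i)
        {
            narrow = static_cast<std::uint8_t>(narrow + 200);
            wide += 200;
        }
        std::printf("8-bit: %u, 32-bit: %u\n",
                    static_cast<unsigned>(narrow), static_cast<unsigned>(wide));
        // 8-bit: 64 (40000 % 256), 32-bit: 40000
        return 0;
    }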
@@ -369,39 +371,43 @@ __kernel void reduction_operation_z(
  *
  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
  * @note The batch size must be passed at compile time using -DBATCH e.g. -DBATCH=128
- * @note The depth size must be passed at compile time using -DBATCH e.g. -DDEPTH=128
+ * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
 *
 * @param[in] input_ptr                              Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
- * @param[in] input_stride_x                        Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] input_stride_y                         Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y                          input_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] input_stride_z                         Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z                          input_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] input_stride_w                         Stride of the source tensor in W dimension (in bytes)
- * @param[in] input_step_w                          input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] input_stride_v                        Stride of the source tensor in V dimension (in bytes)
 * @param[in] input_offset_first_element_in_bytes    The offset of the first element in the source tensor
 * @param[in] output_ptr                             The local buffer to hold summed values. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x                       Stride of the output tensor in X dimension (in bytes)
- * @param[in] output_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] output_stride_y                        Stride of the output tensor in Y dimension (in bytes)
- * @param[in] output_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] output_stride_z                        Stride of the output tensor in Z dimension (in bytes)
- * @param[in] output_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_stride_w                       Stride of the output tensor in W dimension (in bytes)
- * @param[in] output_step_w                         output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_stride_v                       Stride of the output tensor in V dimension (in bytes)
 * @param[in] output_offset_first_element_in_bytes   The offset of the first element in the destination tensor
 */
 __kernel void reduction_operation_w(
-    TENSOR4D_DECLARATION(input),
-    TENSOR4D_DECLARATION(output))
+    __global uchar *input_ptr,
+    uint input_stride_y,
+    uint input_stride_z,
+    uint input_stride_w,
+    uint input_stride_v,
+    uint input_offset_first_element_in_bytes,
+
+    __global uchar *output_ptr,
+    uint output_stride_y,
+    uint output_stride_z,
+    uint output_stride_v,
+    uint output_offset_first_element_in_bytes)
 {
     int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
     int y = get_global_id(1);
-    int z = get_global_id(2);
 
-    __global uchar *input_addr  = input_ptr + input_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * input_stride_y + (z % DEPTH) * input_stride_z + (z / DEPTH) * input_stride_w;
-    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * output_stride_y + (z % DEPTH) * output_stride_z + (z / DEPTH) * output_stride_z;
+    int gid_2 = get_global_id(2);
+    int z     = gid_2 % DEPTH;
+    int v     = gid_2 / DEPTH;
+
+    __global uchar *input_addr  = input_ptr + input_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * input_stride_y + z * input_stride_z + v * input_stride_v;
+    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * output_stride_y + z * output_stride_z + v * output_stride_v;
 
     VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)
     res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE));
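[Editorial note: reduction_operation_w folds the depth (z) and batch (v) coordinates into the single third NDRange dimension and recovers them with % and /. Incidentally, the removed line above also fixes a bug: the old output address scaled the batch index by output_stride_z instead of the W stride. A standalone sketch of the decomposition, with made-up DEPTH and BATCH values:]

    #include <cstdio>

    int main()
    {
        const int DEPTH = 3; // compile-time -DDEPTH in the kernel
        const int BATCH = 2; // number of batch (v) iterations

        // One NDRange dimension of size DEPTH * BATCH covers every (z, v) pair.
        for (int gid2 = 0; gid2 < DEPTH * BATCH; ++gid2)
        {
            const int z = gid2 % DEPTH;
            const int v = gid2 / DEPTH;
            std::printf("gid2=%d -> z=%d, v=%d\n", gid2, z, v);
        }
        return 0;
    }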
diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp
index 70875a2d40..c8665f8fbd 100644
--- a/src/core/CL/kernels/CLReductionOperationKernel.cpp
+++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp
@@ -204,9 +204,10 @@ void CLReductionOperationKernel::configure(const CLCompileContext &compile_conte
     _kernel = create_kernel(compile_context, "reduction_operation_" + kernel_axis_name, build_opts.options());
 
     // Configure kernel window
-    Window win = calculate_max_window(*input->info(), Steps(vec_size));
-    win.set(Window::DimX,
-            Window::Dimension(win.x().start(), win.x().end() * _input->info()->num_channels(), win.x().step()));
+    TensorShape actual_input_shape = input->info()->tensor_shape();
+    actual_input_shape[0]          = width;
+
+    Window win = calculate_max_window(actual_input_shape, Steps(vec_size));
     ICLKernel::configure_internal(win);
 
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
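[Editorial note: with the window built over the collapsed shape, the run() hunk below can bind the raw buffer and strides once and launch a single NDRange, instead of sliding 2D/3D/4D slices and enqueueing each one. A back-of-envelope illustration with made-up dimensions, not patch code:]

    #include <cstdio>

    int main()
    {
        // Shape [x, y, z, w] = [64, 128, 8, 4], reduced along y.
        const int dims[4] = {64, 128, 8, 4};

        // Old scheme: one enqueue per 2D slice, i.e. per (z, w) pair.
        const int old_launches = dims[2] * dims[3];

        // New scheme: the dimensions above the reduced axis are collapsed into
        // the window, so one 2D NDRange of (x, z*w) covers the whole tensor.
        const int new_launches = 1;

        std::printf("enqueues: %d -> %d\n", old_launches, new_launches); // 32 -> 1
        return 0;
    }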
@@ -272,55 +273,92 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que
             break;
         case 1:
         {
-            // Get first input and output slices
-            Window window_in{window};
-            window_in.set(Window::DimY,
-                          Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1)));
-            Window in_slice  = window_in.first_slice_window_2D();
-            Window out_slice = window.first_slice_window_2D();
-
-            do
-            {
-                unsigned int idx = 0;
-                add_2D_tensor_argument(idx, _input, in_slice);
-                add_2D_tensor_argument(idx, _output, out_slice);
-                enqueue(queue, *this, in_slice);
-            } while (window_in.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
+            bool   has_collapsed = true;
+            Window actual_window = window.collapse_if_possible(window, 2, &has_collapsed);
+            ARM_COMPUTE_ERROR_ON(!has_collapsed);
+
+            actual_window = actual_window.shift_dimensions(1, Window::DimY);
+
+            const ITensorInfo *input_info    = _input->info();
+            const Strides     &input_strides = input_info->strides_in_bytes();
+
+            const ITensorInfo *output_info    = _output->info();
+            const Strides     &output_strides = output_info->strides_in_bytes();
+
+            unsigned int idx = 0;
+
+            _kernel.setArg(idx++, _input->cl_buffer());
+            _kernel.setArg(idx++, input_strides[1]);
+            _kernel.setArg(idx++, input_strides[2]);
+            _kernel.setArg(idx++, input_info->offset_first_element_in_bytes());
+
+            _kernel.setArg(idx++, _output->cl_buffer());
+            _kernel.setArg(idx++, output_strides[2]);
+            _kernel.setArg(idx++, output_info->offset_first_element_in_bytes());
+
+            enqueue(queue, *this, actual_window);
         }
         break;
         case 2:
         {
-            // Get first input and output slices
-            Window window_in{window};
-            window_in.set(Window::DimZ,
-                          Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2)));
-            Window in_slice  = window_in.first_slice_window_3D();
-            Window out_slice = window.first_slice_window_3D();
-
-            do
-            {
-                unsigned int idx = 0;
-                add_3D_tensor_argument(idx, _input, in_slice);
-                add_3D_tensor_argument(idx, _output, out_slice);
-                enqueue(queue, *this, in_slice);
-            } while (window_in.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(out_slice));
+            bool   has_collapsed = true;
+            Window actual_window = window.collapse_if_possible(window, 3, &has_collapsed);
+            ARM_COMPUTE_ERROR_ON(!has_collapsed);
+
+            actual_window = actual_window.shift_dimensions(1, Window::DimZ);
+
+            const ITensorInfo *input_info    = _input->info();
+            const Strides     &input_strides = input_info->strides_in_bytes();
+
+            const ITensorInfo *output_info    = _output->info();
+            const Strides     &output_strides = output_info->strides_in_bytes();
+
+            unsigned int idx = 0;
+
+            _kernel.setArg(idx++, _input->cl_buffer());
+            _kernel.setArg(idx++, input_strides[1]);
+            _kernel.setArg(idx++, input_strides[2]);
+            _kernel.setArg(idx++, input_strides[3]);
+            _kernel.setArg(idx++, input_info->offset_first_element_in_bytes());
+
+            _kernel.setArg(idx++, _output->cl_buffer());
+            _kernel.setArg(idx++, output_strides[1]);
+            _kernel.setArg(idx++, output_strides[3]);
+            _kernel.setArg(idx++, output_info->offset_first_element_in_bytes());
+
+            enqueue(queue, *this, actual_window);
         }
         break;
         case 3:
         {
-            // Get first input and output slices
-            Window window_in{window};
-            window_in.set(3, Window::Dimension(0, 1, 1));
-            Window in_slice  = window_in.first_slice_window_4D();
-            Window out_slice = window.first_slice_window_4D();
+            bool   has_collapsed = true;
+            Window actual_window = window.shift_dimensions(1, Window::DimW);
 
-            do
-            {
-                unsigned int idx = 0;
-                add_4D_tensor_argument(idx, _input, in_slice);
-                add_4D_tensor_argument(idx, _output, out_slice);
-                enqueue(queue, *this, in_slice);
-            } while (window_in.slide_window_slice_4D(in_slice) && window.slide_window_slice_4D(out_slice));
+            actual_window = actual_window.collapse_if_possible(actual_window, 2, &has_collapsed);
+            ARM_COMPUTE_ERROR_ON(!has_collapsed);
+
+            const ITensorInfo *input_info    = _input->info();
+            const Strides     &input_strides = input_info->strides_in_bytes();
+
+            const ITensorInfo *output_info    = _output->info();
+            const Strides     &output_strides = output_info->strides_in_bytes();
+
+            unsigned int idx = 0;
+
+            _kernel.setArg(idx++, _input->cl_buffer());
+            _kernel.setArg(idx++, input_strides[1]);
+            _kernel.setArg(idx++, input_strides[2]);
+            _kernel.setArg(idx++, input_strides[3]);
+            _kernel.setArg(idx++, input_strides[4]);
+            _kernel.setArg(idx++, input_info->offset_first_element_in_bytes());
+
+            _kernel.setArg(idx++, _output->cl_buffer());
+            _kernel.setArg(idx++, output_strides[1]);
+            _kernel.setArg(idx++, output_strides[2]);
+            _kernel.setArg(idx++, output_strides[4]);
+            _kernel.setArg(idx++, output_info->offset_first_element_in_bytes());
+
+            enqueue(queue, *this, actual_window);
        }
        break;
        default:
-- 
cgit v1.2.1