From 7ff47a313d62011923c5307cc52510b4ba41a631 Mon Sep 17 00:00:00 2001 From: Anthony Barbier Date: Tue, 11 Jul 2017 16:54:04 +0100 Subject: COMPMID-443: Use 3D tensors for fill_border_image 2x performance improvement on some GoogLeNet Pooling tests Change-Id: If75336aa6308731a06462a73cd9209d24574509e Reviewed-on: http://mpd-gerrit.cambridge.arm.com/80342 Tested-by: Kaizen Reviewed-by: Steven Niu Reviewed-by: Pablo Tello Reviewed-by: Georgios Pinitas --- src/core/CL/cl_kernels/fill_border.cl | 12 ++++++++---- src/core/CL/cl_kernels/helpers.h | 29 +++++++++++++++++++++++++++++ src/core/CL/kernels/CLFillBorderKernel.cpp | 8 ++++---- 3 files changed, 41 insertions(+), 8 deletions(-) diff --git a/src/core/CL/cl_kernels/fill_border.cl b/src/core/CL/cl_kernels/fill_border.cl index df635869b1..5fbe3ea070 100644 --- a/src/core/CL/cl_kernels/fill_border.cl +++ b/src/core/CL/cl_kernels/fill_border.cl @@ -36,18 +36,20 @@ * @param[in] buf_step_x buf_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] buf_stride_y Stride of the source image in Y dimension (in bytes) * @param[in] buf_step_y buf_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] buf_stride_z Stride between images if batching images (in bytes) + * @param[in] buf_step_z buf_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] buf_offset_first_element_in_bytes The offset of the first element in the source image * @param[in] width Width of the valid region of the image * @param[in] height Height of the valid region of the image * @param[in] start_pos XY coordinate indicating the start point of the valid region */ __kernel void fill_image_borders_replicate( - IMAGE_DECLARATION(buf), + TENSOR3D_DECLARATION(buf), uint width, uint height, int2 start_pos) { - Image buf = CONVERT_TO_IMAGE_STRUCT_NO_STEP(buf); + Image buf = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(buf); // Update pointer to point to the starting point of the valid region buf.ptr += start_pos.y * buf.stride_y + start_pos.x * buf.stride_x; @@ -109,6 +111,8 @@ __kernel void fill_image_borders_replicate( * @param[in] buf_step_x buf_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] buf_stride_y Stride of the source image in Y dimension (in bytes) * @param[in] buf_step_y buf_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] buf_stride_z Stride between images if batching images (in bytes) + * @param[in] buf_step_z buf_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] buf_offset_first_element_in_bytes The offset of the first element in the source image * @param[in] width Width of the valid region of the image * @param[in] height Height of the valid region of the image @@ -116,13 +120,13 @@ __kernel void fill_image_borders_replicate( * @param[in] constant_value Constant value to use to fill the edges */ __kernel void fill_image_borders_constant( - IMAGE_DECLARATION(buf), + TENSOR3D_DECLARATION(buf), uint width, uint height, int2 start_pos, DATA_TYPE constant_value) { - Image buf = CONVERT_TO_IMAGE_STRUCT_NO_STEP(buf); + Image buf = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(buf); // Update pointer to point to the starting point of the valid region buf.ptr += start_pos.y * buf.stride_y + start_pos.x * buf.stride_x; diff --git a/src/core/CL/cl_kernels/helpers.h b/src/core/CL/cl_kernels/helpers.h index cf3cb78e04..29a43f769b 100644 --- a/src/core/CL/cl_kernels/helpers.h +++ b/src/core/CL/cl_kernels/helpers.h @@ -78,6 +78,9 @@ #define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \ update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0) +#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \ + update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z) + #define CONVERT_TO_TENSOR3D_STRUCT(name) \ update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ name##_stride_z, name##_step_z) @@ -157,6 +160,32 @@ Image inline update_image_workitem_ptr(__global uchar *ptr, uint offset_first_el return img; } +/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data. + * + * @param[in] ptr Pointer to the starting postion of the buffer + * @param[in] offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] stride_x Stride of the image in X dimension (in bytes) + * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] stride_y Stride of the image in Y dimension (in bytes) + * @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] stride_z Stride of the image in Z dimension (in bytes) + * @param[in] step_z stride_z * number of elements along Z processed per workitem(in bytes) + * + * @return A 3D tensor object + */ +Image inline update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) +{ + Image img = + { + .ptr = ptr, + .offset_first_element_in_bytes = offset_first_element_in_bytes, + .stride_x = stride_x, + .stride_y = stride_y + }; + img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z; + return img; +} + /** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data. * * @param[in] ptr Pointer to the starting postion of the buffer diff --git a/src/core/CL/kernels/CLFillBorderKernel.cpp b/src/core/CL/kernels/CLFillBorderKernel.cpp index 7683ff9a49..2c751a489c 100644 --- a/src/core/CL/kernels/CLFillBorderKernel.cpp +++ b/src/core/CL/kernels/CLFillBorderKernel.cpp @@ -108,7 +108,7 @@ void CLFillBorderKernel::configure(ICLTensor *tensor, BorderSize border_size, Bo const unsigned int total_valid_width = border_size.left + valid_width + border_size.right; // Set static kernel arguments - unsigned int idx = num_arguments_per_2D_tensor(); //Skip the tensor parameters + unsigned int idx = num_arguments_per_3D_tensor(); //Skip the tensor parameters ICLKernel::add_argument(idx, valid_width); ICLKernel::add_argument(idx, valid_height); ICLKernel::add_argument(idx, valid_region_coords); @@ -163,13 +163,13 @@ void CLFillBorderKernel::run(const Window &window, cl::CommandQueue &queue) ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); - Window slice = window.first_slice_window_2D(); + Window slice = window.first_slice_window_3D(); do { unsigned int idx = 0; - add_2D_tensor_argument(idx, _tensor, slice); + add_3D_tensor_argument(idx, _tensor, slice); enqueue(queue, *this, slice, cl::NullRange); } - while(window.slide_window_slice_2D(slice)); + while(window.slide_window_slice_3D(slice)); } -- cgit v1.2.1