aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAnthony Barbier <anthony.barbier@arm.com>2017-07-11 16:54:04 +0100
committerAnthony Barbier <anthony.barbier@arm.com>2018-09-17 14:15:39 +0100
commit7ff47a313d62011923c5307cc52510b4ba41a631 (patch)
tree1ac596f79202d51bfe2c8217c9a2e9f12b68337e
parentda37e2fa661f85090bd4a3b93c07178df268bd8c (diff)
downloadComputeLibrary-7ff47a313d62011923c5307cc52510b4ba41a631.tar.gz
COMPMID-443: Use 3D tensors for fill_border_image
2x performance improvement on some GoogLeNet Pooling tests Change-Id: If75336aa6308731a06462a73cd9209d24574509e Reviewed-on: http://mpd-gerrit.cambridge.arm.com/80342 Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com> Reviewed-by: Steven Niu <steven.niu@arm.com> Reviewed-by: Pablo Tello <pablo.tello@arm.com> Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
-rw-r--r--src/core/CL/cl_kernels/fill_border.cl12
-rw-r--r--src/core/CL/cl_kernels/helpers.h29
-rw-r--r--src/core/CL/kernels/CLFillBorderKernel.cpp8
3 files changed, 41 insertions, 8 deletions
diff --git a/src/core/CL/cl_kernels/fill_border.cl b/src/core/CL/cl_kernels/fill_border.cl
index df635869b1..5fbe3ea070 100644
--- a/src/core/CL/cl_kernels/fill_border.cl
+++ b/src/core/CL/cl_kernels/fill_border.cl
@@ -36,18 +36,20 @@
* @param[in] buf_step_x buf_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] buf_stride_y Stride of the source image in Y dimension (in bytes)
* @param[in] buf_step_y buf_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] buf_stride_z Stride between images if batching images (in bytes)
+ * @param[in] buf_step_z buf_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] buf_offset_first_element_in_bytes The offset of the first element in the source image
* @param[in] width Width of the valid region of the image
* @param[in] height Height of the valid region of the image
* @param[in] start_pos XY coordinate indicating the start point of the valid region
*/
__kernel void fill_image_borders_replicate(
- IMAGE_DECLARATION(buf),
+ TENSOR3D_DECLARATION(buf),
uint width,
uint height,
int2 start_pos)
{
- Image buf = CONVERT_TO_IMAGE_STRUCT_NO_STEP(buf);
+ Image buf = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(buf);
// Update pointer to point to the starting point of the valid region
buf.ptr += start_pos.y * buf.stride_y + start_pos.x * buf.stride_x;
@@ -109,6 +111,8 @@ __kernel void fill_image_borders_replicate(
* @param[in] buf_step_x buf_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] buf_stride_y Stride of the source image in Y dimension (in bytes)
* @param[in] buf_step_y buf_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] buf_stride_z Stride between images if batching images (in bytes)
+ * @param[in] buf_step_z buf_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] buf_offset_first_element_in_bytes The offset of the first element in the source image
* @param[in] width Width of the valid region of the image
* @param[in] height Height of the valid region of the image
@@ -116,13 +120,13 @@ __kernel void fill_image_borders_replicate(
* @param[in] constant_value Constant value to use to fill the edges
*/
__kernel void fill_image_borders_constant(
- IMAGE_DECLARATION(buf),
+ TENSOR3D_DECLARATION(buf),
uint width,
uint height,
int2 start_pos,
DATA_TYPE constant_value)
{
- Image buf = CONVERT_TO_IMAGE_STRUCT_NO_STEP(buf);
+ Image buf = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(buf);
// Update pointer to point to the starting point of the valid region
buf.ptr += start_pos.y * buf.stride_y + start_pos.x * buf.stride_x;
diff --git a/src/core/CL/cl_kernels/helpers.h b/src/core/CL/cl_kernels/helpers.h
index cf3cb78e04..29a43f769b 100644
--- a/src/core/CL/cl_kernels/helpers.h
+++ b/src/core/CL/cl_kernels/helpers.h
@@ -78,6 +78,9 @@
#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
+ update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+
#define CONVERT_TO_TENSOR3D_STRUCT(name) \
update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
name##_stride_z, name##_step_z)
@@ -157,6 +160,32 @@ Image inline update_image_workitem_ptr(__global uchar *ptr, uint offset_first_el
return img;
}
+/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
+ *
+ * @param[in] ptr Pointer to the starting postion of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x Stride of the image in X dimension (in bytes)
+ * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+Image inline update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+ Image img =
+ {
+ .ptr = ptr,
+ .offset_first_element_in_bytes = offset_first_element_in_bytes,
+ .stride_x = stride_x,
+ .stride_y = stride_y
+ };
+ img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+ return img;
+}
+
/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
*
* @param[in] ptr Pointer to the starting postion of the buffer
diff --git a/src/core/CL/kernels/CLFillBorderKernel.cpp b/src/core/CL/kernels/CLFillBorderKernel.cpp
index 7683ff9a49..2c751a489c 100644
--- a/src/core/CL/kernels/CLFillBorderKernel.cpp
+++ b/src/core/CL/kernels/CLFillBorderKernel.cpp
@@ -108,7 +108,7 @@ void CLFillBorderKernel::configure(ICLTensor *tensor, BorderSize border_size, Bo
const unsigned int total_valid_width = border_size.left + valid_width + border_size.right;
// Set static kernel arguments
- unsigned int idx = num_arguments_per_2D_tensor(); //Skip the tensor parameters
+ unsigned int idx = num_arguments_per_3D_tensor(); //Skip the tensor parameters
ICLKernel::add_argument<cl_uint>(idx, valid_width);
ICLKernel::add_argument<cl_uint>(idx, valid_height);
ICLKernel::add_argument<cl_int2>(idx, valid_region_coords);
@@ -163,13 +163,13 @@ void CLFillBorderKernel::run(const Window &window, cl::CommandQueue &queue)
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
- Window slice = window.first_slice_window_2D();
+ Window slice = window.first_slice_window_3D();
do
{
unsigned int idx = 0;
- add_2D_tensor_argument(idx, _tensor, slice);
+ add_3D_tensor_argument(idx, _tensor, slice);
enqueue(queue, *this, slice, cl::NullRange);
}
- while(window.slide_window_slice_2D(slice));
+ while(window.slide_window_slice_3D(slice));
}