aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorGeorgios Pinitas <georgios.pinitas@arm.com>2018-01-03 12:29:22 +0000
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:42:33 +0000
commit1d08a310b7316f2b731e60ac36dc68989d15b546 (patch)
tree2fe747eb22a5a094bbcef8f7519eef9d4b6172db /src
parent2c350181118ec9eca864432c5bd78a0cfc3ebc32 (diff)
downloadComputeLibrary-1d08a310b7316f2b731e60ac36dc68989d15b546.tar.gz
COMPMID-765: Collapse execution window in CL kernels.
Updated following kernels to collapse their execution window and reduce number of kernel enqueues: -CLArithmeticAddition -CLArithmeticSubtraction -CLPixelWiseMultiplication Change-Id: I13d503515a20fa9be1401ead1e27e9bbc6627975 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/114878 Reviewed-by: Anthony Barbier <anthony.barbier@arm.com> Tested-by: Anthony Barbier <anthony.barbier@arm.com>
Diffstat (limited to 'src')
-rw-r--r--src/core/CL/cl_kernels/arithmetic_op.cl88
-rw-r--r--src/core/CL/kernels/CLArithmeticAdditionKernel.cpp12
-rw-r--r--src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp11
-rw-r--r--src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp5
4 files changed, 66 insertions, 50 deletions
diff --git a/src/core/CL/cl_kernels/arithmetic_op.cl b/src/core/CL/cl_kernels/arithmetic_op.cl
index 03414105e6..12963473c5 100644
--- a/src/core/CL/cl_kernels/arithmetic_op.cl
+++ b/src/core/CL/cl_kernels/arithmetic_op.cl
@@ -35,40 +35,46 @@
#define SUB(x, y) (x) - (y)
#endif /* SATURATE */
-/** This function add two images.
+/** This function adds two tensors.
*
* @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
* e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short
* @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
*
- * @param[in] in1_ptr Pointer to the source image. Supported data types: U8/QS8/QS16/S16/F16/F32
- * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in1_ptr Pointer to the source tensor. Supported data types: U8/QS8/QS16/S16/F16/F32
+ * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] in2_ptr Pointer to the source image. Supported data types: U8/QS8 (only if @p in1_ptr is QS8), QS16 (only if @p in1_ptr is QS16), S16/F16/F32
- * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in1_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] in2_ptr Pointer to the source tensor. Supported data types: U8/QS8 (only if @p in1_ptr is QS8), QS16 (only if @p in1_ptr is QS16), S16/F16/F32
+ * @param[in] in2_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in2_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: U8 (only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F16/F32
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] in2_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] out_ptr Pointer to the destination tensor. Supported data types: U8 (only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F16/F32
+ * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes)
* @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] out_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
__kernel void arithmetic_add(
- IMAGE_DECLARATION(in1),
- IMAGE_DECLARATION(in2),
- IMAGE_DECLARATION(out))
+ TENSOR3D_DECLARATION(in1),
+ TENSOR3D_DECLARATION(in2),
+ TENSOR3D_DECLARATION(out))
{
// Get pixels pointer
- Image in1 = CONVERT_TO_IMAGE_STRUCT(in1);
- Image in2 = CONVERT_TO_IMAGE_STRUCT(in2);
- Image out = CONVERT_TO_IMAGE_STRUCT(out);
+ Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
+ Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
// Load values
VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
@@ -80,40 +86,46 @@ __kernel void arithmetic_add(
vstore16(ADD(in_a, in_b), 0, (__global DATA_TYPE_OUT *)out.ptr);
}
-/** This function subtracts one image from another.
+/** This function subtracts one tensors from another.
*
* @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
* e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=short
* @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
*
- * @param[in] in1_ptr Pointer to the source image. Supported data types: U8, S16
- * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in1_ptr Pointer to the source tensor. Supported data types: U8, S16
+ * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] in2_ptr Pointer to the source image. Supported data types: U8, S16
- * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in1_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] in2_ptr Pointer to the source tensor. Supported data types: U8, S16
+ * @param[in] in2_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in2_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] in2_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] out_ptr Pointer to the destination tensor. Supported data types: U8, S16
+ * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes)
* @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] out_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
__kernel void arithmetic_sub(
- IMAGE_DECLARATION(in1),
- IMAGE_DECLARATION(in2),
- IMAGE_DECLARATION(out))
+ TENSOR3D_DECLARATION(in1),
+ TENSOR3D_DECLARATION(in2),
+ TENSOR3D_DECLARATION(out))
{
// Get pixels pointer
- Image in1 = CONVERT_TO_IMAGE_STRUCT(in1);
- Image in2 = CONVERT_TO_IMAGE_STRUCT(in2);
- Image out = CONVERT_TO_IMAGE_STRUCT(out);
+ Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
+ Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
// Load values
VEC_DATA_TYPE(DATA_TYPE_OUT, 16)
diff --git a/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp b/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp
index 2789573293..75701ee011 100644
--- a/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp
+++ b/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp
@@ -154,14 +154,16 @@ void CLArithmeticAdditionKernel::run(const Window &window, cl::CommandQueue &que
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- Window slice = window.first_slice_window_2D();
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = collapsed.first_slice_window_3D();
+
do
{
unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input1, slice);
- add_2D_tensor_argument(idx, _input2, slice);
- add_2D_tensor_argument(idx, _output, slice);
+ add_3D_tensor_argument(idx, _input1, slice);
+ add_3D_tensor_argument(idx, _input2, slice);
+ add_3D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice);
}
- while(window.slide_window_slice_2D(slice));
+ while(collapsed.slide_window_slice_3D(slice));
}
diff --git a/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp b/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp
index cc2ef1f023..8308aa0767 100644
--- a/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp
+++ b/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp
@@ -146,15 +146,16 @@ void CLArithmeticSubtractionKernel::run(const Window &window, cl::CommandQueue &
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- Window slice = window.first_slice_window_2D();
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = collapsed.first_slice_window_3D();
do
{
unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input1, slice);
- add_2D_tensor_argument(idx, _input2, slice);
- add_2D_tensor_argument(idx, _output, slice);
+ add_3D_tensor_argument(idx, _input1, slice);
+ add_3D_tensor_argument(idx, _input2, slice);
+ add_3D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice);
}
- while(window.slide_window_slice_2D(slice));
+ while(collapsed.slide_window_slice_3D(slice));
}
diff --git a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
index fd5e5d5862..6dba9c0f95 100644
--- a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
+++ b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
@@ -227,7 +227,8 @@ void CLPixelWiseMultiplicationKernel::run(const Window &window, cl::CommandQueue
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- Window slice = window.first_slice_window_3D();
+ Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window slice = collapsed.first_slice_window_3D();
do
{
@@ -237,5 +238,5 @@ void CLPixelWiseMultiplicationKernel::run(const Window &window, cl::CommandQueue
add_3D_tensor_argument(idx, _output, slice);
enqueue(queue, *this, slice);
}
- while(window.slide_window_slice_3D(slice));
+ while(collapsed.slide_window_slice_3D(slice));
}