aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMichalis Spyrou <michalis.spyrou@arm.com>2018-12-10 16:19:20 +0000
committerMichalis Spyrou <michalis.spyrou@arm.com>2018-12-12 10:35:24 +0000
commit1f8db2be160718979d38e3671a135d22e83cc5c2 (patch)
tree6c999d7d3ac61ecbc1e2fbc7acd7ff47e9e1dbf9
parent3b0a2654034714c16f5930d2b24936d8be7b18a6 (diff)
downloadComputeLibrary-1f8db2be160718979d38e3671a135d22e83cc5c2.tar.gz
COMPMID-1786 Dispatch a single OpenCL when running CLScaleKernel with NHWC with batch_size!=1
Change-Id: Ib5ea76c1ba7a7add1f050ca9168091bd30749725 Reviewed-on: https://review.mlplatform.org/377 Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com> Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
-rw-r--r--src/core/CL/cl_kernels/scale.cl36
-rw-r--r--src/core/CL/cl_kernels/scale_quantized.cl23
-rw-r--r--src/core/CL/kernels/CLScaleKernel.cpp19
3 files changed, 42 insertions, 36 deletions
diff --git a/src/core/CL/cl_kernels/scale.cl b/src/core/CL/cl_kernels/scale.cl
index 744f28a918..5ac6443c98 100644
--- a/src/core/CL/cl_kernels/scale.cl
+++ b/src/core/CL/cl_kernels/scale.cl
@@ -134,9 +134,11 @@ __kernel void scale_bilinear_nchw(
vstore4(bilinear_interpolate_with_border(&in, tc, input_width, input_height, BORDER_SIZE), 0, (__global DATA_TYPE *)out.ptr);
}
+#if defined(DEPTH_OUT)
/** Performs scale on an image interpolating with the NEAREAST NEIGHBOUR method. Input and output are single channel F32. (NHWC)
*
* @note Sampling policy to used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
+ * @note Output tensor's depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH=16
*
* @param[in] in_ptr Pointer to the source image. Supported data types: U8/S16/F16/F32.
* @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
@@ -160,28 +162,29 @@ __kernel void scale_bilinear_nchw(
* @param[in] scale_y The scale factor along y dimension
*/
__kernel void scale_nearest_neighbour_nhwc(
- TENSOR3D_DECLARATION(in),
- TENSOR3D_DECLARATION(out),
+ TENSOR4D_DECLARATION(in),
+ TENSOR4D_DECLARATION(out),
const float input_width,
const float input_height,
const float scale_x,
const float scale_y)
{
- Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(in);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT);
const float new_x = (get_global_id(1) + 0.5f) * scale_x;
- const float new_y = (get_global_id(2) + 0.5f) * scale_y;
+ const float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * scale_y;
const float clamped_x = clamp(new_x, 0.0f, input_width - 1);
const float clamped_y = clamp(new_y, 0.0f, input_height - 1);
- *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor3D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y)));
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT)));
}
/** Performs scale on an image interpolating with the BILINEAR method. (NHWC)
*
* @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
* @note If border mode replicate is used, is should be passed as -DBORDER_MODE_REPLICATE
+ * @note Output tensor's depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH=16
*
* @param[in] in_ptr Pointer to the source image. Supported data types: U8/S16/F16/F32.
* @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
@@ -205,22 +208,22 @@ __kernel void scale_nearest_neighbour_nhwc(
* @param[in] scale_y The scale factor along y dimension
*/
__kernel void scale_bilinear_nhwc(
- TENSOR3D_DECLARATION(in),
- TENSOR3D_DECLARATION(out),
+ TENSOR4D_DECLARATION(in),
+ TENSOR4D_DECLARATION(out),
const float input_width,
const float input_height,
const float scale_x,
const float scale_y)
{
- Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(in);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT);
#ifdef SAMPLING_POLICY_TOP_LEFT
const float new_x = get_global_id(1) * scale_x;
- const float new_y = get_global_id(2) * scale_y;
+ const float new_y = (get_global_id(2) % DEPTH_OUT) * scale_y;
#elif SAMPLING_POLICY_CENTER
const float new_x = (get_global_id(1) + 0.5f) * scale_x - 0.5f;
- const float new_y = (get_global_id(2) + 0.5f) * scale_y - 0.5f;
+ const float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * scale_y - 0.5f;
#else /* SAMPLING_POLICY */
#error("Unsupported sampling policy");
#endif /* SAMPLING_POLICY */
@@ -241,10 +244,10 @@ __kernel void scale_bilinear_nhwc(
clamped_x1_ = select(clamped_x1_, 0.0f - BORDER_SIZE, new_xf + 1 < 0.f || new_xf + 1 > input_width - 1 || new_yf < 0.f || new_yf > input_height - 1);
#endif /* BORDER_MODE_REPLICATE */
- float4 ins = (float4)(*((__global DATA_TYPE *)tensor3D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y))),
- *((__global DATA_TYPE *)tensor3D_offset(&in, get_global_id(0), convert_int(clamped_x1_), convert_int(clamped_y))),
- *((__global DATA_TYPE *)tensor3D_offset(&in, get_global_id(0), convert_int(clamped_x_), convert_int(clamped_y1))),
- *((__global DATA_TYPE *)tensor3D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1))));
+ float4 ins = (float4)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
+ *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1_), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
+ *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x_), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))),
+ *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))));
const float a = new_x - new_xf;
const float b = 1.f - a;
@@ -254,3 +257,4 @@ __kernel void scale_bilinear_nhwc(
*((__global DATA_TYPE *)out.ptr) = CONVERT(fr, DATA_TYPE);
}
+#endif /* defined(DEPTH_OUT) */ \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/scale_quantized.cl b/src/core/CL/cl_kernels/scale_quantized.cl
index 3211e7efa1..86dbf608f4 100644
--- a/src/core/CL/cl_kernels/scale_quantized.cl
+++ b/src/core/CL/cl_kernels/scale_quantized.cl
@@ -85,12 +85,14 @@ __kernel void scale_bilinear_quantized_nchw(
vstore4(bilinear_interpolate_with_border_quantized(&in, tc, input_width, input_height, BORDER_SIZE, SCALE, OFFSET), 0, (__global DATA_TYPE *)out.ptr);
}
+#if defined(DEPTH_OUT)
/** Performs scale on an image interpolating with the BILINEAR method. (NHWC)
*
* @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
* @note Scale value for QASYMM8 data type to used is passed as -DSCALE=<VALUE> e.g. -DSCALE=0.5
* @note Offset value for QASYMM8 data type to used is passed as -DOFFSET=<VALUE> e.g. -DOFFSET=1
* @note If border mode replicate is used, is should be passed as -DBORDER_MODE_REPLICATE
+ * @note Output tensor's depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH=16
*
* @param[in] in_ptr Pointer to the source image. Supported data types: QASYMM8.
* @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
@@ -114,22 +116,22 @@ __kernel void scale_bilinear_quantized_nchw(
* @param[in] scale_y The scale factor along y dimension
*/
__kernel void scale_bilinear_quantized_nhwc(
- TENSOR3D_DECLARATION(in),
- TENSOR3D_DECLARATION(out),
+ TENSOR4D_DECLARATION(in),
+ TENSOR4D_DECLARATION(out),
const float input_width,
const float input_height,
const float scale_x,
const float scale_y)
{
- Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(in);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT);
#ifdef SAMPLING_POLICY_TOP_LEFT
const float new_x = get_global_id(1) * scale_x;
- const float new_y = get_global_id(2) * scale_y;
+ const float new_y = (get_global_id(2) % DEPTH_OUT) * scale_y;
#elif SAMPLING_POLICY_CENTER
const float new_x = (get_global_id(1) + 0.5f) * scale_x - 0.5f;
- const float new_y = (get_global_id(2) + 0.5f) * scale_y - 0.5f;
+ const float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * scale_y - 0.5f;
#else /* SAMPLING_POLICY */
#error("Unsupported sampling policy");
#endif /* SAMPLING_POLICY */
@@ -150,10 +152,10 @@ __kernel void scale_bilinear_quantized_nhwc(
clamped_x1_ = select(clamped_x1_, 0.0f - BORDER_SIZE, new_xf + 1 < 0.f || new_xf + 1 > input_width - 1 || new_yf < 0.f || new_yf > input_height - 1);
#endif /* BORDER_MODE_REPLICATE */
- int4 ins = (int4)(*((__global DATA_TYPE *)tensor3D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y))),
- *((__global DATA_TYPE *)tensor3D_offset(&in, get_global_id(0), convert_int(clamped_x1_), convert_int(clamped_y))),
- *((__global DATA_TYPE *)tensor3D_offset(&in, get_global_id(0), convert_int(clamped_x_), convert_int(clamped_y1))),
- *((__global DATA_TYPE *)tensor3D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1))));
+ int4 ins = (int4)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
+ *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1_), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
+ *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x_), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))),
+ *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))));
const float a = new_x - new_xf;
const float b = 1.f - a;
@@ -167,3 +169,4 @@ __kernel void scale_bilinear_quantized_nhwc(
*((__global DATA_TYPE *)out.ptr) = res;
}
+#endif /* defined(DEPTH_OUT) */ \ No newline at end of file
diff --git a/src/core/CL/kernels/CLScaleKernel.cpp b/src/core/CL/kernels/CLScaleKernel.cpp
index ce6c016154..96f4df88c1 100644
--- a/src/core/CL/kernels/CLScaleKernel.cpp
+++ b/src/core/CL/kernels/CLScaleKernel.cpp
@@ -175,6 +175,7 @@ void CLScaleKernel::configure(const ICLTensor *input, ICLTensor *output, Interpo
DataLayout data_layout = input->info()->data_layout();
const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const bool is_nhwc = data_layout == DataLayout::NHWC;
// Compute the ratio between source width/height and destination width/height
const unsigned int input_width = input->info()->dimension(idx_width);
@@ -201,6 +202,7 @@ void CLScaleKernel::configure(const ICLTensor *input, ICLTensor *output, Interpo
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
build_opts.add_option("-DBORDER_SIZE=" + support::cpp11::to_string(border.right));
build_opts.add_option_if(border_mode == BorderMode::REPLICATE, "-DBORDER_MODE_REPLICATE");
+ build_opts.add_option_if(is_nhwc, "-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
build_opts.add_option_if_else(sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER", "-DSAMPLING_POLICY_TOP_LEFT");
if(call_quantized_kernel)
{
@@ -215,7 +217,7 @@ void CLScaleKernel::configure(const ICLTensor *input, ICLTensor *output, Interpo
kernel_name += lower_string(string_from_data_layout(data_layout));
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
- unsigned int idx = data_layout == DataLayout::NHWC ? 2 * num_arguments_per_3D_tensor() : 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
+ unsigned int idx = is_nhwc ? 2 * num_arguments_per_4D_tensor() : 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
// Set static kernel arguments
const float scale_x = static_cast<float>(input_width) / output_width;
@@ -250,16 +252,13 @@ void CLScaleKernel::run(const Window &window, cl::CommandQueue &queue)
}
case DataLayout::NHWC:
{
- Window slice = window.first_slice_window_3D();
+ Window collapsed = window.collapse(ICLKernel::window(), Window::DimZ);
+ Window slice = collapsed.first_slice_window_4D();
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice);
- add_3D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_3D(slice));
+ unsigned int idx = 0;
+ add_4D_tensor_argument(idx, _input, slice);
+ add_4D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice, lws_hint());
break;
}
default: