From 1f8db2be160718979d38e3671a135d22e83cc5c2 Mon Sep 17 00:00:00 2001 From: Michalis Spyrou Date: Mon, 10 Dec 2018 16:19:20 +0000 Subject: COMPMID-1786 Dispatch a single OpenCL when running CLScaleKernel with NHWC with batch_size!=1 Change-Id: Ib5ea76c1ba7a7add1f050ca9168091bd30749725 Reviewed-on: https://review.mlplatform.org/377 Tested-by: Arm Jenkins Reviewed-by: Michele Di Giorgio Reviewed-by: Gian Marco Iodice --- src/core/CL/cl_kernels/scale.cl | 36 +++++++++++++++++-------------- src/core/CL/cl_kernels/scale_quantized.cl | 23 +++++++++++--------- src/core/CL/kernels/CLScaleKernel.cpp | 19 ++++++++-------- 3 files changed, 42 insertions(+), 36 deletions(-) diff --git a/src/core/CL/cl_kernels/scale.cl b/src/core/CL/cl_kernels/scale.cl index 744f28a918..5ac6443c98 100644 --- a/src/core/CL/cl_kernels/scale.cl +++ b/src/core/CL/cl_kernels/scale.cl @@ -134,9 +134,11 @@ __kernel void scale_bilinear_nchw( vstore4(bilinear_interpolate_with_border(&in, tc, input_width, input_height, BORDER_SIZE), 0, (__global DATA_TYPE *)out.ptr); } +#if defined(DEPTH_OUT) /** Performs scale on an image interpolating with the NEAREAST NEIGHBOUR method. Input and output are single channel F32. (NHWC) * * @note Sampling policy to used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT + * @note Output tensor's depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH=16 * * @param[in] in_ptr Pointer to the source image. Supported data types: U8/S16/F16/F32. * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) @@ -160,28 +162,29 @@ __kernel void scale_bilinear_nchw( * @param[in] scale_y The scale factor along y dimension */ __kernel void scale_nearest_neighbour_nhwc( - TENSOR3D_DECLARATION(in), - TENSOR3D_DECLARATION(out), + TENSOR4D_DECLARATION(in), + TENSOR4D_DECLARATION(out), const float input_width, const float input_height, const float scale_x, const float scale_y) { - Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(in); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT); const float new_x = (get_global_id(1) + 0.5f) * scale_x; - const float new_y = (get_global_id(2) + 0.5f) * scale_y; + const float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * scale_y; const float clamped_x = clamp(new_x, 0.0f, input_width - 1); const float clamped_y = clamp(new_y, 0.0f, input_height - 1); - *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor3D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y))); + *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))); } /** Performs scale on an image interpolating with the BILINEAR method. (NHWC) * * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT * @note If border mode replicate is used, is should be passed as -DBORDER_MODE_REPLICATE + * @note Output tensor's depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH=16 * * @param[in] in_ptr Pointer to the source image. Supported data types: U8/S16/F16/F32. * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) @@ -205,22 +208,22 @@ __kernel void scale_nearest_neighbour_nhwc( * @param[in] scale_y The scale factor along y dimension */ __kernel void scale_bilinear_nhwc( - TENSOR3D_DECLARATION(in), - TENSOR3D_DECLARATION(out), + TENSOR4D_DECLARATION(in), + TENSOR4D_DECLARATION(out), const float input_width, const float input_height, const float scale_x, const float scale_y) { - Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(in); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT); #ifdef SAMPLING_POLICY_TOP_LEFT const float new_x = get_global_id(1) * scale_x; - const float new_y = get_global_id(2) * scale_y; + const float new_y = (get_global_id(2) % DEPTH_OUT) * scale_y; #elif SAMPLING_POLICY_CENTER const float new_x = (get_global_id(1) + 0.5f) * scale_x - 0.5f; - const float new_y = (get_global_id(2) + 0.5f) * scale_y - 0.5f; + const float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * scale_y - 0.5f; #else /* SAMPLING_POLICY */ #error("Unsupported sampling policy"); #endif /* SAMPLING_POLICY */ @@ -241,10 +244,10 @@ __kernel void scale_bilinear_nhwc( clamped_x1_ = select(clamped_x1_, 0.0f - BORDER_SIZE, new_xf + 1 < 0.f || new_xf + 1 > input_width - 1 || new_yf < 0.f || new_yf > input_height - 1); #endif /* BORDER_MODE_REPLICATE */ - float4 ins = (float4)(*((__global DATA_TYPE *)tensor3D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y))), - *((__global DATA_TYPE *)tensor3D_offset(&in, get_global_id(0), convert_int(clamped_x1_), convert_int(clamped_y))), - *((__global DATA_TYPE *)tensor3D_offset(&in, get_global_id(0), convert_int(clamped_x_), convert_int(clamped_y1))), - *((__global DATA_TYPE *)tensor3D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1)))); + float4 ins = (float4)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))), + *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1_), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))), + *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x_), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))), + *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT)))); const float a = new_x - new_xf; const float b = 1.f - a; @@ -254,3 +257,4 @@ __kernel void scale_bilinear_nhwc( *((__global DATA_TYPE *)out.ptr) = CONVERT(fr, DATA_TYPE); } +#endif /* defined(DEPTH_OUT) */ \ No newline at end of file diff --git a/src/core/CL/cl_kernels/scale_quantized.cl b/src/core/CL/cl_kernels/scale_quantized.cl index 3211e7efa1..86dbf608f4 100644 --- a/src/core/CL/cl_kernels/scale_quantized.cl +++ b/src/core/CL/cl_kernels/scale_quantized.cl @@ -85,12 +85,14 @@ __kernel void scale_bilinear_quantized_nchw( vstore4(bilinear_interpolate_with_border_quantized(&in, tc, input_width, input_height, BORDER_SIZE, SCALE, OFFSET), 0, (__global DATA_TYPE *)out.ptr); } +#if defined(DEPTH_OUT) /** Performs scale on an image interpolating with the BILINEAR method. (NHWC) * * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT * @note Scale value for QASYMM8 data type to used is passed as -DSCALE= e.g. -DSCALE=0.5 * @note Offset value for QASYMM8 data type to used is passed as -DOFFSET= e.g. -DOFFSET=1 * @note If border mode replicate is used, is should be passed as -DBORDER_MODE_REPLICATE + * @note Output tensor's depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH=16 * * @param[in] in_ptr Pointer to the source image. Supported data types: QASYMM8. * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) @@ -114,22 +116,22 @@ __kernel void scale_bilinear_quantized_nchw( * @param[in] scale_y The scale factor along y dimension */ __kernel void scale_bilinear_quantized_nhwc( - TENSOR3D_DECLARATION(in), - TENSOR3D_DECLARATION(out), + TENSOR4D_DECLARATION(in), + TENSOR4D_DECLARATION(out), const float input_width, const float input_height, const float scale_x, const float scale_y) { - Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(in); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT); #ifdef SAMPLING_POLICY_TOP_LEFT const float new_x = get_global_id(1) * scale_x; - const float new_y = get_global_id(2) * scale_y; + const float new_y = (get_global_id(2) % DEPTH_OUT) * scale_y; #elif SAMPLING_POLICY_CENTER const float new_x = (get_global_id(1) + 0.5f) * scale_x - 0.5f; - const float new_y = (get_global_id(2) + 0.5f) * scale_y - 0.5f; + const float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * scale_y - 0.5f; #else /* SAMPLING_POLICY */ #error("Unsupported sampling policy"); #endif /* SAMPLING_POLICY */ @@ -150,10 +152,10 @@ __kernel void scale_bilinear_quantized_nhwc( clamped_x1_ = select(clamped_x1_, 0.0f - BORDER_SIZE, new_xf + 1 < 0.f || new_xf + 1 > input_width - 1 || new_yf < 0.f || new_yf > input_height - 1); #endif /* BORDER_MODE_REPLICATE */ - int4 ins = (int4)(*((__global DATA_TYPE *)tensor3D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y))), - *((__global DATA_TYPE *)tensor3D_offset(&in, get_global_id(0), convert_int(clamped_x1_), convert_int(clamped_y))), - *((__global DATA_TYPE *)tensor3D_offset(&in, get_global_id(0), convert_int(clamped_x_), convert_int(clamped_y1))), - *((__global DATA_TYPE *)tensor3D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1)))); + int4 ins = (int4)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))), + *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1_), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))), + *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x_), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))), + *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT)))); const float a = new_x - new_xf; const float b = 1.f - a; @@ -167,3 +169,4 @@ __kernel void scale_bilinear_quantized_nhwc( *((__global DATA_TYPE *)out.ptr) = res; } +#endif /* defined(DEPTH_OUT) */ \ No newline at end of file diff --git a/src/core/CL/kernels/CLScaleKernel.cpp b/src/core/CL/kernels/CLScaleKernel.cpp index ce6c016154..96f4df88c1 100644 --- a/src/core/CL/kernels/CLScaleKernel.cpp +++ b/src/core/CL/kernels/CLScaleKernel.cpp @@ -175,6 +175,7 @@ void CLScaleKernel::configure(const ICLTensor *input, ICLTensor *output, Interpo DataLayout data_layout = input->info()->data_layout(); const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const bool is_nhwc = data_layout == DataLayout::NHWC; // Compute the ratio between source width/height and destination width/height const unsigned int input_width = input->info()->dimension(idx_width); @@ -201,6 +202,7 @@ void CLScaleKernel::configure(const ICLTensor *input, ICLTensor *output, Interpo build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); build_opts.add_option("-DBORDER_SIZE=" + support::cpp11::to_string(border.right)); build_opts.add_option_if(border_mode == BorderMode::REPLICATE, "-DBORDER_MODE_REPLICATE"); + build_opts.add_option_if(is_nhwc, "-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); build_opts.add_option_if_else(sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER", "-DSAMPLING_POLICY_TOP_LEFT"); if(call_quantized_kernel) { @@ -215,7 +217,7 @@ void CLScaleKernel::configure(const ICLTensor *input, ICLTensor *output, Interpo kernel_name += lower_string(string_from_data_layout(data_layout)); _kernel = static_cast(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options())); - unsigned int idx = data_layout == DataLayout::NHWC ? 2 * num_arguments_per_3D_tensor() : 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters + unsigned int idx = is_nhwc ? 2 * num_arguments_per_4D_tensor() : 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters // Set static kernel arguments const float scale_x = static_cast(input_width) / output_width; @@ -250,16 +252,13 @@ void CLScaleKernel::run(const Window &window, cl::CommandQueue &queue) } case DataLayout::NHWC: { - Window slice = window.first_slice_window_3D(); + Window collapsed = window.collapse(ICLKernel::window(), Window::DimZ); + Window slice = collapsed.first_slice_window_4D(); - do - { - unsigned int idx = 0; - add_3D_tensor_argument(idx, _input, slice); - add_3D_tensor_argument(idx, _output, slice); - enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice)); + unsigned int idx = 0; + add_4D_tensor_argument(idx, _input, slice); + add_4D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice, lws_hint()); break; } default: -- cgit v1.2.1