From 0c58f99495844c6ace629116451dae00e8c27418 Mon Sep 17 00:00:00 2001 From: Manuel Bottini Date: Thu, 3 Dec 2020 16:26:35 +0000 Subject: Remove OpenCL padding CLScaleKernel Resolves COMPMID-3918 Change-Id: I970b1eaf2ae6f2f5a8cfc318cd1a3dfd3ba36fdb Signed-off-by: Manuel Bottini Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4668 Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins Reviewed-by: Giorgio Arena --- src/core/CL/cl_kernels/scale.cl | 51 +++++++++++++++++++------------ src/core/CL/cl_kernels/scale_quantized.cl | 49 ++++++++++++++++++----------- src/core/CL/kernels/CLScaleKernel.cpp | 22 ++++++------- 3 files changed, 72 insertions(+), 50 deletions(-) (limited to 'src/core') diff --git a/src/core/CL/cl_kernels/scale.cl b/src/core/CL/cl_kernels/scale.cl index a01ff89a4f..d4c27e6cf6 100644 --- a/src/core/CL/cl_kernels/scale.cl +++ b/src/core/CL/cl_kernels/scale.cl @@ -189,8 +189,8 @@ __kernel void scale_nearest_neighbour_nhwc( float new_x = get_global_id(1) * scale_x; float new_y = (get_global_id(2) % DEPTH_OUT) * scale_y; #elif SAMPLING_POLICY_CENTER - float new_x = (get_global_id(1) + 0.5f) * scale_x; - float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * scale_y; + float new_x = (get_global_id(1) + 0.5f) * scale_x; + float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * scale_y; #else /* SAMPLING_POLICY */ #error("Unsupported sampling policy"); #endif /* SAMPLING_POLICY */ @@ -209,6 +209,7 @@ __kernel void scale_nearest_neighbour_nhwc( * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT * @note If border mode replicate is used, is should be passed as -DBORDER_MODE_REPLICATE * @note Output tensor's depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH=16 + * @note The value to be used at the edges of the images shoud be given as a preprocessor argument using -DCONSTANT_VALUE=value. * * @param[in] in_ptr Pointer to the source image. Supported data types: U8/S16/F16/F32. * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) @@ -230,6 +231,7 @@ __kernel void scale_nearest_neighbour_nhwc( * @param[in] input_height Input image height * @param[in] scale_x The scale factor along x dimension * @param[in] scale_y The scale factor along y dimension + * */ __kernel void scale_bilinear_nhwc( TENSOR4D_DECLARATION(in), @@ -252,27 +254,38 @@ __kernel void scale_bilinear_nhwc( #error("Unsupported sampling policy"); #endif /* SAMPLING_POLICY */ - const float new_xf = floor(new_x); - const float new_yf = floor(new_y); - float clamped_x = clamp(new_xf, 0.0f, input_width - 1); - float clamped_x1 = clamp(new_xf + 1, 0.0f, input_width - 1); - float clamped_x_ = clamped_x; - float clamped_x1_ = clamped_x1; - const float clamped_y = clamp(new_yf, 0.0f, input_height - 1); - const float clamped_y1 = clamp(new_yf + 1, 0.0f, input_height - 1); + const float new_xf = floor(new_x); + const float new_yf = floor(new_y); + const float clamped_x = clamp(new_xf, 0.0f, input_width - 1); + const float clamped_x1 = clamp(new_xf + 1, 0.0f, input_width - 1); + const float clamped_y = clamp(new_yf, 0.0f, input_height - 1); + const float clamped_y1 = clamp(new_yf + 1, 0.0f, input_height - 1); #ifndef BORDER_MODE_REPLICATE - clamped_x1 = select(clamped_x1, 0.0f - BORDER_SIZE, new_yf + 1 < 0.f || new_yf + 1 > input_height - 1 || new_xf + 1 < 0.f || new_xf + 1 > input_width - 1); - clamped_x_ = select(clamped_x_, 0.0f - BORDER_SIZE, new_yf + 1 > input_height - 1 || new_xf < 0.f || new_xf > input_width - 1); - clamped_x = select(clamped_x, 0.0f - BORDER_SIZE, new_yf < 0.f || new_yf > input_height - 1 || new_xf < 0.f || new_xf > input_width - 1); - clamped_x1_ = select(clamped_x1_, 0.0f - BORDER_SIZE, new_xf + 1 < 0.f || new_xf + 1 > input_width - 1 || new_yf < 0.f || new_yf > input_height - 1); + const bool check_x = (0.f <= new_xf && new_xf < input_width); + const bool check_x1 = (-1.f <= new_xf && new_xf < input_width - 1); + const bool check_y = (0.f <= new_yf && new_yf < input_height); + const bool check_y1 = (-1.f <= new_yf && new_yf < input_height - 1); + const float ins_0 = select((float)(CONSTANT_VALUE), (float)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), + (get_global_id(2) / DEPTH_OUT)))), + check_x && check_y); + const float ins_1 = select((float)(CONSTANT_VALUE), (float)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y), + (get_global_id(2) / DEPTH_OUT)))), + check_x1 && check_y); + const float ins_2 = select((float)(CONSTANT_VALUE), (float)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1), + (get_global_id(2) / DEPTH_OUT)))), + check_x && check_y1); + const float ins_3 = select((float)(CONSTANT_VALUE), (float)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), + (get_global_id(2) / DEPTH_OUT)))), + check_x1 && check_y1); + float4 ins = (float4)(ins_0, ins_1, ins_2, ins_3); +#else /* BORDER_MODE_REPLICATE */ + float4 ins = (float4)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))), + *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))), + *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))), + *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT)))); #endif /* BORDER_MODE_REPLICATE */ - float4 ins = (float4)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))), - *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1_), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))), - *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x_), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))), - *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT)))); - const float a = new_x - new_xf; const float b = 1.f - a; const float a1 = new_y - new_yf; diff --git a/src/core/CL/cl_kernels/scale_quantized.cl b/src/core/CL/cl_kernels/scale_quantized.cl index 2aa7f185c6..010e4ed57a 100644 --- a/src/core/CL/cl_kernels/scale_quantized.cl +++ b/src/core/CL/cl_kernels/scale_quantized.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 Arm Limited. + * Copyright (c) 2018-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -93,6 +93,7 @@ __kernel void scale_bilinear_quantized_nchw( * @note Offset value for QASYMM8 data type to used is passed as -DOFFSET= e.g. -DOFFSET=1 * @note If border mode replicate is used, is should be passed as -DBORDER_MODE_REPLICATE * @note Output tensor's depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH=16 + * @note The value to be used at the edges of the images shoud be given as a preprocessor argument using -DCONSTANT_VALUE=value. * * @param[in] in_ptr Pointer to the source image. Supported data types: QASYMM8. * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) @@ -114,6 +115,7 @@ __kernel void scale_bilinear_quantized_nchw( * @param[in] input_height Input image height * @param[in] scale_x The scale factor along x dimension * @param[in] scale_y The scale factor along y dimension + * @param[in] constant_border_value Constant border value to use */ __kernel void scale_bilinear_quantized_nhwc( TENSOR4D_DECLARATION(in), @@ -136,27 +138,38 @@ __kernel void scale_bilinear_quantized_nhwc( #error("Unsupported sampling policy"); #endif /* SAMPLING_POLICY */ - const float new_xf = floor(new_x); - const float new_yf = floor(new_y); - float clamped_x = clamp(new_xf, 0.0f, input_width - 1); - float clamped_x1 = clamp(new_xf + 1, 0.0f, input_width - 1); - float clamped_x_ = clamped_x; - float clamped_x1_ = clamped_x1; - const float clamped_y = clamp(new_yf, 0.0f, input_height - 1); - const float clamped_y1 = clamp(new_yf + 1, 0.0f, input_height - 1); + const float new_xf = floor(new_x); + const float new_yf = floor(new_y); + const float clamped_x = clamp(new_xf, 0.0f, input_width - 1); + const float clamped_x1 = clamp(new_xf + 1, 0.0f, input_width - 1); + const float clamped_y = clamp(new_yf, 0.0f, input_height - 1); + const float clamped_y1 = clamp(new_yf + 1, 0.0f, input_height - 1); #ifndef BORDER_MODE_REPLICATE - clamped_x1 = select(clamped_x1, 0.0f - BORDER_SIZE, new_yf + 1 < 0.f || new_yf + 1 > input_height - 1 || new_xf + 1 < 0.f || new_xf + 1 > input_width - 1); - clamped_x_ = select(clamped_x_, 0.0f - BORDER_SIZE, new_yf + 1 > input_height - 1 || new_xf < 0.f || new_xf > input_width - 1); - clamped_x = select(clamped_x, 0.0f - BORDER_SIZE, new_yf < 0.f || new_yf > input_height - 1 || new_xf < 0.f || new_xf > input_width - 1); - clamped_x1_ = select(clamped_x1_, 0.0f - BORDER_SIZE, new_xf + 1 < 0.f || new_xf + 1 > input_width - 1 || new_yf < 0.f || new_yf > input_height - 1); + const bool check_x = (0.f <= new_xf && new_xf < input_width); + const bool check_x1 = (-1.f <= new_xf && new_xf < input_width - 1); + const bool check_y = (0.f <= new_yf && new_yf < input_height); + const bool check_y1 = (-1.f <= new_yf && new_yf < input_height - 1); + const int ins_0 = select((int)(CONSTANT_VALUE), (int)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), + (get_global_id(2) / DEPTH_OUT)))), + check_x && check_y); + const int ins_1 = select((int)(CONSTANT_VALUE), (int)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y), + (get_global_id(2) / DEPTH_OUT)))), + check_x1 && check_y); + const int ins_2 = select((int)(CONSTANT_VALUE), (int)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1), + (get_global_id(2) / DEPTH_OUT)))), + check_x && check_y1); + const int ins_3 = select((int)(CONSTANT_VALUE), (int)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), + (get_global_id(2) / DEPTH_OUT)))), + check_x1 && check_y1); + int4 ins = (int4)(ins_0, ins_1, ins_2, ins_3); +#else /* BORDER_MODE_REPLICATE */ + int4 ins = (int4)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))), + *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))), + *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))), + *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT)))); #endif /* BORDER_MODE_REPLICATE */ - int4 ins = (int4)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))), - *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1_), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))), - *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x_), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))), - *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT)))); - const float a = new_x - new_xf; const float b = 1.f - a; const float a1 = new_y - new_yf; diff --git a/src/core/CL/kernels/CLScaleKernel.cpp b/src/core/CL/kernels/CLScaleKernel.cpp index 5a7d5830fd..f3d2fa12d5 100644 --- a/src/core/CL/kernels/CLScaleKernel.cpp +++ b/src/core/CL/kernels/CLScaleKernel.cpp @@ -120,15 +120,8 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen break; case DataLayout::NHWC: { - num_elems_processed_per_iteration = 1; // Configure kernel window - win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); - AccessWindowStatic input_access(input, -border.left, -border.top, - input->dimension(0) + border.right, - input->dimension(1) + border.bottom); - AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); - window_changed = update_window_and_padding(win, input_access, output_access); - output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); + win = calculate_max_window(*output, Steps()); } break; default: @@ -142,14 +135,13 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen BorderSize CLScaleKernel::border_size() const { - return BorderSize(1); + return BorderSize(static_cast(_data_layout == DataLayout::NCHW)); } Status CLScaleKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ScaleKernelInfo &info) { - BorderSize border = BorderSize(1); - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, info)); + BorderSize border = BorderSize(static_cast(input->data_layout() == DataLayout::NCHW)); ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), info, border).first); return Status{}; @@ -173,6 +165,7 @@ void CLScaleKernel::configure(const ICLTensor *input, ICLTensor *output, const S void CLScaleKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ScaleKernelInfo &info) { ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), info)); + auto padding_info = get_padding_info({ input, output }); _input = input; _output = output; @@ -208,6 +201,7 @@ void CLScaleKernel::configure(const CLCompileContext &compile_context, const ICL // Create kernel CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(info.constant_border_value, input->info()->data_type())); build_opts.add_option("-DBORDER_SIZE=" + support::cpp11::to_string(border.right)); build_opts.add_option_if(info.border_mode == BorderMode::REPLICATE, "-DBORDER_MODE_REPLICATE"); build_opts.add_option_if(is_nhwc, "-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); @@ -219,7 +213,6 @@ void CLScaleKernel::configure(const CLCompileContext &compile_context, const ICL build_opts.add_option("-DSCALE=" + support::cpp11::to_string(qinfo.scale)); build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(qinfo.offset)); } - std::string interpolation_name = string_from_interpolation_policy(interpolation_policy_to_use); std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower); std::string kernel_name = "scale_" + interpolation_name; @@ -250,13 +243,16 @@ void CLScaleKernel::configure(const CLCompileContext &compile_context, const ICL _config_id += support::cpp11::to_string(output->info()->dimension(2)); _config_id += "_"; _config_id += support::cpp11::to_string(output->info()->dimension(3)); + if(is_nhwc) + { + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); + } } void CLScaleKernel::run(const Window &window, cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - switch(_data_layout) { case DataLayout::NCHW: -- cgit v1.2.1