From 511771fbe0a74e6d9dfd37ba9b4926a8315ec7aa Mon Sep 17 00:00:00 2001 From: Giorgio Arena Date: Thu, 19 Aug 2021 13:04:56 +0100 Subject: Remove padding from ClScaleKernel - Merge quantized kernels with fp for bilinear interpolation (both NCHW and NHWC) - Pass dimensions at compile time rather than at run time - Use tile-based approach to rework the NCHW kernels - Remove unused functions/files Resolve COMPMID-4723 Signed-off-by: Giorgio Arena Change-Id: Ifcdf02beb9daa9f318395751b3c85eb2fe874082 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6138 Tested-by: Arm Jenkins Comments-Addressed: Arm Jenkins Reviewed-by: Michele Di Giorgio --- SConscript | 3 - src/core/CL/cl_kernels/common/elementwise_unary.cl | 1 - src/core/CL/cl_kernels/nchw/scale.cl | 183 +++++++++++++++++---- src/core/CL/cl_kernels/nchw/scale_quantized.cl | 86 ---------- src/core/CL/cl_kernels/nhwc/scale.cl | 109 ++++++------ src/core/CL/cl_kernels/nhwc/scale_quantized.cl | 124 -------------- src/core/CL/cl_kernels/warp_helpers.h | 6 - src/core/CL/cl_kernels/warp_helpers_quantized.h | 136 --------------- src/core/gpu/cl/ClKernelLibrary.cpp | 10 -- src/core/gpu/cl/kernels/ClScaleKernel.cpp | 130 ++++----------- src/core/gpu/cl/kernels/ClScaleKernel.h | 1 - src/runtime/gpu/cl/operators/ClScale.cpp | 11 +- src/runtime/gpu/cl/operators/ClScale.h | 4 - tests/validation/CL/Scale.cpp | 12 +- 14 files changed, 246 insertions(+), 570 deletions(-) delete mode 100644 src/core/CL/cl_kernels/nchw/scale_quantized.cl delete mode 100644 src/core/CL/cl_kernels/nhwc/scale_quantized.cl delete mode 100644 src/core/CL/cl_kernels/warp_helpers_quantized.h diff --git a/SConscript b/SConscript index 2eed75a1a5..c384597752 100644 --- a/SConscript +++ b/SConscript @@ -241,7 +241,6 @@ if env['opencl'] and env['embed_kernels']: 'src/core/CL/cl_kernels/repeat.h', 'src/core/CL/cl_kernels/tile_helpers.h', 'src/core/CL/cl_kernels/types.h', - 'src/core/CL/cl_kernels/warp_helpers_quantized.h', 'src/core/CL/cl_kernels/warp_helpers.h' ] @@ -328,7 +327,6 @@ if env['opencl'] and env['embed_kernels']: 'src/core/CL/cl_kernels/nchw/remap.cl', 'src/core/CL/cl_kernels/nchw/reorg_layer.cl', 'src/core/CL/cl_kernels/nchw/scale.cl', - 'src/core/CL/cl_kernels/nchw/scale_quantized.cl', 'src/core/CL/cl_kernels/nchw/space_to_batch.cl', 'src/core/CL/cl_kernels/nchw/space_to_depth.cl', 'src/core/CL/cl_kernels/nchw/upsample_layer.cl', @@ -355,7 +353,6 @@ if env['opencl'] and env['embed_kernels']: 'src/core/CL/cl_kernels/nhwc/remap.cl', 'src/core/CL/cl_kernels/nhwc/reorg_layer.cl', 'src/core/CL/cl_kernels/nhwc/scale.cl', - 'src/core/CL/cl_kernels/nhwc/scale_quantized.cl', 'src/core/CL/cl_kernels/nhwc/space_to_batch.cl', 'src/core/CL/cl_kernels/nhwc/space_to_depth.cl', 'src/core/CL/cl_kernels/nhwc/upsample_layer.cl', diff --git a/src/core/CL/cl_kernels/common/elementwise_unary.cl b/src/core/CL/cl_kernels/common/elementwise_unary.cl index d2d9d97d33..eba2dbc866 100644 --- a/src/core/CL/cl_kernels/common/elementwise_unary.cl +++ b/src/core/CL/cl_kernels/common/elementwise_unary.cl @@ -22,7 +22,6 @@ * SOFTWARE. */ #include "helpers.h" -#include "warp_helpers.h" #if defined(DATA_TYPE) && defined(OPERATION) diff --git a/src/core/CL/cl_kernels/nchw/scale.cl b/src/core/CL/cl_kernels/nchw/scale.cl index 63a53cc4f2..2b4d6be9fb 100644 --- a/src/core/CL/cl_kernels/nchw/scale.cl +++ b/src/core/CL/cl_kernels/nchw/scale.cl @@ -22,7 +22,7 @@ * SOFTWARE. */ #include "helpers.h" -#include "warp_helpers.h" +#include "tile_helpers.h" /** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates. * @@ -87,28 +87,55 @@ inline const float8 transform_bilinear(const float2 coord, const float2 scale) * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] input_width Input image width - * @param[in] input_height Input image height - * @param[in] scale_x The scale factor along x dimension - * @param[in] scale_y The scale factor along y dimension */ __kernel void scale_nearest_neighbour_nchw( IMAGE_DECLARATION(in), - IMAGE_DECLARATION(out), - const float input_width, - const float input_height, - const float scale_x, - const float scale_y) + IMAGE_DECLARATION(out)) { - Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in); - Image out = CONVERT_TO_IMAGE_STRUCT(out); - const float2 r = (float2)(scale_x, scale_y); - float8 transformed = transform_nearest(get_current_coords(), r); + const int x = get_global_id(0); + const int y = get_global_id(1); + + float8 transformed = transform_nearest((float2)(x * VEC_SIZE, y), (float2)(SCALE_X, SCALE_Y)); #ifdef ALIGN_CORNERS transformed = round(transformed); #endif // ALIGN_CORNERS - const float8 tc = clamp_to_border_with_size(transformed, input_width, input_height, BORDER_SIZE); - vstore4(read_texels4(&in, convert_int8(tc)), 0, (__global DATA_TYPE *)out.ptr); + + TILE(SELECT_DATA_TYPE(DATA_TYPE), 1, 4, cond); + cond[0].v = CONVERT(((transformed.even < 0) || (transformed.even >= (int)SRC_WIDTH)) || ((transformed.odd < 0) || (transformed.odd >= (int)SRC_HEIGHT)), SELECT_VEC_DATA_TYPE(DATA_TYPE, 4)); + + TILE(int, 1, 4, in_x); + TILE(int, 1, 4, in_y); + in_x[0].v = convert_int4(clamp(transformed.even, 0.f, SRC_WIDTH - 1.f)); + in_y[0].v = convert_int4(clamp(transformed.odd, 0.f, SRC_HEIGHT - 1.f)); + + TILE(DATA_TYPE, 1, VEC_SIZE, out_vals); + LOOP_UNROLLING(int, i, 0, 1, VEC_SIZE, + { + out_vals[0].s[i] = select(*((__global DATA_TYPE *)(in_ptr + in_offset_first_element_in_bytes + in_x[0].s[i] * sizeof(DATA_TYPE) + in_y[0].s[i] * in_stride_y)), (DATA_TYPE)CONSTANT_VALUE, cond[0].s[i]); + }) + + __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + x * out_step_x + y * out_stride_y; + + if(x == get_global_size(0) - 1) + { +#if VEC_SIZE == 1 + VSTORE_PARTIAL(VEC_SIZE, VEC_SIZE_LEFTOVER) + (out_vals[0].s[0], 0, (__global DATA_TYPE *)out_addr); +#else // VEC_SIZE == 1 + VSTORE_PARTIAL(VEC_SIZE, VEC_SIZE_LEFTOVER) + (out_vals[0].v, 0, (__global DATA_TYPE *)out_addr); +#endif // VEC_SIZE == 1 + } + else + { +#if VEC_SIZE == 1 + VSTORE(VEC_SIZE) + (out_vals[0].s[0], 0, (__global DATA_TYPE *)out_addr); +#else // VEC_SIZE == 1 + VSTORE(VEC_SIZE) + (out_vals[0].v, 0, (__global DATA_TYPE *)out_addr); +#endif // VEC_SIZE == 1 + } } /** Performs an affine transformation on an image interpolating with the BILINEAR method. @@ -127,22 +154,118 @@ __kernel void scale_nearest_neighbour_nchw( * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] input_width Input image width - * @param[in] input_height Input image height - * @param[in] scale_x The scale factor along x dimension - * @param[in] scale_y The scale factor along y dimension */ __kernel void scale_bilinear_nchw( IMAGE_DECLARATION(in), - IMAGE_DECLARATION(out), - const float input_width, - const float input_height, - const float scale_x, - const float scale_y) + IMAGE_DECLARATION(out)) { - Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in); - Image out = CONVERT_TO_IMAGE_STRUCT(out); - const float2 r = (float2)(scale_x, scale_y); - const float8 tc = transform_bilinear(get_current_coords(), r); - vstore4(bilinear_interpolate_with_border(&in, tc, input_width, input_height, BORDER_SIZE), 0, (__global DATA_TYPE *)out.ptr); + const int x = get_global_id(0); + const int y = get_global_id(1); + + TILE(float, 1, 8, trans_coords); + TILE(float, 1, 8, floor_coords); + TILE(int, 1, 16, in_x); + TILE(int, 1, 16, in_y); + + trans_coords[0].v = transform_bilinear((float2)(x * VEC_SIZE, y), (float2)(SCALE_X, SCALE_Y)); + floor_coords[0].v = floor(trans_coords[0].v); + + LOOP_UNROLLING(int, i, 0, 1, 4, + { + LOOP_UNROLLING(int, j, 0, 1, 4, + { + in_x[0].s[i * 4 + j] = floor_coords[0].s[i * 2 + 0] + (j % 2); + in_y[0].s[i * 4 + j] = floor_coords[0].s[i * 2 + 1] + (j > 1); + }) + }) + +#if defined(BORDER_MODE_CONSTANT) + TILE(SELECT_DATA_TYPE(DATA_TYPE), 1, 16, cond); + cond[0].v = CONVERT(((in_x[0].v < 0) || (in_x[0].v >= (int)SRC_WIDTH)) || ((in_y[0].v < 0) || (in_y[0].v >= (int)SRC_HEIGHT)), SELECT_VEC_DATA_TYPE(DATA_TYPE, 16)); +#endif // defined(BORDER_MODE_CONSTANT) + + in_x[0].v = clamp(in_x[0].v, 0, (int16)((int)SRC_WIDTH - 1)); + in_y[0].v = clamp(in_y[0].v, 0, (int16)((int)SRC_HEIGHT - 1)); + + TILE(DATA_TYPE, 1, 16, in_vals); + + // Loads the values from the input image +#if defined(BORDER_MODE_CONSTANT) + LOOP_UNROLLING(int, i, 0, 1, 16, + { + in_vals[0].s[i] = select(*((__global DATA_TYPE *)(in_ptr + in_offset_first_element_in_bytes + in_x[0].s[i] * sizeof(DATA_TYPE) + in_y[0].s[i] * (int)in_stride_y)), (DATA_TYPE)CONSTANT_VALUE, cond[0].s[i]); + }) +#else // defined(BORDER_MODE_CONSTANT) + LOOP_UNROLLING(int, i, 0, 1, 16, + { + in_vals[0].s[i] = *((__global DATA_TYPE *)(in_ptr + in_offset_first_element_in_bytes + in_x[0].s[i] * sizeof(DATA_TYPE) + in_y[0].s[i] * (int)in_stride_y)); + }) +#endif // defined(BORDER_MODE_CONSTANT) + + TILE(float, 1, 8, a); + TILE(float, 1, 8, b); + + a[0].v = trans_coords[0].v - floor_coords[0].v; + b[0].v = ((float8)(1.f)) - a[0].v; + +#if defined(OFFSET) && defined(SCALE) + TILE(float, 1, 16, in_vals_f32); + TILE(float, 1, 4, out_vals_f32); + + in_vals_f32[0].v = convert_float16(convert_int16(in_vals[0].v) - (int16)OFFSET) * (float16)SCALE; + + // Bilinear interpolation: (in0 * b0 * b1) + (in1 * a0 * b1) + (in2 * b0 * a1) + (in3 * a0 * a1) + // (in4 * b2 * b3) + (in5 * a2 * b3) + (in6 * b2 * a3) + (in7 * a2 * a3) + // (in8 * b4 * b5) + (in9 * a4 * b5) + (in10 * b4 * a5) + (in11 * a4 * a5) + // (in12 * b6 * b7) + (in13 * a6 * b7) + (in14 * b6 * a7) + (in15 * a6 * a7) + LOOP_UNROLLING(int, i, 0, 1, 4, + { + out_vals_f32[0].s[i] = (in_vals_f32[0].s[i * 4 + 0] * b[0].s[i * 2] * b[0].s[i * 2 + 1]) + (in_vals_f32[0].s[i * 4 + 1] * a[0].s[i * 2] * b[0].s[i * 2 + 1]) + (in_vals_f32[0].s[i * 4 + 2] * b[0].s[i * 2] * a[0].s[i * 2 + 1]) + (in_vals_f32[0].s[i * 4 + 3] * a[0].s[i * 2] * a[0].s[i * 2 + 1]); + }) + + TILE(DATA_TYPE, 1, 4, out_vals_4); + TILE(DATA_TYPE, 1, VEC_SIZE, out_vals); + + out_vals_4[0].v = CONVERT_SAT(convert_int4_sat_rtp(out_vals_f32[0].v / (float)SCALE) + OFFSET, VEC_DATA_TYPE(DATA_TYPE, 4)); + + LOOP_UNROLLING(int, i, 0, 1, VEC_SIZE, + { + out_vals[0].s[i] = out_vals_4[0].s[i]; + }) +#else // defined(OFFSET) && defined(SCALE) + + TILE(DATA_TYPE, 1, VEC_SIZE, out_vals); + + // Bilinear interpolation: (in0 * b0 * b1) + (in1 * a0 * b1) + (in2 * b0 * a1) + (in3 * a0 * a1) + // (in4 * b2 * b3) + (in5 * a2 * b3) + (in6 * b2 * a3) + (in7 * a2 * a3) + // (in8 * b4 * b5) + (in9 * a4 * b5) + (in10 * b4 * a5) + (in11 * a4 * a5) + // (in12 * b6 * b7) + (in13 * a6 * b7) + (in14 * b6 * a7) + (in15 * a6 * a7) + LOOP_UNROLLING(int, i, 0, 1, VEC_SIZE, + { + out_vals[0].s[i] = (in_vals[0].s[i * 4 + 0] * b[0].s[i * 2] * b[0].s[i * 2 + 1]) + (in_vals[0].s[i * 4 + 1] * a[0].s[i * 2] * b[0].s[i * 2 + 1]) + (in_vals[0].s[i * 4 + 2] * b[0].s[i * 2] * a[0].s[i * 2 + 1]) + (in_vals[0].s[i * 4 + 3] * a[0].s[i * 2] * a[0].s[i * 2 + 1]); + }) +#endif // defined(OFFSET) && defined(SCALE) + + __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + x * out_step_x + y * out_stride_y; + + if(x == get_global_size(0) - 1) + { +#if VEC_SIZE == 1 + VSTORE_PARTIAL(VEC_SIZE, VEC_SIZE_LEFTOVER) + (out_vals[0].s[0], 0, (__global DATA_TYPE *)out_addr); +#else // VEC_SIZE == 1 + VSTORE_PARTIAL(VEC_SIZE, VEC_SIZE_LEFTOVER) + (out_vals[0].v, 0, (__global DATA_TYPE *)out_addr); +#endif // VEC_SIZE == 1 + } + else + { +#if VEC_SIZE == 1 + VSTORE(VEC_SIZE) + (out_vals[0].s[0], 0, (__global DATA_TYPE *)out_addr); +#else // VEC_SIZE == 1 + VSTORE(VEC_SIZE) + (out_vals[0].v, 0, (__global DATA_TYPE *)out_addr); +#endif // VEC_SIZE == 1 + } } \ No newline at end of file diff --git a/src/core/CL/cl_kernels/nchw/scale_quantized.cl b/src/core/CL/cl_kernels/nchw/scale_quantized.cl deleted file mode 100644 index 946ad65c14..0000000000 --- a/src/core/CL/cl_kernels/nchw/scale_quantized.cl +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers_asymm.h" -#include "warp_helpers_quantized.h" - -/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates. - * - * @param[in] coord 2D coordinates to transform. - * @param[in] scale input/output scale ratio - * - * @return a float8 containing 4 2D transformed values in the input image. - */ -inline const float8 transform_bilinear_quantized(const float2 coord, const float2 scale) -{ - const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0); -#ifdef SAMPLING_POLICY_TOP_LEFT - const float4 new_x = in_x_coords * (float4)(scale.s0); - const float4 new_y = (float4)(coord.s1 * scale.s1); - return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3); -#elif SAMPLING_POLICY_CENTER - const float4 new_x = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0) - (float4)(0.5f); - const float4 new_y = (float4)((coord.s1 + 0.5f) * scale.s1 - 0.5f); - return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3); -#else /* SAMPLING_POLICY */ -#error("Unsupported sampling policy"); -#endif /* SAMPLING_POLICY */ -} - -/** Performs an affine transformation on an image interpolating with the BILINEAR method. - * - * @note Sampling policy to used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT - * @note Scale value for QASYMM8 data type to used is passed as -DSCALE= e.g. -DSCALE=0.5 - * @note Offset value for QASYMM8 data type to used is passed as -DOFFSET= e.g. -DOFFSET=1 - * - * @param[in] in_ptr Pointer to the source image. Supported data types: QASYMM8. - * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16. (Must be the same as the input) - * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] input_width Input image width - * @param[in] input_height Input image height - * @param[in] scale_x The scale factor along x dimension - * @param[in] scale_y The scale factor along y dimension - */ -__kernel void scale_bilinear_quantized_nchw( - IMAGE_DECLARATION(in), - IMAGE_DECLARATION(out), - const float input_width, - const float input_height, - const float scale_x, - const float scale_y) -{ - Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in); - Image out = CONVERT_TO_IMAGE_STRUCT(out); - const float2 r = (float2)(scale_x, scale_y); - const float8 tc = transform_bilinear_quantized(get_current_coords_quantized(), r); - vstore4(bilinear_interpolate_with_border_quantized(&in, tc, input_width, input_height, BORDER_SIZE, SCALE, OFFSET), 0, (__global DATA_TYPE *)out.ptr); -} \ No newline at end of file diff --git a/src/core/CL/cl_kernels/nhwc/scale.cl b/src/core/CL/cl_kernels/nhwc/scale.cl index 1ea5e73df1..69cbbcd5f3 100644 --- a/src/core/CL/cl_kernels/nhwc/scale.cl +++ b/src/core/CL/cl_kernels/nhwc/scale.cl @@ -22,7 +22,6 @@ * SOFTWARE. */ #include "helpers.h" -#include "warp_helpers.h" #if defined(DEPTH_OUT) /** Performs scale on an image interpolating with the NEAREAST NEIGHBOUR method. Input and output are single channel F32. (NHWC) @@ -46,28 +45,20 @@ * @param[in] out_stride_z Stride of the destination image in Z dimension (in bytes) * @param[in] out_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] input_width Input image width - * @param[in] input_height Input image height - * @param[in] scale_x The scale factor along x dimension - * @param[in] scale_y The scale factor along y dimension */ __kernel void scale_nearest_neighbour_nhwc( TENSOR4D_DECLARATION(in), - TENSOR4D_DECLARATION(out), - const float input_width, - const float input_height, - const float scale_x, - const float scale_y) + TENSOR4D_DECLARATION(out)) { Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0); Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT); #ifdef SAMPLING_POLICY_TOP_LEFT - float new_x = get_global_id(1) * scale_x; - float new_y = (get_global_id(2) % DEPTH_OUT) * scale_y; + float new_x = get_global_id(1) * SCALE_X; + float new_y = (get_global_id(2) % DEPTH_OUT) * SCALE_Y; #elif SAMPLING_POLICY_CENTER - float new_x = (get_global_id(1) + 0.5f) * scale_x; - float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * scale_y; + float new_x = (get_global_id(1) + 0.5f) * SCALE_X; + float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * SCALE_Y; #else /* SAMPLING_POLICY */ #error("Unsupported sampling policy"); #endif /* SAMPLING_POLICY */ @@ -75,8 +66,8 @@ __kernel void scale_nearest_neighbour_nhwc( new_x = round(new_x); new_y = round(new_y); #endif /* ALIGN_CORNERS */ - const float clamped_x = clamp(new_x, 0.0f, input_width - 1); - const float clamped_y = clamp(new_y, 0.0f, input_height - 1); + const float clamped_x = clamp(new_x, 0.0f, (float)SRC_WIDTH - 1); + const float clamped_y = clamp(new_y, 0.0f, (float)SRC_HEIGHT - 1); *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))); } @@ -104,71 +95,81 @@ __kernel void scale_nearest_neighbour_nhwc( * @param[in] out_stride_z Stride of the destination image in Z dimension (in bytes) * @param[in] out_step_z dst_stride_y * number of elements along Z processed per workitem(in bytes) * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] input_width Input image width - * @param[in] input_height Input image height - * @param[in] scale_x The scale factor along x dimension - * @param[in] scale_y The scale factor along y dimension * */ __kernel void scale_bilinear_nhwc( TENSOR4D_DECLARATION(in), - TENSOR4D_DECLARATION(out), - const float input_width, - const float input_height, - const float scale_x, - const float scale_y) + TENSOR4D_DECLARATION(out)) { Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0); Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT); #ifdef SAMPLING_POLICY_TOP_LEFT - const float new_x = get_global_id(1) * scale_x; - const float new_y = (get_global_id(2) % DEPTH_OUT) * scale_y; + const float new_x = get_global_id(1) * SCALE_X; + const float new_y = (get_global_id(2) % DEPTH_OUT) * SCALE_Y; #elif SAMPLING_POLICY_CENTER - const float new_x = (get_global_id(1) + 0.5f) * scale_x - 0.5f; - const float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * scale_y - 0.5f; + const float new_x = (get_global_id(1) + 0.5f) * SCALE_X - 0.5f; + const float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * SCALE_Y - 0.5f; #else /* SAMPLING_POLICY */ #error("Unsupported sampling policy"); #endif /* SAMPLING_POLICY */ const float new_xf = floor(new_x); const float new_yf = floor(new_y); - const float clamped_x = clamp(new_xf, 0.0f, input_width - 1); - const float clamped_x1 = clamp(new_xf + 1, 0.0f, input_width - 1); - const float clamped_y = clamp(new_yf, 0.0f, input_height - 1); - const float clamped_y1 = clamp(new_yf + 1, 0.0f, input_height - 1); + const float clamped_x = clamp(new_xf, 0.0f, SRC_WIDTH - 1.f); + const float clamped_x1 = clamp(new_xf + 1, 0.0f, SRC_WIDTH - 1.f); + const float clamped_y = clamp(new_yf, 0.0f, SRC_HEIGHT - 1.f); + const float clamped_y1 = clamp(new_yf + 1, 0.0f, SRC_HEIGHT - 1.f); + +#if defined(OFFSET) && defined(SCALE) +#define IN_DATA_TYPE int +#else // defined(OFFSET) && defined(SCALE) +#define IN_DATA_TYPE float +#endif // defined(OFFSET) && defined(SCALE) #ifndef BORDER_MODE_REPLICATE - const bool check_x = (0.f <= new_xf && new_xf < input_width); - const bool check_x1 = (-1.f <= new_xf && new_xf < input_width - 1); - const bool check_y = (0.f <= new_yf && new_yf < input_height); - const bool check_y1 = (-1.f <= new_yf && new_yf < input_height - 1); - const float ins_0 = select((float)(CONSTANT_VALUE), (float)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), - (get_global_id(2) / DEPTH_OUT)))), - check_x && check_y); - const float ins_1 = select((float)(CONSTANT_VALUE), (float)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y), - (get_global_id(2) / DEPTH_OUT)))), - check_x1 && check_y); - const float ins_2 = select((float)(CONSTANT_VALUE), (float)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1), - (get_global_id(2) / DEPTH_OUT)))), - check_x && check_y1); - const float ins_3 = select((float)(CONSTANT_VALUE), (float)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), - (get_global_id(2) / DEPTH_OUT)))), - check_x1 && check_y1); - float4 ins = (float4)(ins_0, ins_1, ins_2, ins_3); + const bool check_x = (0.f <= new_xf && new_xf < (float)SRC_WIDTH); + const bool check_x1 = (-1.f <= new_xf && new_xf < SRC_WIDTH - 1.f); + const bool check_y = (0.f <= new_yf && new_yf < (float)SRC_HEIGHT); + const bool check_y1 = (-1.f <= new_yf && new_yf < SRC_HEIGHT - 1.f); + + const IN_DATA_TYPE ins_0 = select((IN_DATA_TYPE)(CONSTANT_VALUE), (IN_DATA_TYPE)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), + (get_global_id(2) / DEPTH_OUT)))), + check_x && check_y); + const IN_DATA_TYPE ins_1 = select((IN_DATA_TYPE)(CONSTANT_VALUE), (IN_DATA_TYPE)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y), + (get_global_id(2) / DEPTH_OUT)))), + check_x1 && check_y); + const IN_DATA_TYPE ins_2 = select((IN_DATA_TYPE)(CONSTANT_VALUE), (IN_DATA_TYPE)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1), + (get_global_id(2) / DEPTH_OUT)))), + check_x && check_y1); + const IN_DATA_TYPE ins_3 = select((IN_DATA_TYPE)(CONSTANT_VALUE), (IN_DATA_TYPE)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), + (get_global_id(2) / DEPTH_OUT)))), + check_x1 && check_y1); + VEC_DATA_TYPE(IN_DATA_TYPE, 4) + ins = (VEC_DATA_TYPE(IN_DATA_TYPE, 4))(ins_0, ins_1, ins_2, ins_3); #else /* BORDER_MODE_REPLICATE */ - float4 ins = (float4)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))), - *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))), - *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))), - *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT)))); + VEC_DATA_TYPE(IN_DATA_TYPE, 4) + ins = (VEC_DATA_TYPE(IN_DATA_TYPE, 4))(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))), + *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))), + *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))), + *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT)))); #endif /* BORDER_MODE_REPLICATE */ const float a = new_x - new_xf; const float b = 1.f - a; const float a1 = new_y - new_yf; const float b1 = 1.f - a1; + +#if defined(OFFSET) && defined(SCALE) + const float4 insf32 = convert_float4(ins - (int4)OFFSET) * (float4)SCALE; + const float fr = ((insf32.s0 * b * b1) + (insf32.s1 * a * b1) + (insf32.s2 * b * a1) + (insf32.s3 * a * a1)); + DATA_TYPE res = CONVERT_SAT(convert_int_sat_rtp(fr / SCALE) + OFFSET, DATA_TYPE); + + *((__global DATA_TYPE *)out.ptr) = res; +#else // defined(OFFSET) && defined(SCALE) const float fr = ((ins.s0 * b * b1) + (ins.s1 * a * b1) + (ins.s2 * b * a1) + (ins.s3 * a * a1)); *((__global DATA_TYPE *)out.ptr) = CONVERT(fr, DATA_TYPE); +#endif // defined(OFFSET) && defined(SCALE) } #endif /* defined(DEPTH_OUT) */ \ No newline at end of file diff --git a/src/core/CL/cl_kernels/nhwc/scale_quantized.cl b/src/core/CL/cl_kernels/nhwc/scale_quantized.cl deleted file mode 100644 index de9bb607b0..0000000000 --- a/src/core/CL/cl_kernels/nhwc/scale_quantized.cl +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers_asymm.h" -#include "warp_helpers_quantized.h" - -#if defined(DEPTH_OUT) -/** Performs scale on an image interpolating with the BILINEAR method. (NHWC) - * - * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT - * @note Scale value for QASYMM8 data type to used is passed as -DSCALE= e.g. -DSCALE=0.5 - * @note Offset value for QASYMM8 data type to used is passed as -DOFFSET= e.g. -DOFFSET=1 - * @note If border mode replicate is used, is should be passed as -DBORDER_MODE_REPLICATE - * @note Output tensor's depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH=16 - * @note The value to be used at the edges of the images shoud be given as a preprocessor argument using -DCONSTANT_VALUE=value. - * - * @param[in] in_ptr Pointer to the source image. Supported data types: QASYMM8. - * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in_stride_z Stride of the source image in Z dimension (in bytes) - * @param[in] in_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p in_ptr - * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] out_stride_z Stride of the destination image in Z dimension (in bytes) - * @param[in] out_step_z dst_stride_y * number of elements along Z processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] input_width Input image width - * @param[in] input_height Input image height - * @param[in] scale_x The scale factor along x dimension - * @param[in] scale_y The scale factor along y dimension - * @param[in] constant_border_value Constant border value to use - */ -__kernel void scale_bilinear_quantized_nhwc( - TENSOR4D_DECLARATION(in), - TENSOR4D_DECLARATION(out), - const float input_width, - const float input_height, - const float scale_x, - const float scale_y) -{ - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT); - -#ifdef SAMPLING_POLICY_TOP_LEFT - const float new_x = get_global_id(1) * scale_x; - const float new_y = (get_global_id(2) % DEPTH_OUT) * scale_y; -#elif SAMPLING_POLICY_CENTER - const float new_x = (get_global_id(1) + 0.5f) * scale_x - 0.5f; - const float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * scale_y - 0.5f; -#else /* SAMPLING_POLICY */ -#error("Unsupported sampling policy"); -#endif /* SAMPLING_POLICY */ - - const float new_xf = floor(new_x); - const float new_yf = floor(new_y); - const float clamped_x = clamp(new_xf, 0.0f, input_width - 1); - const float clamped_x1 = clamp(new_xf + 1, 0.0f, input_width - 1); - const float clamped_y = clamp(new_yf, 0.0f, input_height - 1); - const float clamped_y1 = clamp(new_yf + 1, 0.0f, input_height - 1); - -#ifndef BORDER_MODE_REPLICATE - const bool check_x = (0.f <= new_xf && new_xf < input_width); - const bool check_x1 = (-1.f <= new_xf && new_xf < input_width - 1); - const bool check_y = (0.f <= new_yf && new_yf < input_height); - const bool check_y1 = (-1.f <= new_yf && new_yf < input_height - 1); - const int ins_0 = select((int)(CONSTANT_VALUE), (int)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), - (get_global_id(2) / DEPTH_OUT)))), - check_x && check_y); - const int ins_1 = select((int)(CONSTANT_VALUE), (int)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y), - (get_global_id(2) / DEPTH_OUT)))), - check_x1 && check_y); - const int ins_2 = select((int)(CONSTANT_VALUE), (int)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1), - (get_global_id(2) / DEPTH_OUT)))), - check_x && check_y1); - const int ins_3 = select((int)(CONSTANT_VALUE), (int)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), - (get_global_id(2) / DEPTH_OUT)))), - check_x1 && check_y1); - int4 ins = (int4)(ins_0, ins_1, ins_2, ins_3); -#else /* BORDER_MODE_REPLICATE */ - int4 ins = (int4)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))), - *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))), - *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))), - *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT)))); -#endif /* BORDER_MODE_REPLICATE */ - - const float a = new_x - new_xf; - const float b = 1.f - a; - const float a1 = new_y - new_yf; - const float b1 = 1.f - a1; - const float4 insf32 = convert_float4(ins - (int4)OFFSET) * (float4)SCALE; - - const float fr = ((insf32.s0 * b * b1) + (insf32.s1 * a * b1) + (insf32.s2 * b * a1) + (insf32.s3 * a * a1)); - - DATA_TYPE res = CONVERT_SAT(convert_int_sat_rtp(fr / SCALE) + OFFSET, DATA_TYPE); - - *((__global DATA_TYPE *)out.ptr) = res; -} -#endif /* defined(DEPTH_OUT) */ \ No newline at end of file diff --git a/src/core/CL/cl_kernels/warp_helpers.h b/src/core/CL/cl_kernels/warp_helpers.h index 005861ddfa..642483ab3c 100644 --- a/src/core/CL/cl_kernels/warp_helpers.h +++ b/src/core/CL/cl_kernels/warp_helpers.h @@ -63,12 +63,6 @@ inline const VEC_DATA_TYPE(DATA_TYPE, 4) read_texels4(const Image *in, const int *((__global DATA_TYPE *)offset(in, coords.s6, coords.s7))); } -/** Returns the current thread coordinates. */ -inline const float2 get_current_coords() -{ - return (float2)(get_global_id(0) * 4, get_global_id(1)); -} - /** Given a texel coordinates this function will return the following array of coordinates: * [ P, right neighbour, below neighbour, below right neighbour ] * diff --git a/src/core/CL/cl_kernels/warp_helpers_quantized.h b/src/core/CL/cl_kernels/warp_helpers_quantized.h deleted file mode 100644 index b10890aff0..0000000000 --- a/src/core/CL/cl_kernels/warp_helpers_quantized.h +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers_asymm.h" - -/** Clamps the given coordinates to the borders according to the border size. - * - * @param[in] coords Vector of 2D coordinates to clamp. Even positions are X coords, odd positions are Y coords. - * @param[in] width Width of the image - * @param[in] height Height of the image - * @param[in] border_size Border size of the image - * - */ -inline const float8 clamp_to_border_with_size_quantized(float8 coords, const float width, const float height, const float border_size) -{ - const float4 clamped_x = clamp(coords.even, 0.0f - border_size, width - 1 + border_size); - const float4 clamped_y = clamp(coords.odd, 0.0f - border_size, height - 1 + border_size); - return (float8)(clamped_x.s0, clamped_y.s0, clamped_x.s1, clamped_y.s1, clamped_x.s2, clamped_y.s2, clamped_x.s3, clamped_y.s3); -} - -/** Clamps the given coordinates to the borders. - * - * @param[in] coords Vector of 2D coordinates to clamp. Even positions are X coords, odd positions are Y coords. - * @param[in] width Width of the image - * @param[in] height Height of the image - * - */ -inline const float8 clamp_to_border_quantized(float8 coords, const float width, const float height) -{ - return clamp_to_border_with_size_quantized(coords, width, height, 1); -} - -/** Given a texel coordinates this function will return the following array of coordinates: - * [ P, right neighbour, below neighbour, below right neighbour ] - * - * @note No checks to see if the coordinates are out of the image are done here. - * - * @param[in] coord Input coordinates - * - * @return vector of 8 floats with the coordinates, even positions are x and odd y. - */ -inline const float8 get_neighbour_coords_quantized(const float2 coord) -{ - return (float8)(/*tl*/ coord.s0, coord.s1, /*tr*/ coord.s0 + 1, coord.s1, /*bl*/ coord.s0, coord.s1 + 1, /*br*/ coord.s0 + 1, coord.s1 + 1); -} - -/** Returns the current thread coordinates. */ -inline const float2 get_current_coords_quantized() -{ - return (float2)(get_global_id(0) * 4, get_global_id(1)); -} - -/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values - * - * @param[in] in Pointer to the source image. - * @param[in] coords Vector of four 2D coordinates. Even pos is x and odd y. - * @param[in] width Width of the image - * @param[in] height Height of the image - * @param[in] border_size Border size - * @param[in] scale Scale value - * @param[in] offset_qasymm Offset value - */ -inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_with_border_quantized(const Image *in, const float8 coords, const float width, const float height, const float border_size, - const float scale, const int offset_qasymm) -{ - // If any of the 4 texels is out of the image's boundaries we use the border value (REPLICATE or CONSTANT) for any texel out of the image. - - // Sets the 4x4 coordinates for each of the four input texels - const float8 fc = floor(coords); - const float16 c1 = (float16)( - clamp_to_border_with_size_quantized(get_neighbour_coords_quantized((float2)(fc.s0, fc.s1)), width, height, border_size), - clamp_to_border_with_size_quantized(get_neighbour_coords_quantized((float2)(fc.s2, fc.s3)), width, height, border_size)); - const float16 c2 = (float16)( - clamp_to_border_with_size_quantized(get_neighbour_coords_quantized((float2)(fc.s4, fc.s5)), width, height, border_size), - clamp_to_border_with_size_quantized(get_neighbour_coords_quantized((float2)(fc.s6, fc.s7)), width, height, border_size)); - - // Loads the values from the input image - const int16 t = (int16)( - /* tl, tr, bl, br */ - * ((__global DATA_TYPE *)offset(in, c1.s0, c1.s1)), *((__global DATA_TYPE *)offset(in, c1.s2, c1.s3)), - *((__global DATA_TYPE *)offset(in, c1.s4, c1.s5)), *((__global DATA_TYPE *)offset(in, c1.s6, c1.s7)), - *((__global DATA_TYPE *)offset(in, c1.s8, c1.s9)), *((__global DATA_TYPE *)offset(in, c1.sa, c1.sb)), - *((__global DATA_TYPE *)offset(in, c1.sc, c1.sd)), *((__global DATA_TYPE *)offset(in, c1.se, c1.sf)), - *((__global DATA_TYPE *)offset(in, c2.s0, c2.s1)), *((__global DATA_TYPE *)offset(in, c2.s2, c2.s3)), - *((__global DATA_TYPE *)offset(in, c2.s4, c2.s5)), *((__global DATA_TYPE *)offset(in, c2.s6, c2.s7)), - *((__global DATA_TYPE *)offset(in, c2.s8, c2.s9)), *((__global DATA_TYPE *)offset(in, c2.sa, c2.sb)), - *((__global DATA_TYPE *)offset(in, c2.sc, c2.sd)), *((__global DATA_TYPE *)offset(in, c2.se, c2.sf))); - - const float16 inf32 = convert_float16(t - (int16)offset_qasymm) * (float16)scale; - - const float8 a = coords - fc; - const float8 b = ((float8)(1.f)) - a; - const float4 fr = (float4)( - ((inf32.s0 * b.s0 * b.s1) + (inf32.s1 * a.s0 * b.s1) + (inf32.s2 * b.s0 * a.s1) + (inf32.s3 * a.s0 * a.s1)), - ((inf32.s4 * b.s2 * b.s3) + (inf32.s5 * a.s2 * b.s3) + (inf32.s6 * b.s2 * a.s3) + (inf32.s7 * a.s2 * a.s3)), - ((inf32.s8 * b.s4 * b.s5) + (inf32.s9 * a.s4 * b.s5) + (inf32.sa * b.s4 * a.s5) + (inf32.sb * a.s4 * a.s5)), - ((inf32.sc * b.s6 * b.s7) + (inf32.sd * a.s6 * b.s7) + (inf32.se * b.s6 * a.s7) + (inf32.sf * a.s6 * a.s7))); - - const VEC_DATA_TYPE(DATA_TYPE, 4) res = CONVERT_SAT(convert_int4_sat_rtp(fr / scale) + offset_qasymm, VEC_DATA_TYPE(DATA_TYPE, 4)); - - return res; -} - -/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values - * - * @param[in] in Pointer to the source image. - * @param[in] coords Vector of four 2D coordinates. Even pos is x and odd y. - * @param[in] width Width of the image - * @param[in] height Height of the image - * @param[in] scale Scale value - * @param[in] offset_qasymm Offset value - */ -inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_quantized(const Image *in, const float8 coords, const float width, const float height, const float scale, const int offset_qasymm) -{ - return bilinear_interpolate_with_border_quantized(in, coords, width, height, 1, scale, offset_qasymm); -} diff --git a/src/core/gpu/cl/ClKernelLibrary.cpp b/src/core/gpu/cl/ClKernelLibrary.cpp index 05beb5f63d..4a9ba874b1 100644 --- a/src/core/gpu/cl/ClKernelLibrary.cpp +++ b/src/core/gpu/cl/ClKernelLibrary.cpp @@ -394,7 +394,6 @@ const std::map ClKernelLibrary::_kernel_program_map = { "reorg_layer_nchw", "nchw/reorg_layer.cl" }, { "scale_nearest_neighbour_nchw", "nchw/scale.cl" }, { "scale_bilinear_nchw", "nchw/scale.cl" }, - { "scale_bilinear_quantized_nchw", "nchw/scale_quantized.cl" }, { "space_to_batch_nchw", "nchw/space_to_batch.cl" }, { "space_to_batch_static_nchw", "nchw/space_to_batch.cl" }, { "space_to_depth_nchw", "nchw/space_to_depth.cl" }, @@ -455,7 +454,6 @@ const std::map ClKernelLibrary::_kernel_program_map = { "reorg_layer_nhwc", "nhwc/reorg_layer.cl" }, { "scale_nearest_neighbour_nhwc", "nhwc/scale.cl" }, { "scale_bilinear_nhwc", "nhwc/scale.cl" }, - { "scale_bilinear_quantized_nhwc", "nhwc/scale_quantized.cl" }, { "space_to_batch_nhwc", "nhwc/space_to_batch.cl" }, { "space_to_batch_static_nhwc", "nhwc/space_to_batch.cl" }, { "space_to_depth_nhwc", "nhwc/space_to_depth.cl" }, @@ -825,10 +823,6 @@ const std::map ClKernelLibrary::_program_source_map = { "nchw/scale.cl", #include "./cl_kernels/nchw/scale.clembed" - }, - { - "nchw/scale_quantized.cl", -#include "./cl_kernels/nchw/scale_quantized.clembed" }, { "nchw/space_to_batch.cl", @@ -924,10 +918,6 @@ const std::map ClKernelLibrary::_program_source_map = { "nhwc/scale.cl", #include "./cl_kernels/nhwc/scale.clembed" - }, - { - "nhwc/scale_quantized.cl", -#include "./cl_kernels/nhwc/scale_quantized.clembed" }, { "nhwc/space_to_batch.cl", diff --git a/src/core/gpu/cl/kernels/ClScaleKernel.cpp b/src/core/gpu/cl/kernels/ClScaleKernel.cpp index 57ca331539..ee4ee22aa0 100644 --- a/src/core/gpu/cl/kernels/ClScaleKernel.cpp +++ b/src/core/gpu/cl/kernels/ClScaleKernel.cpp @@ -50,10 +50,10 @@ inline std::pair calculate_scale_factors(const ITensorInfo *src, c const unsigned int dst_width = dst->dimension(idx_width); const unsigned int dst_height = dst->dimension(idx_height); - float wr = arm_compute::scale_utils::calculate_resize_ratio(src_width, dst_width, align_corners); - float hr = arm_compute::scale_utils::calculate_resize_ratio(src_height, dst_height, align_corners); + float scale_x = arm_compute::scale_utils::calculate_resize_ratio(src_width, dst_width, align_corners); + float scale_y = arm_compute::scale_utils::calculate_resize_ratio(src_height, dst_height, align_corners); - return std::make_pair(wr, hr); + return std::make_pair(scale_x, scale_y); } Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info) @@ -65,78 +65,22 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); ARM_COMPUTE_RETURN_ERROR_ON(dst == src); ARM_COMPUTE_RETURN_ERROR_ON(info.align_corners && !arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy)); + ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(src->data_type()) && !is_data_type_quantized_asymmetric(src->data_type())); - float wr = 0.f; - float hr = 0.f; + float scale_x = 0.f; + float scale_y = 0.f; const DataLayout data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout; - std::tie(wr, hr) = calculate_scale_factors(src, dst, data_layout, info.align_corners); + std::tie(scale_x, scale_y) = calculate_scale_factors(src, dst, data_layout, info.align_corners); - ARM_COMPUTE_RETURN_ERROR_ON(info.interpolation_policy == InterpolationPolicy::AREA && (wr > 1.f || hr > 1.f)); + ARM_COMPUTE_RETURN_ERROR_ON(info.interpolation_policy == InterpolationPolicy::AREA && (scale_x > 1.f || scale_y > 1.f)); return Status{}; } - -std::pair validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info, BorderSize &border) -{ - Window win{}; - bool window_changed{}; - unsigned int num_elems_processed_per_iteration = 0; - const DataLayout data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout; - - switch(data_layout) - { - case DataLayout::NCHW: - { - if(info.border_mode == BorderMode::UNDEFINED) - { - border = BorderSize(0); - } - - num_elems_processed_per_iteration = 4; - // Configure kernel window - win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration)); - AccessWindowStatic input_access(src, - -border.left, -border.top, - src->dimension(0) + border.right, - src->dimension(1) + border.bottom); - AccessWindowHorizontal output_access(dst, 0, num_elems_processed_per_iteration); - - output_access.set_valid_region(win, calculate_valid_region_scale(*src, - dst->tensor_shape(), - info.interpolation_policy, - info.sampling_policy, - info.border_mode == BorderMode::UNDEFINED)); - - window_changed = update_window_and_padding(win, input_access, output_access); - } - break; - case DataLayout::NHWC: - { - // Configure kernel window - win = calculate_max_window(*dst, Steps()); - } - break; - default: - ARM_COMPUTE_ERROR("Data layout not supported"); - } - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} } // namespace -BorderSize ClScaleKernel::border_size() const -{ - return BorderSize(static_cast(_data_layout == DataLayout::NCHW)); -} - Status ClScaleKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, info)); - const DataLayout data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout; - BorderSize border = BorderSize(static_cast(data_layout == DataLayout::NCHW)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), info, border).first); - return Status{}; } @@ -153,37 +97,45 @@ void ClScaleKernel::configure(const CLCompileContext &compile_context, ITensorIn // Info required for the static tuning _data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout; - float wr = 0.f; - float hr = 0.f; - std::tie(wr, hr) = calculate_scale_factors(src, dst, _data_layout, info.align_corners); - const bool call_quantized_kernel = is_data_type_quantized_asymmetric(src->data_type()) && info.interpolation_policy == InterpolationPolicy::BILINEAR; - - // Compute actual border size - BorderSize border = border_size(); const bool is_nhwc = _data_layout == DataLayout::NHWC; + float scale_x = 0.f; + float scale_y = 0.f; + std::tie(scale_x, scale_y) = calculate_scale_factors(src, dst, _data_layout, info.align_corners); + const bool is_qasymm_bilinear = is_data_type_quantized_asymmetric(src->data_type()) && info.interpolation_policy == InterpolationPolicy::BILINEAR; + // Area interpolation behaves as Nearest Neighbour in case of up-sampling auto interpolation_policy_to_use = info.interpolation_policy; - if(info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) + if(info.interpolation_policy == InterpolationPolicy::AREA && scale_x <= 1.f && scale_y <= 1.f) { interpolation_policy_to_use = InterpolationPolicy::NEAREST_NEIGHBOR; } - // Configure kernel window - auto win_config = validate_and_configure_window(src, dst, info, border); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - ICLKernel::configure_internal(win_config.second); - // Create kernel + const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); + const unsigned int src_width = src->dimension(idx_width); + const unsigned int src_height = src->dimension(idx_height); + const unsigned int dst_width = dst->dimension(idx_width); + const unsigned int vec_size = adjust_vec_size(is_nhwc ? 1 : 4, dst_width); + const unsigned int vec_size_leftover = (dst_width % vec_size); + CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(info.constant_border_value, src->data_type())); - build_opts.add_option("-DBORDER_SIZE=" + support::cpp11::to_string(border.right)); + build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src_width)); + build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src_height)); + build_opts.add_option("-DSCALE_X=" + float_to_string_with_full_precision(scale_x)); + build_opts.add_option("-DSCALE_Y=" + float_to_string_with_full_precision(scale_y)); + build_opts.add_option_if(info.border_mode == BorderMode::REPLICATE, "-DBORDER_MODE_REPLICATE"); + build_opts.add_option_if(info.border_mode == BorderMode::CONSTANT, "-DBORDER_MODE_CONSTANT"); + build_opts.add_option_if(!is_nhwc, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size)); + build_opts.add_option_if(!is_nhwc, "-DVEC_SIZE_LEFTOVER=" + ((vec_size_leftover == 0) ? support::cpp11::to_string(vec_size) : support::cpp11::to_string(vec_size_leftover))); build_opts.add_option_if(is_nhwc, "-DDEPTH_OUT=" + support::cpp11::to_string(dst->dimension(2))); build_opts.add_option_if_else(info.sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER", "-DSAMPLING_POLICY_TOP_LEFT"); build_opts.add_option_if(info.align_corners, "-DALIGN_CORNERS"); - if(call_quantized_kernel) + if(is_qasymm_bilinear) { const UniformQuantizationInfo qinfo = src->quantization_info().uniform(); build_opts.add_option("-DSCALE=" + support::cpp11::to_string(qinfo.scale)); @@ -191,26 +143,16 @@ void ClScaleKernel::configure(const CLCompileContext &compile_context, ITensorIn } std::string interpolation_name = string_from_interpolation_policy(interpolation_policy_to_use); std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower); - std::string kernel_name = "scale_" + interpolation_name; - kernel_name += call_quantized_kernel ? "_quantized_" : "_"; + std::string kernel_name = "scale_" + interpolation_name + "_"; kernel_name += lower_string(string_from_data_layout(_data_layout)); _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - if(is_nhwc) - { - ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); - } - const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - unsigned int idx = is_nhwc ? 2 * num_arguments_per_4D_tensor() : 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters - const unsigned int src_width = src->dimension(idx_width); - const unsigned int dst_height = src->dimension(idx_height); + // Configure kernel window + Window win = calculate_max_window(*dst, Steps(vec_size)); + ICLKernel::configure_internal(win); - _kernel.setArg(idx++, src_width); - _kernel.setArg(idx++, dst_height); - _kernel.setArg(idx++, wr); - _kernel.setArg(idx++, hr); + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); // Set config_id for enabling LWS tuning _config_id = "scale_"; diff --git a/src/core/gpu/cl/kernels/ClScaleKernel.h b/src/core/gpu/cl/kernels/ClScaleKernel.h index 8333c7d6c0..6674931296 100644 --- a/src/core/gpu/cl/kernels/ClScaleKernel.h +++ b/src/core/gpu/cl/kernels/ClScaleKernel.h @@ -59,7 +59,6 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info); // Inherited methods overridden: - BorderSize border_size() const override; void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: diff --git a/src/runtime/gpu/cl/operators/ClScale.cpp b/src/runtime/gpu/cl/operators/ClScale.cpp index 4730c8a16e..5c8d754c7e 100644 --- a/src/runtime/gpu/cl/operators/ClScale.cpp +++ b/src/runtime/gpu/cl/operators/ClScale.cpp @@ -41,12 +41,7 @@ void ClScale::configure(const CLCompileContext &compile_context, ITensorInfo *sr k->set_target(CLScheduler::get().target()); k->configure(compile_context, src, dst, info); _kernel = std::move(k); - if(!_kernel->border_size().empty()) - { - auto b = std::make_unique(); - b->configure(compile_context, src, _kernel->border_size(), info.border_mode, info.constant_border_value); - _border_handler = std::move(b); - } + // Tune kernel CLScheduler::get().tune_kernel_static(*_kernel); } @@ -59,10 +54,6 @@ Status ClScale::validate(const ITensorInfo *src, const ITensorInfo *dst, const S void ClScale::run(ITensorPack &tensors) { ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); - if(!_kernel->border_size().empty()) - { - CLScheduler::get().enqueue_op(*_border_handler.get(), tensors, false); - } CLScheduler::get().enqueue_op(*_kernel.get(), tensors); } } // namespace opencl diff --git a/src/runtime/gpu/cl/operators/ClScale.h b/src/runtime/gpu/cl/operators/ClScale.h index 905c43a41c..0ff78640f7 100644 --- a/src/runtime/gpu/cl/operators/ClScale.h +++ b/src/runtime/gpu/cl/operators/ClScale.h @@ -34,7 +34,6 @@ namespace opencl { /** Basic function to simulate a scale layer. This function calls the following OpenCL kernels: * - * -# @ref CLFillBorderKernel (executed if padding size is different from zero) * -# @ref kernels::ClScaleKernel */ class ClScale : public IClOperator @@ -61,9 +60,6 @@ public: // Inherited method overridden void run(ITensorPack &tensors) override; - -protected: - std::unique_ptr _border_handler{ nullptr }; }; } // namespace opencl } // namespace arm_compute diff --git a/tests/validation/CL/Scale.cpp b/tests/validation/CL/Scale.cpp index 2b34f1f353..845408a0b2 100644 --- a/tests/validation/CL/Scale.cpp +++ b/tests/validation/CL/Scale.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -186,16 +186,6 @@ TEST_CASE(AlignedCornerNotSupported, framework::DatasetMode::ALL) ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS); } -TEST_CASE(WindowShrink, framework::DatasetMode::ALL) -{ - const auto input = TensorInfo{ TensorShape(37U, 37U, 2U), 1, DataType::F32 }; - const auto output = TensorInfo{ TensorShape(39U, 55U, 2U), 1, DataType::F32 }; - Status result{}; - - result = CLScale::validate(&input.clone()->set_is_resizable(false), &output.clone()->set_is_resizable(false), ScaleKernelInfo{ default_interpolation_policy, default_border_mode }); - ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS); -} - TEST_CASE(IncorrectScaleFactor, framework::DatasetMode::ALL) { const auto input = TensorInfo{ TensorShape(28U, 33U, 2U), 1, DataType::F32 }; -- cgit v1.2.1