aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGiorgio Arena <giorgio.arena@arm.com>2021-08-19 13:04:56 +0100
committerGiorgio Arena <giorgio.arena@arm.com>2021-08-23 13:33:31 +0000
commit511771fbe0a74e6d9dfd37ba9b4926a8315ec7aa (patch)
treee2296560f37935232d482982d025e148f1c6c61e
parent19884630c37ae9de2f65a88ea2cda5630a551bad (diff)
downloadComputeLibrary-511771fbe0a74e6d9dfd37ba9b4926a8315ec7aa.tar.gz
Remove padding from ClScaleKernel
- Merge quantized kernels with fp for bilinear interpolation (both NCHW and NHWC) - Pass dimensions at compile time rather than at run time - Use tile-based approach to rework the NCHW kernels - Remove unused functions/files Resolve COMPMID-4723 Signed-off-by: Giorgio Arena <giorgio.arena@arm.com> Change-Id: Ifcdf02beb9daa9f318395751b3c85eb2fe874082 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6138 Tested-by: Arm Jenkins <bsgcomp@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
-rw-r--r--SConscript3
-rw-r--r--src/core/CL/cl_kernels/common/elementwise_unary.cl1
-rw-r--r--src/core/CL/cl_kernels/nchw/scale.cl183
-rw-r--r--src/core/CL/cl_kernels/nchw/scale_quantized.cl86
-rw-r--r--src/core/CL/cl_kernels/nhwc/scale.cl109
-rw-r--r--src/core/CL/cl_kernels/nhwc/scale_quantized.cl124
-rw-r--r--src/core/CL/cl_kernels/warp_helpers.h6
-rw-r--r--src/core/CL/cl_kernels/warp_helpers_quantized.h136
-rw-r--r--src/core/gpu/cl/ClKernelLibrary.cpp10
-rw-r--r--src/core/gpu/cl/kernels/ClScaleKernel.cpp130
-rw-r--r--src/core/gpu/cl/kernels/ClScaleKernel.h1
-rw-r--r--src/runtime/gpu/cl/operators/ClScale.cpp11
-rw-r--r--src/runtime/gpu/cl/operators/ClScale.h4
-rw-r--r--tests/validation/CL/Scale.cpp12
14 files changed, 246 insertions, 570 deletions
diff --git a/SConscript b/SConscript
index 2eed75a1a5..c384597752 100644
--- a/SConscript
+++ b/SConscript
@@ -241,7 +241,6 @@ if env['opencl'] and env['embed_kernels']:
'src/core/CL/cl_kernels/repeat.h',
'src/core/CL/cl_kernels/tile_helpers.h',
'src/core/CL/cl_kernels/types.h',
- 'src/core/CL/cl_kernels/warp_helpers_quantized.h',
'src/core/CL/cl_kernels/warp_helpers.h'
]
@@ -328,7 +327,6 @@ if env['opencl'] and env['embed_kernels']:
'src/core/CL/cl_kernels/nchw/remap.cl',
'src/core/CL/cl_kernels/nchw/reorg_layer.cl',
'src/core/CL/cl_kernels/nchw/scale.cl',
- 'src/core/CL/cl_kernels/nchw/scale_quantized.cl',
'src/core/CL/cl_kernels/nchw/space_to_batch.cl',
'src/core/CL/cl_kernels/nchw/space_to_depth.cl',
'src/core/CL/cl_kernels/nchw/upsample_layer.cl',
@@ -355,7 +353,6 @@ if env['opencl'] and env['embed_kernels']:
'src/core/CL/cl_kernels/nhwc/remap.cl',
'src/core/CL/cl_kernels/nhwc/reorg_layer.cl',
'src/core/CL/cl_kernels/nhwc/scale.cl',
- 'src/core/CL/cl_kernels/nhwc/scale_quantized.cl',
'src/core/CL/cl_kernels/nhwc/space_to_batch.cl',
'src/core/CL/cl_kernels/nhwc/space_to_depth.cl',
'src/core/CL/cl_kernels/nhwc/upsample_layer.cl',
diff --git a/src/core/CL/cl_kernels/common/elementwise_unary.cl b/src/core/CL/cl_kernels/common/elementwise_unary.cl
index d2d9d97d33..eba2dbc866 100644
--- a/src/core/CL/cl_kernels/common/elementwise_unary.cl
+++ b/src/core/CL/cl_kernels/common/elementwise_unary.cl
@@ -22,7 +22,6 @@
* SOFTWARE.
*/
#include "helpers.h"
-#include "warp_helpers.h"
#if defined(DATA_TYPE) && defined(OPERATION)
diff --git a/src/core/CL/cl_kernels/nchw/scale.cl b/src/core/CL/cl_kernels/nchw/scale.cl
index 63a53cc4f2..2b4d6be9fb 100644
--- a/src/core/CL/cl_kernels/nchw/scale.cl
+++ b/src/core/CL/cl_kernels/nchw/scale.cl
@@ -22,7 +22,7 @@
* SOFTWARE.
*/
#include "helpers.h"
-#include "warp_helpers.h"
+#include "tile_helpers.h"
/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates.
*
@@ -87,28 +87,55 @@ inline const float8 transform_bilinear(const float2 coord, const float2 scale)
* @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
* @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] input_width Input image width
- * @param[in] input_height Input image height
- * @param[in] scale_x The scale factor along x dimension
- * @param[in] scale_y The scale factor along y dimension
*/
__kernel void scale_nearest_neighbour_nchw(
IMAGE_DECLARATION(in),
- IMAGE_DECLARATION(out),
- const float input_width,
- const float input_height,
- const float scale_x,
- const float scale_y)
+ IMAGE_DECLARATION(out))
{
- Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
- Image out = CONVERT_TO_IMAGE_STRUCT(out);
- const float2 r = (float2)(scale_x, scale_y);
- float8 transformed = transform_nearest(get_current_coords(), r);
+ const int x = get_global_id(0);
+ const int y = get_global_id(1);
+
+ float8 transformed = transform_nearest((float2)(x * VEC_SIZE, y), (float2)(SCALE_X, SCALE_Y));
#ifdef ALIGN_CORNERS
transformed = round(transformed);
#endif // ALIGN_CORNERS
- const float8 tc = clamp_to_border_with_size(transformed, input_width, input_height, BORDER_SIZE);
- vstore4(read_texels4(&in, convert_int8(tc)), 0, (__global DATA_TYPE *)out.ptr);
+
+ TILE(SELECT_DATA_TYPE(DATA_TYPE), 1, 4, cond);
+ cond[0].v = CONVERT(((transformed.even < 0) || (transformed.even >= (int)SRC_WIDTH)) || ((transformed.odd < 0) || (transformed.odd >= (int)SRC_HEIGHT)), SELECT_VEC_DATA_TYPE(DATA_TYPE, 4));
+
+ TILE(int, 1, 4, in_x);
+ TILE(int, 1, 4, in_y);
+ in_x[0].v = convert_int4(clamp(transformed.even, 0.f, SRC_WIDTH - 1.f));
+ in_y[0].v = convert_int4(clamp(transformed.odd, 0.f, SRC_HEIGHT - 1.f));
+
+ TILE(DATA_TYPE, 1, VEC_SIZE, out_vals);
+ LOOP_UNROLLING(int, i, 0, 1, VEC_SIZE,
+ {
+ out_vals[0].s[i] = select(*((__global DATA_TYPE *)(in_ptr + in_offset_first_element_in_bytes + in_x[0].s[i] * sizeof(DATA_TYPE) + in_y[0].s[i] * in_stride_y)), (DATA_TYPE)CONSTANT_VALUE, cond[0].s[i]);
+ })
+
+ __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + x * out_step_x + y * out_stride_y;
+
+ if(x == get_global_size(0) - 1)
+ {
+#if VEC_SIZE == 1
+ VSTORE_PARTIAL(VEC_SIZE, VEC_SIZE_LEFTOVER)
+ (out_vals[0].s[0], 0, (__global DATA_TYPE *)out_addr);
+#else // VEC_SIZE == 1
+ VSTORE_PARTIAL(VEC_SIZE, VEC_SIZE_LEFTOVER)
+ (out_vals[0].v, 0, (__global DATA_TYPE *)out_addr);
+#endif // VEC_SIZE == 1
+ }
+ else
+ {
+#if VEC_SIZE == 1
+ VSTORE(VEC_SIZE)
+ (out_vals[0].s[0], 0, (__global DATA_TYPE *)out_addr);
+#else // VEC_SIZE == 1
+ VSTORE(VEC_SIZE)
+ (out_vals[0].v, 0, (__global DATA_TYPE *)out_addr);
+#endif // VEC_SIZE == 1
+ }
}
/** Performs an affine transformation on an image interpolating with the BILINEAR method.
@@ -127,22 +154,118 @@ __kernel void scale_nearest_neighbour_nchw(
* @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
* @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] input_width Input image width
- * @param[in] input_height Input image height
- * @param[in] scale_x The scale factor along x dimension
- * @param[in] scale_y The scale factor along y dimension
*/
__kernel void scale_bilinear_nchw(
IMAGE_DECLARATION(in),
- IMAGE_DECLARATION(out),
- const float input_width,
- const float input_height,
- const float scale_x,
- const float scale_y)
+ IMAGE_DECLARATION(out))
{
- Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
- Image out = CONVERT_TO_IMAGE_STRUCT(out);
- const float2 r = (float2)(scale_x, scale_y);
- const float8 tc = transform_bilinear(get_current_coords(), r);
- vstore4(bilinear_interpolate_with_border(&in, tc, input_width, input_height, BORDER_SIZE), 0, (__global DATA_TYPE *)out.ptr);
+ const int x = get_global_id(0);
+ const int y = get_global_id(1);
+
+ TILE(float, 1, 8, trans_coords);
+ TILE(float, 1, 8, floor_coords);
+ TILE(int, 1, 16, in_x);
+ TILE(int, 1, 16, in_y);
+
+ trans_coords[0].v = transform_bilinear((float2)(x * VEC_SIZE, y), (float2)(SCALE_X, SCALE_Y));
+ floor_coords[0].v = floor(trans_coords[0].v);
+
+ LOOP_UNROLLING(int, i, 0, 1, 4,
+ {
+ LOOP_UNROLLING(int, j, 0, 1, 4,
+ {
+ in_x[0].s[i * 4 + j] = floor_coords[0].s[i * 2 + 0] + (j % 2);
+ in_y[0].s[i * 4 + j] = floor_coords[0].s[i * 2 + 1] + (j > 1);
+ })
+ })
+
+#if defined(BORDER_MODE_CONSTANT)
+ TILE(SELECT_DATA_TYPE(DATA_TYPE), 1, 16, cond);
+ cond[0].v = CONVERT(((in_x[0].v < 0) || (in_x[0].v >= (int)SRC_WIDTH)) || ((in_y[0].v < 0) || (in_y[0].v >= (int)SRC_HEIGHT)), SELECT_VEC_DATA_TYPE(DATA_TYPE, 16));
+#endif // defined(BORDER_MODE_CONSTANT)
+
+ in_x[0].v = clamp(in_x[0].v, 0, (int16)((int)SRC_WIDTH - 1));
+ in_y[0].v = clamp(in_y[0].v, 0, (int16)((int)SRC_HEIGHT - 1));
+
+ TILE(DATA_TYPE, 1, 16, in_vals);
+
+ // Loads the values from the input image
+#if defined(BORDER_MODE_CONSTANT)
+ LOOP_UNROLLING(int, i, 0, 1, 16,
+ {
+ in_vals[0].s[i] = select(*((__global DATA_TYPE *)(in_ptr + in_offset_first_element_in_bytes + in_x[0].s[i] * sizeof(DATA_TYPE) + in_y[0].s[i] * (int)in_stride_y)), (DATA_TYPE)CONSTANT_VALUE, cond[0].s[i]);
+ })
+#else // defined(BORDER_MODE_CONSTANT)
+ LOOP_UNROLLING(int, i, 0, 1, 16,
+ {
+ in_vals[0].s[i] = *((__global DATA_TYPE *)(in_ptr + in_offset_first_element_in_bytes + in_x[0].s[i] * sizeof(DATA_TYPE) + in_y[0].s[i] * (int)in_stride_y));
+ })
+#endif // defined(BORDER_MODE_CONSTANT)
+
+ TILE(float, 1, 8, a);
+ TILE(float, 1, 8, b);
+
+ a[0].v = trans_coords[0].v - floor_coords[0].v;
+ b[0].v = ((float8)(1.f)) - a[0].v;
+
+#if defined(OFFSET) && defined(SCALE)
+ TILE(float, 1, 16, in_vals_f32);
+ TILE(float, 1, 4, out_vals_f32);
+
+ in_vals_f32[0].v = convert_float16(convert_int16(in_vals[0].v) - (int16)OFFSET) * (float16)SCALE;
+
+ // Bilinear interpolation: (in0 * b0 * b1) + (in1 * a0 * b1) + (in2 * b0 * a1) + (in3 * a0 * a1)
+ // (in4 * b2 * b3) + (in5 * a2 * b3) + (in6 * b2 * a3) + (in7 * a2 * a3)
+ // (in8 * b4 * b5) + (in9 * a4 * b5) + (in10 * b4 * a5) + (in11 * a4 * a5)
+ // (in12 * b6 * b7) + (in13 * a6 * b7) + (in14 * b6 * a7) + (in15 * a6 * a7)
+ LOOP_UNROLLING(int, i, 0, 1, 4,
+ {
+ out_vals_f32[0].s[i] = (in_vals_f32[0].s[i * 4 + 0] * b[0].s[i * 2] * b[0].s[i * 2 + 1]) + (in_vals_f32[0].s[i * 4 + 1] * a[0].s[i * 2] * b[0].s[i * 2 + 1]) + (in_vals_f32[0].s[i * 4 + 2] * b[0].s[i * 2] * a[0].s[i * 2 + 1]) + (in_vals_f32[0].s[i * 4 + 3] * a[0].s[i * 2] * a[0].s[i * 2 + 1]);
+ })
+
+ TILE(DATA_TYPE, 1, 4, out_vals_4);
+ TILE(DATA_TYPE, 1, VEC_SIZE, out_vals);
+
+ out_vals_4[0].v = CONVERT_SAT(convert_int4_sat_rtp(out_vals_f32[0].v / (float)SCALE) + OFFSET, VEC_DATA_TYPE(DATA_TYPE, 4));
+
+ LOOP_UNROLLING(int, i, 0, 1, VEC_SIZE,
+ {
+ out_vals[0].s[i] = out_vals_4[0].s[i];
+ })
+#else // defined(OFFSET) && defined(SCALE)
+
+ TILE(DATA_TYPE, 1, VEC_SIZE, out_vals);
+
+ // Bilinear interpolation: (in0 * b0 * b1) + (in1 * a0 * b1) + (in2 * b0 * a1) + (in3 * a0 * a1)
+ // (in4 * b2 * b3) + (in5 * a2 * b3) + (in6 * b2 * a3) + (in7 * a2 * a3)
+ // (in8 * b4 * b5) + (in9 * a4 * b5) + (in10 * b4 * a5) + (in11 * a4 * a5)
+ // (in12 * b6 * b7) + (in13 * a6 * b7) + (in14 * b6 * a7) + (in15 * a6 * a7)
+ LOOP_UNROLLING(int, i, 0, 1, VEC_SIZE,
+ {
+ out_vals[0].s[i] = (in_vals[0].s[i * 4 + 0] * b[0].s[i * 2] * b[0].s[i * 2 + 1]) + (in_vals[0].s[i * 4 + 1] * a[0].s[i * 2] * b[0].s[i * 2 + 1]) + (in_vals[0].s[i * 4 + 2] * b[0].s[i * 2] * a[0].s[i * 2 + 1]) + (in_vals[0].s[i * 4 + 3] * a[0].s[i * 2] * a[0].s[i * 2 + 1]);
+ })
+#endif // defined(OFFSET) && defined(SCALE)
+
+ __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + x * out_step_x + y * out_stride_y;
+
+ if(x == get_global_size(0) - 1)
+ {
+#if VEC_SIZE == 1
+ VSTORE_PARTIAL(VEC_SIZE, VEC_SIZE_LEFTOVER)
+ (out_vals[0].s[0], 0, (__global DATA_TYPE *)out_addr);
+#else // VEC_SIZE == 1
+ VSTORE_PARTIAL(VEC_SIZE, VEC_SIZE_LEFTOVER)
+ (out_vals[0].v, 0, (__global DATA_TYPE *)out_addr);
+#endif // VEC_SIZE == 1
+ }
+ else
+ {
+#if VEC_SIZE == 1
+ VSTORE(VEC_SIZE)
+ (out_vals[0].s[0], 0, (__global DATA_TYPE *)out_addr);
+#else // VEC_SIZE == 1
+ VSTORE(VEC_SIZE)
+ (out_vals[0].v, 0, (__global DATA_TYPE *)out_addr);
+#endif // VEC_SIZE == 1
+ }
} \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/scale_quantized.cl b/src/core/CL/cl_kernels/nchw/scale_quantized.cl
deleted file mode 100644
index 946ad65c14..0000000000
--- a/src/core/CL/cl_kernels/nchw/scale_quantized.cl
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers_asymm.h"
-#include "warp_helpers_quantized.h"
-
-/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates.
- *
- * @param[in] coord 2D coordinates to transform.
- * @param[in] scale input/output scale ratio
- *
- * @return a float8 containing 4 2D transformed values in the input image.
- */
-inline const float8 transform_bilinear_quantized(const float2 coord, const float2 scale)
-{
- const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
-#ifdef SAMPLING_POLICY_TOP_LEFT
- const float4 new_x = in_x_coords * (float4)(scale.s0);
- const float4 new_y = (float4)(coord.s1 * scale.s1);
- return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
-#elif SAMPLING_POLICY_CENTER
- const float4 new_x = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0) - (float4)(0.5f);
- const float4 new_y = (float4)((coord.s1 + 0.5f) * scale.s1 - 0.5f);
- return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
-#else /* SAMPLING_POLICY */
-#error("Unsupported sampling policy");
-#endif /* SAMPLING_POLICY */
-}
-
-/** Performs an affine transformation on an image interpolating with the BILINEAR method.
- *
- * @note Sampling policy to used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
- * @note Scale value for QASYMM8 data type to used is passed as -DSCALE=<VALUE> e.g. -DSCALE=0.5
- * @note Offset value for QASYMM8 data type to used is passed as -DOFFSET=<VALUE> e.g. -DOFFSET=1
- *
- * @param[in] in_ptr Pointer to the source image. Supported data types: QASYMM8.
- * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16. (Must be the same as the input)
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] input_width Input image width
- * @param[in] input_height Input image height
- * @param[in] scale_x The scale factor along x dimension
- * @param[in] scale_y The scale factor along y dimension
- */
-__kernel void scale_bilinear_quantized_nchw(
- IMAGE_DECLARATION(in),
- IMAGE_DECLARATION(out),
- const float input_width,
- const float input_height,
- const float scale_x,
- const float scale_y)
-{
- Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
- Image out = CONVERT_TO_IMAGE_STRUCT(out);
- const float2 r = (float2)(scale_x, scale_y);
- const float8 tc = transform_bilinear_quantized(get_current_coords_quantized(), r);
- vstore4(bilinear_interpolate_with_border_quantized(&in, tc, input_width, input_height, BORDER_SIZE, SCALE, OFFSET), 0, (__global DATA_TYPE *)out.ptr);
-} \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nhwc/scale.cl b/src/core/CL/cl_kernels/nhwc/scale.cl
index 1ea5e73df1..69cbbcd5f3 100644
--- a/src/core/CL/cl_kernels/nhwc/scale.cl
+++ b/src/core/CL/cl_kernels/nhwc/scale.cl
@@ -22,7 +22,6 @@
* SOFTWARE.
*/
#include "helpers.h"
-#include "warp_helpers.h"
#if defined(DEPTH_OUT)
/** Performs scale on an image interpolating with the NEAREAST NEIGHBOUR method. Input and output are single channel F32. (NHWC)
@@ -46,28 +45,20 @@
* @param[in] out_stride_z Stride of the destination image in Z dimension (in bytes)
* @param[in] out_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] input_width Input image width
- * @param[in] input_height Input image height
- * @param[in] scale_x The scale factor along x dimension
- * @param[in] scale_y The scale factor along y dimension
*/
__kernel void scale_nearest_neighbour_nhwc(
TENSOR4D_DECLARATION(in),
- TENSOR4D_DECLARATION(out),
- const float input_width,
- const float input_height,
- const float scale_x,
- const float scale_y)
+ TENSOR4D_DECLARATION(out))
{
Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0);
Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT);
#ifdef SAMPLING_POLICY_TOP_LEFT
- float new_x = get_global_id(1) * scale_x;
- float new_y = (get_global_id(2) % DEPTH_OUT) * scale_y;
+ float new_x = get_global_id(1) * SCALE_X;
+ float new_y = (get_global_id(2) % DEPTH_OUT) * SCALE_Y;
#elif SAMPLING_POLICY_CENTER
- float new_x = (get_global_id(1) + 0.5f) * scale_x;
- float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * scale_y;
+ float new_x = (get_global_id(1) + 0.5f) * SCALE_X;
+ float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * SCALE_Y;
#else /* SAMPLING_POLICY */
#error("Unsupported sampling policy");
#endif /* SAMPLING_POLICY */
@@ -75,8 +66,8 @@ __kernel void scale_nearest_neighbour_nhwc(
new_x = round(new_x);
new_y = round(new_y);
#endif /* ALIGN_CORNERS */
- const float clamped_x = clamp(new_x, 0.0f, input_width - 1);
- const float clamped_y = clamp(new_y, 0.0f, input_height - 1);
+ const float clamped_x = clamp(new_x, 0.0f, (float)SRC_WIDTH - 1);
+ const float clamped_y = clamp(new_y, 0.0f, (float)SRC_HEIGHT - 1);
*((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT)));
}
@@ -104,71 +95,81 @@ __kernel void scale_nearest_neighbour_nhwc(
* @param[in] out_stride_z Stride of the destination image in Z dimension (in bytes)
* @param[in] out_step_z dst_stride_y * number of elements along Z processed per workitem(in bytes)
* @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] input_width Input image width
- * @param[in] input_height Input image height
- * @param[in] scale_x The scale factor along x dimension
- * @param[in] scale_y The scale factor along y dimension
*
*/
__kernel void scale_bilinear_nhwc(
TENSOR4D_DECLARATION(in),
- TENSOR4D_DECLARATION(out),
- const float input_width,
- const float input_height,
- const float scale_x,
- const float scale_y)
+ TENSOR4D_DECLARATION(out))
{
Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0);
Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT);
#ifdef SAMPLING_POLICY_TOP_LEFT
- const float new_x = get_global_id(1) * scale_x;
- const float new_y = (get_global_id(2) % DEPTH_OUT) * scale_y;
+ const float new_x = get_global_id(1) * SCALE_X;
+ const float new_y = (get_global_id(2) % DEPTH_OUT) * SCALE_Y;
#elif SAMPLING_POLICY_CENTER
- const float new_x = (get_global_id(1) + 0.5f) * scale_x - 0.5f;
- const float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * scale_y - 0.5f;
+ const float new_x = (get_global_id(1) + 0.5f) * SCALE_X - 0.5f;
+ const float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * SCALE_Y - 0.5f;
#else /* SAMPLING_POLICY */
#error("Unsupported sampling policy");
#endif /* SAMPLING_POLICY */
const float new_xf = floor(new_x);
const float new_yf = floor(new_y);
- const float clamped_x = clamp(new_xf, 0.0f, input_width - 1);
- const float clamped_x1 = clamp(new_xf + 1, 0.0f, input_width - 1);
- const float clamped_y = clamp(new_yf, 0.0f, input_height - 1);
- const float clamped_y1 = clamp(new_yf + 1, 0.0f, input_height - 1);
+ const float clamped_x = clamp(new_xf, 0.0f, SRC_WIDTH - 1.f);
+ const float clamped_x1 = clamp(new_xf + 1, 0.0f, SRC_WIDTH - 1.f);
+ const float clamped_y = clamp(new_yf, 0.0f, SRC_HEIGHT - 1.f);
+ const float clamped_y1 = clamp(new_yf + 1, 0.0f, SRC_HEIGHT - 1.f);
+
+#if defined(OFFSET) && defined(SCALE)
+#define IN_DATA_TYPE int
+#else // defined(OFFSET) && defined(SCALE)
+#define IN_DATA_TYPE float
+#endif // defined(OFFSET) && defined(SCALE)
#ifndef BORDER_MODE_REPLICATE
- const bool check_x = (0.f <= new_xf && new_xf < input_width);
- const bool check_x1 = (-1.f <= new_xf && new_xf < input_width - 1);
- const bool check_y = (0.f <= new_yf && new_yf < input_height);
- const bool check_y1 = (-1.f <= new_yf && new_yf < input_height - 1);
- const float ins_0 = select((float)(CONSTANT_VALUE), (float)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y),
- (get_global_id(2) / DEPTH_OUT)))),
- check_x && check_y);
- const float ins_1 = select((float)(CONSTANT_VALUE), (float)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y),
- (get_global_id(2) / DEPTH_OUT)))),
- check_x1 && check_y);
- const float ins_2 = select((float)(CONSTANT_VALUE), (float)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1),
- (get_global_id(2) / DEPTH_OUT)))),
- check_x && check_y1);
- const float ins_3 = select((float)(CONSTANT_VALUE), (float)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1),
- (get_global_id(2) / DEPTH_OUT)))),
- check_x1 && check_y1);
- float4 ins = (float4)(ins_0, ins_1, ins_2, ins_3);
+ const bool check_x = (0.f <= new_xf && new_xf < (float)SRC_WIDTH);
+ const bool check_x1 = (-1.f <= new_xf && new_xf < SRC_WIDTH - 1.f);
+ const bool check_y = (0.f <= new_yf && new_yf < (float)SRC_HEIGHT);
+ const bool check_y1 = (-1.f <= new_yf && new_yf < SRC_HEIGHT - 1.f);
+
+ const IN_DATA_TYPE ins_0 = select((IN_DATA_TYPE)(CONSTANT_VALUE), (IN_DATA_TYPE)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y),
+ (get_global_id(2) / DEPTH_OUT)))),
+ check_x && check_y);
+ const IN_DATA_TYPE ins_1 = select((IN_DATA_TYPE)(CONSTANT_VALUE), (IN_DATA_TYPE)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y),
+ (get_global_id(2) / DEPTH_OUT)))),
+ check_x1 && check_y);
+ const IN_DATA_TYPE ins_2 = select((IN_DATA_TYPE)(CONSTANT_VALUE), (IN_DATA_TYPE)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1),
+ (get_global_id(2) / DEPTH_OUT)))),
+ check_x && check_y1);
+ const IN_DATA_TYPE ins_3 = select((IN_DATA_TYPE)(CONSTANT_VALUE), (IN_DATA_TYPE)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1),
+ (get_global_id(2) / DEPTH_OUT)))),
+ check_x1 && check_y1);
+ VEC_DATA_TYPE(IN_DATA_TYPE, 4)
+ ins = (VEC_DATA_TYPE(IN_DATA_TYPE, 4))(ins_0, ins_1, ins_2, ins_3);
#else /* BORDER_MODE_REPLICATE */
- float4 ins = (float4)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
- *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
- *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))),
- *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))));
+ VEC_DATA_TYPE(IN_DATA_TYPE, 4)
+ ins = (VEC_DATA_TYPE(IN_DATA_TYPE, 4))(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
+ *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
+ *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))),
+ *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))));
#endif /* BORDER_MODE_REPLICATE */
const float a = new_x - new_xf;
const float b = 1.f - a;
const float a1 = new_y - new_yf;
const float b1 = 1.f - a1;
+
+#if defined(OFFSET) && defined(SCALE)
+ const float4 insf32 = convert_float4(ins - (int4)OFFSET) * (float4)SCALE;
+ const float fr = ((insf32.s0 * b * b1) + (insf32.s1 * a * b1) + (insf32.s2 * b * a1) + (insf32.s3 * a * a1));
+ DATA_TYPE res = CONVERT_SAT(convert_int_sat_rtp(fr / SCALE) + OFFSET, DATA_TYPE);
+
+ *((__global DATA_TYPE *)out.ptr) = res;
+#else // defined(OFFSET) && defined(SCALE)
const float fr = ((ins.s0 * b * b1) + (ins.s1 * a * b1) + (ins.s2 * b * a1) + (ins.s3 * a * a1));
*((__global DATA_TYPE *)out.ptr) = CONVERT(fr, DATA_TYPE);
+#endif // defined(OFFSET) && defined(SCALE)
}
#endif /* defined(DEPTH_OUT) */ \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nhwc/scale_quantized.cl b/src/core/CL/cl_kernels/nhwc/scale_quantized.cl
deleted file mode 100644
index de9bb607b0..0000000000
--- a/src/core/CL/cl_kernels/nhwc/scale_quantized.cl
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers_asymm.h"
-#include "warp_helpers_quantized.h"
-
-#if defined(DEPTH_OUT)
-/** Performs scale on an image interpolating with the BILINEAR method. (NHWC)
- *
- * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
- * @note Scale value for QASYMM8 data type to used is passed as -DSCALE=<VALUE> e.g. -DSCALE=0.5
- * @note Offset value for QASYMM8 data type to used is passed as -DOFFSET=<VALUE> e.g. -DOFFSET=1
- * @note If border mode replicate is used, is should be passed as -DBORDER_MODE_REPLICATE
- * @note Output tensor's depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH=16
- * @note The value to be used at the edges of the images shoud be given as a preprocessor argument using -DCONSTANT_VALUE=value.
- *
- * @param[in] in_ptr Pointer to the source image. Supported data types: QASYMM8.
- * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in_stride_z Stride of the source image in Z dimension (in bytes)
- * @param[in] in_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p in_ptr
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] out_stride_z Stride of the destination image in Z dimension (in bytes)
- * @param[in] out_step_z dst_stride_y * number of elements along Z processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] input_width Input image width
- * @param[in] input_height Input image height
- * @param[in] scale_x The scale factor along x dimension
- * @param[in] scale_y The scale factor along y dimension
- * @param[in] constant_border_value Constant border value to use
- */
-__kernel void scale_bilinear_quantized_nhwc(
- TENSOR4D_DECLARATION(in),
- TENSOR4D_DECLARATION(out),
- const float input_width,
- const float input_height,
- const float scale_x,
- const float scale_y)
-{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT);
-
-#ifdef SAMPLING_POLICY_TOP_LEFT
- const float new_x = get_global_id(1) * scale_x;
- const float new_y = (get_global_id(2) % DEPTH_OUT) * scale_y;
-#elif SAMPLING_POLICY_CENTER
- const float new_x = (get_global_id(1) + 0.5f) * scale_x - 0.5f;
- const float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * scale_y - 0.5f;
-#else /* SAMPLING_POLICY */
-#error("Unsupported sampling policy");
-#endif /* SAMPLING_POLICY */
-
- const float new_xf = floor(new_x);
- const float new_yf = floor(new_y);
- const float clamped_x = clamp(new_xf, 0.0f, input_width - 1);
- const float clamped_x1 = clamp(new_xf + 1, 0.0f, input_width - 1);
- const float clamped_y = clamp(new_yf, 0.0f, input_height - 1);
- const float clamped_y1 = clamp(new_yf + 1, 0.0f, input_height - 1);
-
-#ifndef BORDER_MODE_REPLICATE
- const bool check_x = (0.f <= new_xf && new_xf < input_width);
- const bool check_x1 = (-1.f <= new_xf && new_xf < input_width - 1);
- const bool check_y = (0.f <= new_yf && new_yf < input_height);
- const bool check_y1 = (-1.f <= new_yf && new_yf < input_height - 1);
- const int ins_0 = select((int)(CONSTANT_VALUE), (int)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y),
- (get_global_id(2) / DEPTH_OUT)))),
- check_x && check_y);
- const int ins_1 = select((int)(CONSTANT_VALUE), (int)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y),
- (get_global_id(2) / DEPTH_OUT)))),
- check_x1 && check_y);
- const int ins_2 = select((int)(CONSTANT_VALUE), (int)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1),
- (get_global_id(2) / DEPTH_OUT)))),
- check_x && check_y1);
- const int ins_3 = select((int)(CONSTANT_VALUE), (int)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1),
- (get_global_id(2) / DEPTH_OUT)))),
- check_x1 && check_y1);
- int4 ins = (int4)(ins_0, ins_1, ins_2, ins_3);
-#else /* BORDER_MODE_REPLICATE */
- int4 ins = (int4)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
- *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
- *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))),
- *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))));
-#endif /* BORDER_MODE_REPLICATE */
-
- const float a = new_x - new_xf;
- const float b = 1.f - a;
- const float a1 = new_y - new_yf;
- const float b1 = 1.f - a1;
- const float4 insf32 = convert_float4(ins - (int4)OFFSET) * (float4)SCALE;
-
- const float fr = ((insf32.s0 * b * b1) + (insf32.s1 * a * b1) + (insf32.s2 * b * a1) + (insf32.s3 * a * a1));
-
- DATA_TYPE res = CONVERT_SAT(convert_int_sat_rtp(fr / SCALE) + OFFSET, DATA_TYPE);
-
- *((__global DATA_TYPE *)out.ptr) = res;
-}
-#endif /* defined(DEPTH_OUT) */ \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/warp_helpers.h b/src/core/CL/cl_kernels/warp_helpers.h
index 005861ddfa..642483ab3c 100644
--- a/src/core/CL/cl_kernels/warp_helpers.h
+++ b/src/core/CL/cl_kernels/warp_helpers.h
@@ -63,12 +63,6 @@ inline const VEC_DATA_TYPE(DATA_TYPE, 4) read_texels4(const Image *in, const int
*((__global DATA_TYPE *)offset(in, coords.s6, coords.s7)));
}
-/** Returns the current thread coordinates. */
-inline const float2 get_current_coords()
-{
- return (float2)(get_global_id(0) * 4, get_global_id(1));
-}
-
/** Given a texel coordinates this function will return the following array of coordinates:
* [ P, right neighbour, below neighbour, below right neighbour ]
*
diff --git a/src/core/CL/cl_kernels/warp_helpers_quantized.h b/src/core/CL/cl_kernels/warp_helpers_quantized.h
deleted file mode 100644
index b10890aff0..0000000000
--- a/src/core/CL/cl_kernels/warp_helpers_quantized.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers_asymm.h"
-
-/** Clamps the given coordinates to the borders according to the border size.
- *
- * @param[in] coords Vector of 2D coordinates to clamp. Even positions are X coords, odd positions are Y coords.
- * @param[in] width Width of the image
- * @param[in] height Height of the image
- * @param[in] border_size Border size of the image
- *
- */
-inline const float8 clamp_to_border_with_size_quantized(float8 coords, const float width, const float height, const float border_size)
-{
- const float4 clamped_x = clamp(coords.even, 0.0f - border_size, width - 1 + border_size);
- const float4 clamped_y = clamp(coords.odd, 0.0f - border_size, height - 1 + border_size);
- return (float8)(clamped_x.s0, clamped_y.s0, clamped_x.s1, clamped_y.s1, clamped_x.s2, clamped_y.s2, clamped_x.s3, clamped_y.s3);
-}
-
-/** Clamps the given coordinates to the borders.
- *
- * @param[in] coords Vector of 2D coordinates to clamp. Even positions are X coords, odd positions are Y coords.
- * @param[in] width Width of the image
- * @param[in] height Height of the image
- *
- */
-inline const float8 clamp_to_border_quantized(float8 coords, const float width, const float height)
-{
- return clamp_to_border_with_size_quantized(coords, width, height, 1);
-}
-
-/** Given a texel coordinates this function will return the following array of coordinates:
- * [ P, right neighbour, below neighbour, below right neighbour ]
- *
- * @note No checks to see if the coordinates are out of the image are done here.
- *
- * @param[in] coord Input coordinates
- *
- * @return vector of 8 floats with the coordinates, even positions are x and odd y.
- */
-inline const float8 get_neighbour_coords_quantized(const float2 coord)
-{
- return (float8)(/*tl*/ coord.s0, coord.s1, /*tr*/ coord.s0 + 1, coord.s1, /*bl*/ coord.s0, coord.s1 + 1, /*br*/ coord.s0 + 1, coord.s1 + 1);
-}
-
-/** Returns the current thread coordinates. */
-inline const float2 get_current_coords_quantized()
-{
- return (float2)(get_global_id(0) * 4, get_global_id(1));
-}
-
-/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values
- *
- * @param[in] in Pointer to the source image.
- * @param[in] coords Vector of four 2D coordinates. Even pos is x and odd y.
- * @param[in] width Width of the image
- * @param[in] height Height of the image
- * @param[in] border_size Border size
- * @param[in] scale Scale value
- * @param[in] offset_qasymm Offset value
- */
-inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_with_border_quantized(const Image *in, const float8 coords, const float width, const float height, const float border_size,
- const float scale, const int offset_qasymm)
-{
- // If any of the 4 texels is out of the image's boundaries we use the border value (REPLICATE or CONSTANT) for any texel out of the image.
-
- // Sets the 4x4 coordinates for each of the four input texels
- const float8 fc = floor(coords);
- const float16 c1 = (float16)(
- clamp_to_border_with_size_quantized(get_neighbour_coords_quantized((float2)(fc.s0, fc.s1)), width, height, border_size),
- clamp_to_border_with_size_quantized(get_neighbour_coords_quantized((float2)(fc.s2, fc.s3)), width, height, border_size));
- const float16 c2 = (float16)(
- clamp_to_border_with_size_quantized(get_neighbour_coords_quantized((float2)(fc.s4, fc.s5)), width, height, border_size),
- clamp_to_border_with_size_quantized(get_neighbour_coords_quantized((float2)(fc.s6, fc.s7)), width, height, border_size));
-
- // Loads the values from the input image
- const int16 t = (int16)(
- /* tl, tr, bl, br */
- * ((__global DATA_TYPE *)offset(in, c1.s0, c1.s1)), *((__global DATA_TYPE *)offset(in, c1.s2, c1.s3)),
- *((__global DATA_TYPE *)offset(in, c1.s4, c1.s5)), *((__global DATA_TYPE *)offset(in, c1.s6, c1.s7)),
- *((__global DATA_TYPE *)offset(in, c1.s8, c1.s9)), *((__global DATA_TYPE *)offset(in, c1.sa, c1.sb)),
- *((__global DATA_TYPE *)offset(in, c1.sc, c1.sd)), *((__global DATA_TYPE *)offset(in, c1.se, c1.sf)),
- *((__global DATA_TYPE *)offset(in, c2.s0, c2.s1)), *((__global DATA_TYPE *)offset(in, c2.s2, c2.s3)),
- *((__global DATA_TYPE *)offset(in, c2.s4, c2.s5)), *((__global DATA_TYPE *)offset(in, c2.s6, c2.s7)),
- *((__global DATA_TYPE *)offset(in, c2.s8, c2.s9)), *((__global DATA_TYPE *)offset(in, c2.sa, c2.sb)),
- *((__global DATA_TYPE *)offset(in, c2.sc, c2.sd)), *((__global DATA_TYPE *)offset(in, c2.se, c2.sf)));
-
- const float16 inf32 = convert_float16(t - (int16)offset_qasymm) * (float16)scale;
-
- const float8 a = coords - fc;
- const float8 b = ((float8)(1.f)) - a;
- const float4 fr = (float4)(
- ((inf32.s0 * b.s0 * b.s1) + (inf32.s1 * a.s0 * b.s1) + (inf32.s2 * b.s0 * a.s1) + (inf32.s3 * a.s0 * a.s1)),
- ((inf32.s4 * b.s2 * b.s3) + (inf32.s5 * a.s2 * b.s3) + (inf32.s6 * b.s2 * a.s3) + (inf32.s7 * a.s2 * a.s3)),
- ((inf32.s8 * b.s4 * b.s5) + (inf32.s9 * a.s4 * b.s5) + (inf32.sa * b.s4 * a.s5) + (inf32.sb * a.s4 * a.s5)),
- ((inf32.sc * b.s6 * b.s7) + (inf32.sd * a.s6 * b.s7) + (inf32.se * b.s6 * a.s7) + (inf32.sf * a.s6 * a.s7)));
-
- const VEC_DATA_TYPE(DATA_TYPE, 4) res = CONVERT_SAT(convert_int4_sat_rtp(fr / scale) + offset_qasymm, VEC_DATA_TYPE(DATA_TYPE, 4));
-
- return res;
-}
-
-/** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values
- *
- * @param[in] in Pointer to the source image.
- * @param[in] coords Vector of four 2D coordinates. Even pos is x and odd y.
- * @param[in] width Width of the image
- * @param[in] height Height of the image
- * @param[in] scale Scale value
- * @param[in] offset_qasymm Offset value
- */
-inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_quantized(const Image *in, const float8 coords, const float width, const float height, const float scale, const int offset_qasymm)
-{
- return bilinear_interpolate_with_border_quantized(in, coords, width, height, 1, scale, offset_qasymm);
-}
diff --git a/src/core/gpu/cl/ClKernelLibrary.cpp b/src/core/gpu/cl/ClKernelLibrary.cpp
index 05beb5f63d..4a9ba874b1 100644
--- a/src/core/gpu/cl/ClKernelLibrary.cpp
+++ b/src/core/gpu/cl/ClKernelLibrary.cpp
@@ -394,7 +394,6 @@ const std::map<std::string, std::string> ClKernelLibrary::_kernel_program_map =
{ "reorg_layer_nchw", "nchw/reorg_layer.cl" },
{ "scale_nearest_neighbour_nchw", "nchw/scale.cl" },
{ "scale_bilinear_nchw", "nchw/scale.cl" },
- { "scale_bilinear_quantized_nchw", "nchw/scale_quantized.cl" },
{ "space_to_batch_nchw", "nchw/space_to_batch.cl" },
{ "space_to_batch_static_nchw", "nchw/space_to_batch.cl" },
{ "space_to_depth_nchw", "nchw/space_to_depth.cl" },
@@ -455,7 +454,6 @@ const std::map<std::string, std::string> ClKernelLibrary::_kernel_program_map =
{ "reorg_layer_nhwc", "nhwc/reorg_layer.cl" },
{ "scale_nearest_neighbour_nhwc", "nhwc/scale.cl" },
{ "scale_bilinear_nhwc", "nhwc/scale.cl" },
- { "scale_bilinear_quantized_nhwc", "nhwc/scale_quantized.cl" },
{ "space_to_batch_nhwc", "nhwc/space_to_batch.cl" },
{ "space_to_batch_static_nhwc", "nhwc/space_to_batch.cl" },
{ "space_to_depth_nhwc", "nhwc/space_to_depth.cl" },
@@ -827,10 +825,6 @@ const std::map<std::string, std::string> ClKernelLibrary::_program_source_map =
#include "./cl_kernels/nchw/scale.clembed"
},
{
- "nchw/scale_quantized.cl",
-#include "./cl_kernels/nchw/scale_quantized.clembed"
- },
- {
"nchw/space_to_batch.cl",
#include "./cl_kernels/nchw/space_to_batch.clembed"
},
@@ -926,10 +920,6 @@ const std::map<std::string, std::string> ClKernelLibrary::_program_source_map =
#include "./cl_kernels/nhwc/scale.clembed"
},
{
- "nhwc/scale_quantized.cl",
-#include "./cl_kernels/nhwc/scale_quantized.clembed"
- },
- {
"nhwc/space_to_batch.cl",
#include "./cl_kernels/nhwc/space_to_batch.clembed"
},
diff --git a/src/core/gpu/cl/kernels/ClScaleKernel.cpp b/src/core/gpu/cl/kernels/ClScaleKernel.cpp
index 57ca331539..ee4ee22aa0 100644
--- a/src/core/gpu/cl/kernels/ClScaleKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClScaleKernel.cpp
@@ -50,10 +50,10 @@ inline std::pair<float, float> calculate_scale_factors(const ITensorInfo *src, c
const unsigned int dst_width = dst->dimension(idx_width);
const unsigned int dst_height = dst->dimension(idx_height);
- float wr = arm_compute::scale_utils::calculate_resize_ratio(src_width, dst_width, align_corners);
- float hr = arm_compute::scale_utils::calculate_resize_ratio(src_height, dst_height, align_corners);
+ float scale_x = arm_compute::scale_utils::calculate_resize_ratio(src_width, dst_width, align_corners);
+ float scale_y = arm_compute::scale_utils::calculate_resize_ratio(src_height, dst_height, align_corners);
- return std::make_pair(wr, hr);
+ return std::make_pair(scale_x, scale_y);
}
Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info)
@@ -65,78 +65,22 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
ARM_COMPUTE_RETURN_ERROR_ON(dst == src);
ARM_COMPUTE_RETURN_ERROR_ON(info.align_corners && !arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy));
+ ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(src->data_type()) && !is_data_type_quantized_asymmetric(src->data_type()));
- float wr = 0.f;
- float hr = 0.f;
+ float scale_x = 0.f;
+ float scale_y = 0.f;
const DataLayout data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout;
- std::tie(wr, hr) = calculate_scale_factors(src, dst, data_layout, info.align_corners);
+ std::tie(scale_x, scale_y) = calculate_scale_factors(src, dst, data_layout, info.align_corners);
- ARM_COMPUTE_RETURN_ERROR_ON(info.interpolation_policy == InterpolationPolicy::AREA && (wr > 1.f || hr > 1.f));
+ ARM_COMPUTE_RETURN_ERROR_ON(info.interpolation_policy == InterpolationPolicy::AREA && (scale_x > 1.f || scale_y > 1.f));
return Status{};
}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info, BorderSize &border)
-{
- Window win{};
- bool window_changed{};
- unsigned int num_elems_processed_per_iteration = 0;
- const DataLayout data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout;
-
- switch(data_layout)
- {
- case DataLayout::NCHW:
- {
- if(info.border_mode == BorderMode::UNDEFINED)
- {
- border = BorderSize(0);
- }
-
- num_elems_processed_per_iteration = 4;
- // Configure kernel window
- win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration));
- AccessWindowStatic input_access(src,
- -border.left, -border.top,
- src->dimension(0) + border.right,
- src->dimension(1) + border.bottom);
- AccessWindowHorizontal output_access(dst, 0, num_elems_processed_per_iteration);
-
- output_access.set_valid_region(win, calculate_valid_region_scale(*src,
- dst->tensor_shape(),
- info.interpolation_policy,
- info.sampling_policy,
- info.border_mode == BorderMode::UNDEFINED));
-
- window_changed = update_window_and_padding(win, input_access, output_access);
- }
- break;
- case DataLayout::NHWC:
- {
- // Configure kernel window
- win = calculate_max_window(*dst, Steps());
- }
- break;
- default:
- ARM_COMPUTE_ERROR("Data layout not supported");
- }
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
} // namespace
-BorderSize ClScaleKernel::border_size() const
-{
- return BorderSize(static_cast<size_t>(_data_layout == DataLayout::NCHW));
-}
-
Status ClScaleKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, info));
- const DataLayout data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout;
- BorderSize border = BorderSize(static_cast<size_t>(data_layout == DataLayout::NCHW));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), info, border).first);
-
return Status{};
}
@@ -153,37 +97,45 @@ void ClScaleKernel::configure(const CLCompileContext &compile_context, ITensorIn
// Info required for the static tuning
_data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout;
- float wr = 0.f;
- float hr = 0.f;
- std::tie(wr, hr) = calculate_scale_factors(src, dst, _data_layout, info.align_corners);
- const bool call_quantized_kernel = is_data_type_quantized_asymmetric(src->data_type()) && info.interpolation_policy == InterpolationPolicy::BILINEAR;
-
- // Compute actual border size
- BorderSize border = border_size();
const bool is_nhwc = _data_layout == DataLayout::NHWC;
+ float scale_x = 0.f;
+ float scale_y = 0.f;
+ std::tie(scale_x, scale_y) = calculate_scale_factors(src, dst, _data_layout, info.align_corners);
+ const bool is_qasymm_bilinear = is_data_type_quantized_asymmetric(src->data_type()) && info.interpolation_policy == InterpolationPolicy::BILINEAR;
+
// Area interpolation behaves as Nearest Neighbour in case of up-sampling
auto interpolation_policy_to_use = info.interpolation_policy;
- if(info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f)
+ if(info.interpolation_policy == InterpolationPolicy::AREA && scale_x <= 1.f && scale_y <= 1.f)
{
interpolation_policy_to_use = InterpolationPolicy::NEAREST_NEIGHBOR;
}
- // Configure kernel window
- auto win_config = validate_and_configure_window(src, dst, info, border);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
// Create kernel
+ const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+ const unsigned int src_width = src->dimension(idx_width);
+ const unsigned int src_height = src->dimension(idx_height);
+ const unsigned int dst_width = dst->dimension(idx_width);
+ const unsigned int vec_size = adjust_vec_size(is_nhwc ? 1 : 4, dst_width);
+ const unsigned int vec_size_leftover = (dst_width % vec_size);
+
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(info.constant_border_value, src->data_type()));
- build_opts.add_option("-DBORDER_SIZE=" + support::cpp11::to_string(border.right));
+ build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src_width));
+ build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src_height));
+ build_opts.add_option("-DSCALE_X=" + float_to_string_with_full_precision(scale_x));
+ build_opts.add_option("-DSCALE_Y=" + float_to_string_with_full_precision(scale_y));
+
build_opts.add_option_if(info.border_mode == BorderMode::REPLICATE, "-DBORDER_MODE_REPLICATE");
+ build_opts.add_option_if(info.border_mode == BorderMode::CONSTANT, "-DBORDER_MODE_CONSTANT");
+ build_opts.add_option_if(!is_nhwc, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size));
+ build_opts.add_option_if(!is_nhwc, "-DVEC_SIZE_LEFTOVER=" + ((vec_size_leftover == 0) ? support::cpp11::to_string(vec_size) : support::cpp11::to_string(vec_size_leftover)));
build_opts.add_option_if(is_nhwc, "-DDEPTH_OUT=" + support::cpp11::to_string(dst->dimension(2)));
build_opts.add_option_if_else(info.sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER", "-DSAMPLING_POLICY_TOP_LEFT");
build_opts.add_option_if(info.align_corners, "-DALIGN_CORNERS");
- if(call_quantized_kernel)
+ if(is_qasymm_bilinear)
{
const UniformQuantizationInfo qinfo = src->quantization_info().uniform();
build_opts.add_option("-DSCALE=" + support::cpp11::to_string(qinfo.scale));
@@ -191,26 +143,16 @@ void ClScaleKernel::configure(const CLCompileContext &compile_context, ITensorIn
}
std::string interpolation_name = string_from_interpolation_policy(interpolation_policy_to_use);
std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower);
- std::string kernel_name = "scale_" + interpolation_name;
- kernel_name += call_quantized_kernel ? "_quantized_" : "_";
+ std::string kernel_name = "scale_" + interpolation_name + "_";
kernel_name += lower_string(string_from_data_layout(_data_layout));
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
- if(is_nhwc)
- {
- ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
- }
- const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
- unsigned int idx = is_nhwc ? 2 * num_arguments_per_4D_tensor() : 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
- const unsigned int src_width = src->dimension(idx_width);
- const unsigned int dst_height = src->dimension(idx_height);
+ // Configure kernel window
+ Window win = calculate_max_window(*dst, Steps(vec_size));
+ ICLKernel::configure_internal(win);
- _kernel.setArg<float>(idx++, src_width);
- _kernel.setArg<float>(idx++, dst_height);
- _kernel.setArg<float>(idx++, wr);
- _kernel.setArg<float>(idx++, hr);
+ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
// Set config_id for enabling LWS tuning
_config_id = "scale_";
diff --git a/src/core/gpu/cl/kernels/ClScaleKernel.h b/src/core/gpu/cl/kernels/ClScaleKernel.h
index 8333c7d6c0..6674931296 100644
--- a/src/core/gpu/cl/kernels/ClScaleKernel.h
+++ b/src/core/gpu/cl/kernels/ClScaleKernel.h
@@ -59,7 +59,6 @@ public:
static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info);
// Inherited methods overridden:
- BorderSize border_size() const override;
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
private:
diff --git a/src/runtime/gpu/cl/operators/ClScale.cpp b/src/runtime/gpu/cl/operators/ClScale.cpp
index 4730c8a16e..5c8d754c7e 100644
--- a/src/runtime/gpu/cl/operators/ClScale.cpp
+++ b/src/runtime/gpu/cl/operators/ClScale.cpp
@@ -41,12 +41,7 @@ void ClScale::configure(const CLCompileContext &compile_context, ITensorInfo *sr
k->set_target(CLScheduler::get().target());
k->configure(compile_context, src, dst, info);
_kernel = std::move(k);
- if(!_kernel->border_size().empty())
- {
- auto b = std::make_unique<CLFillBorderKernel>();
- b->configure(compile_context, src, _kernel->border_size(), info.border_mode, info.constant_border_value);
- _border_handler = std::move(b);
- }
+
// Tune kernel
CLScheduler::get().tune_kernel_static(*_kernel);
}
@@ -59,10 +54,6 @@ Status ClScale::validate(const ITensorInfo *src, const ITensorInfo *dst, const S
void ClScale::run(ITensorPack &tensors)
{
ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
- if(!_kernel->border_size().empty())
- {
- CLScheduler::get().enqueue_op(*_border_handler.get(), tensors, false);
- }
CLScheduler::get().enqueue_op(*_kernel.get(), tensors);
}
} // namespace opencl
diff --git a/src/runtime/gpu/cl/operators/ClScale.h b/src/runtime/gpu/cl/operators/ClScale.h
index 905c43a41c..0ff78640f7 100644
--- a/src/runtime/gpu/cl/operators/ClScale.h
+++ b/src/runtime/gpu/cl/operators/ClScale.h
@@ -34,7 +34,6 @@ namespace opencl
{
/** Basic function to simulate a scale layer. This function calls the following OpenCL kernels:
*
- * -# @ref CLFillBorderKernel (executed if padding size is different from zero)
* -# @ref kernels::ClScaleKernel
*/
class ClScale : public IClOperator
@@ -61,9 +60,6 @@ public:
// Inherited method overridden
void run(ITensorPack &tensors) override;
-
-protected:
- std::unique_ptr<ICLKernel> _border_handler{ nullptr };
};
} // namespace opencl
} // namespace arm_compute
diff --git a/tests/validation/CL/Scale.cpp b/tests/validation/CL/Scale.cpp
index 2b34f1f353..845408a0b2 100644
--- a/tests/validation/CL/Scale.cpp
+++ b/tests/validation/CL/Scale.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -186,16 +186,6 @@ TEST_CASE(AlignedCornerNotSupported, framework::DatasetMode::ALL)
ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
}
-TEST_CASE(WindowShrink, framework::DatasetMode::ALL)
-{
- const auto input = TensorInfo{ TensorShape(37U, 37U, 2U), 1, DataType::F32 };
- const auto output = TensorInfo{ TensorShape(39U, 55U, 2U), 1, DataType::F32 };
- Status result{};
-
- result = CLScale::validate(&input.clone()->set_is_resizable(false), &output.clone()->set_is_resizable(false), ScaleKernelInfo{ default_interpolation_policy, default_border_mode });
- ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
-}
-
TEST_CASE(IncorrectScaleFactor, framework::DatasetMode::ALL)
{
const auto input = TensorInfo{ TensorShape(28U, 33U, 2U), 1, DataType::F32 };