//
// This confidential and proprietary software may be used only as
// authorised by a licensing agreement from ARM Limited
// (C) COPYRIGHT 2020-2021 ARM Limited
// ALL RIGHTS RESERVED
// The entire notice above must be reproduced on all authorised
// copies and copies may only be made to the extent permitted
// by a licensing agreement from ARM Limited.

=== Image Operators

==== RESIZE

Resizes a tensor. Resize is only allowed in the H and W dimensions.


The height dimension is scaled by factor (scale_y_n/scale_y_d).
The width dimension is scaled by factor (scale_x_n/scale_x_d).

The NEAREST mode returns the value of the input tensor closest to the
calculated sample position for both floating-point and integer data formats.

Floating-point BILINEAR mode returns a bilinearly interpolated output value
based on the four closest input sample positions.

For integer BILINEAR interpolation mode, the output value must
be scaled by 1/(scale_y_n * scale_x_n) in a following operation to
complete the interpolation (for example with a RESCALE operator).

The following examples show practical uses of the parameters:

* For approximate uniform input sampling between (0, 0) and (IH - 1, IW - 1), set:
** scale_y_n/scale_y_d = (OH - 1)/(IH - 1) as integer ratios
** scale_x_n/scale_x_d = (OW - 1)/(IW - 1) as integer ratios
** offset_x = 0, offset_y = 0, border_x = 0, border_y = 0

* For power of two upscale [OH - 1,OW - 1] = (1 << k) * [IH - 1, IW - 1],
sampling between (0,0) and (IH - 1,IW - 1), set:
** scale_y_n = (1 << k), scale_y_d = 1, offset_y = 0, border_y = 0
** scale_x_n = (1 << k), scale_x_d = 1, offset_x = 0, border_x = 0

* For power of two upscale [OH,OW] = (1 << k) * [IH,IW],
sampling range approximately (-0.5, -0.5) to (IH - 0.5, IW - 0.5), set:
** scale_y_n = 2 << k, scale_y_d = 2, offset_y = -(1 << k) + 1, border_y = (1 << k) - 1
** scale_x_n = 2 << k, scale_x_d = 2, offset_x = -(1 << k) + 1, border_x = (1 << k) - 1

The output dimensions can be derived from the input dimensions by inverting
the scale as described in the pseudocode. The [border_y, border_x] values
adjust the output size to allow fractional sampling beyond integer
input position (IH - 1,IW - 1).

*Arguments:*

|===
|Argument|Type|Name|Shape|Description

|Input|in_t*|input|[N,IH,IW,C]|Input tensor
|Attribute|int16_t *|scale|[4]|[scale_y_n, scale_y_d, scale_x_n, scale_x_d]
|Attribute|int16_t *|offset|[2]|[offset_y, offset_x]
|Attribute|int32_t* |border|[2]|[border_y, border_x]
|Attribute|mode_t|mode|-|BILINEAR or NEAREST
|Output|out_t*|output|[N,OH,OW,C]|Output tensor
|===

*Operation Function*

[source,c++]
----
// RESIZE operation function.
// Maps every output element [n,oy,ox,c] to a sample position in the input,
// expressed in units of 1/scale_y_n (vertical) and 1/scale_x_n (horizontal),
// then either interpolates the four surrounding input values (BILINEAR) or
// selects the closest one (NEAREST).

// Ensure the image size is supported by GPU APIs and that for integer
// implementations, position * stride does not overflow int32_t.
ERROR_IF(max(OH,OW,IH,IW) >= 16384);
// Scale numerators and denominators must all be strictly positive.
ERROR_IF(scale_y_n <= 0 || scale_y_d <= 0 || scale_x_n <= 0 || scale_x_d <= 0);
// if in_t=int8_t ensure that an int32_t accumulator can be used
ERROR_IF(scale_y_n > (1 << 11) || scale_x_n > (1 << 11));
// set a consistent lower limit of 1/16 downscale to simplify implementations
ERROR_IF(scale_y_d >= 16 * scale_y_n || scale_x_d >= 16 * scale_x_n);
// Offsets are limited to the range [-scale_n, 16 * scale_n).
ERROR_IF(offset_y < -scale_y_n || offset_y >= 16 * scale_y_n);
ERROR_IF(offset_x < -scale_x_n || offset_x >= 16 * scale_x_n);
// Borders are limited to the range [-16 * scale_n, scale_n).
ERROR_IF(border_y < -16 * scale_y_n || border_y >= scale_y_n);
ERROR_IF(border_x < -16 * scale_x_n || border_x >= scale_x_n);
// The output size must equal the size derived from the input size, scale,
// offset and border.
ERROR_IF(OH != idiv_check((IH - 1) * scale_y_n - offset_y + border_y, scale_y_d) + 1);
ERROR_IF(OW != idiv_check((IW - 1) * scale_x_n - offset_x + border_x, scale_x_d) + 1);
for_each(0 <= n < N, 0 <= oy < OH, 0 <= ox < OW, 0 <= c < C) {
    out_t acc;
    resize_t dx, dy;
    resize_t unit_x, unit_y;

    // Weight that represents one whole input sample step: 1.0 for
    // floating-point, scale_n for integer (where fractions are kept in
    // units of 1/scale_n).
    unit_x = (is_floating_point(resize_t)) ? 1.0 : scale_x_n;
    unit_y = (is_floating_point(resize_t)) ? 1.0 : scale_y_n;

    // Sample position in units of 1/scale_n, and the input coordinate
    // at or immediately below it.
    int32_t y = oy * scale_y_d + offset_y;
    int32_t x = ox * scale_x_d + offset_x;
    int16_t iy = floor(y / scale_y_n);
    int16_t ix = floor(x / scale_x_n);

    // Fractional distance from (iy, ix) to the sample position:
    // in [0, 1) for floating-point, in [0, scale_n) for integer.
    if (is_floating_point(resize_t)) {
        dy = ((resize_t)y / (resize_t)scale_y_n) - iy;
        dx = ((resize_t)x / (resize_t)scale_x_n) - ix;
    } else {
        dy = y - iy * scale_y_n;
        dx = x - ix * scale_x_n;
    }
    // Note that -1 <= iy < IH and -1 <= ix < IW
    // Limit the two neighbouring coordinates on each axis to the valid
    // input range [0, IH - 1] and [0, IW - 1].
    int16_t iy0 = apply_max(iy, 0);
    int16_t iy1 = apply_min(iy + 1, IH - 1);
    int16_t ix0 = apply_max(ix, 0);
    int16_t ix1 = apply_min(ix + 1, IW - 1);
    if (mode==BILINEAR) {
        in_t v00 = tensor_read<in_t>(input, [N,IH,IW,C], [n,iy0,ix0,c]);
        in_t v01 = tensor_read<in_t>(input, [N,IH,IW,C], [n,iy0,ix1,c]);
        in_t v10 = tensor_read<in_t>(input, [N,IH,IW,C], [n,iy1,ix0,c]);
        in_t v11 = tensor_read<in_t>(input, [N,IH,IW,C], [n,iy1,ix1,c]);
        // Weighted sum of the four neighbours. For integer resize_t the
        // weights sum to scale_y_n * scale_x_n, so the result still has to
        // be scaled by 1/(scale_y_n * scale_x_n) in a following operation.
        acc  = v00 * (unit_y - dy) * (unit_x - dx);
        acc += v01 * (unit_y - dy) * dx;
        acc += v10 * dy * (unit_x - dx);
        acc += v11 * dy * dx;
        tensor_write<out_t>(output, [N,OH,OW,C], [n,oy,ox,c], acc);
    } else if (mode==NEAREST) {
        // Selected input coordinates (these shadow the outer iy, ix
        // within this branch).
        int32_t iy, ix;
        // Pick the upper neighbour when the fraction is at least one half.
        if (is_floating_point(resize_t)) {
            iy = (dy >= 0.5) ? iy1 : iy0;
            ix = (dx >= 0.5) ? ix1 : ix0;
        } else {
            iy = (2 * dy >= scale_y_n) ? iy1 : iy0;
            ix = (2 * dx >= scale_x_n) ? ix1 : ix0;
        }
        in_t v = tensor_read<in_t>(input, [N,IH,IW,C], [n,iy,ix,c]);
        tensor_write<out_t>(output, [N,OH,OW,C], [n,oy,ox,c], v);
    }
}
----

*Supported Data Types:*

|===
|Profile|Mode|resize_t|in_t|out_t

|Any|signed 8,  bilinear|int16_t|int8_t|int32_t
|Any|signed 8,  nearest |int16_t|int8_t|int8_t
|Any|signed 16, bilinear|int16_t|int16_t|int48_t
|Any|signed 16, nearest |int16_t|int16_t|int16_t
|MI,MT|fp16|fp32_t|fp16_t|fp16_t
|MI,MT|bf16|fp32_t|bf16_t|bf16_t
|MI,MT|fp32|fp32_t|fp32_t|fp32_t
|===

*Resize Modes:*
|===
|Mode|Description

|NEAREST|Nearest Neighbor
|BILINEAR|Bilinear interpolation
|===