author     Gian Marco Iodice <gianmarco.iodice@arm.com>    2021-09-08 17:14:19 +0100
committer  Gian Marco Iodice <gianmarco.iodice@arm.com>    2021-09-14 15:42:30 +0000
commit     7bc1a778520f3a8477d88819faeaca8ff77859fa (patch)
tree       2c31351dfd46fee7ebf0603346200e44adb01900
parent     67b9d7f3bff0d67a814b2d59571b0df3c0680498 (diff)
Optimize ClScaleKernel on NHWC (f32/f16/int8)
The new kernel performs the computation on multiple elements per work item. The OpenCL kernel has been re-implemented using the new TILE macros.

Resolves COMPMID-4803, COMPMID-4804

Change-Id: Iac8fead65e21b64567a05dbc4fbaa61d362443f9
Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6235
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Giorgio Arena <giorgio.arena@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r--    src/core/CL/cl_kernels/nhwc/scale.cl    345
-rw-r--r--    src/gpu/cl/kernels/ClScaleKernel.cpp     89
2 files changed, 293 insertions, 141 deletions
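
As a reader's aid (not part of the patch): the sketch below is a minimal, hypothetical OpenCL kernel illustrating the idea behind the rework. In NHWC the channels of a pixel are contiguous in memory, so each work item can load and store N0 of them with one vectorized access instead of handling a single element; the real kernel expresses this with the TILE / T_LOAD_NHWC_WITH_DILATION / T_STORE_INDIRECT_WIDTH_SELECT helpers and also handles PARTIAL_N0 leftovers, batching and border policies, none of which are shown here.

// Hypothetical, simplified nearest-neighbour NHWC scale: N0 fixed to 4, F32 only,
// top-left sampling, no leftover handling, no batching. Illustration only.
#define N0 4
__kernel void scale_nearest_nhwc_sketch(__global const float *src, __global float *dst,
                                        int src_w, int src_h, int dst_w, int channels,
                                        float scale_x, float scale_y)
{
    const int c  = get_global_id(0) * N0; // first of the N0 channels handled by this work item
    const int xo = get_global_id(1);      // output x
    const int yo = get_global_id(2);      // output y

    // Nearest-neighbour source coordinates, clamped to the input borders
    const int xi = clamp((int)(xo * scale_x), 0, src_w - 1);
    const int yi = clamp((int)(yo * scale_y), 0, src_h - 1);

    // NHWC: one vector load/store moves the N0 contiguous channels of the pixel
    const float4 v = vload4(0, src + ((yi * src_w + xi) * channels + c));
    vstore4(v, 0, dst + ((yo * dst_w + xo) * channels + c));
}
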
diff --git a/src/core/CL/cl_kernels/nhwc/scale.cl b/src/core/CL/cl_kernels/nhwc/scale.cl
index 69cbbcd5f3..21579aed9f 100644
--- a/src/core/CL/cl_kernels/nhwc/scale.cl
+++ b/src/core/CL/cl_kernels/nhwc/scale.cl
@@ -22,154 +22,257 @@
* SOFTWARE.
*/
#include "helpers.h"
+#include "tile_helpers.h"
-#if defined(DEPTH_OUT)
-/** Performs scale on an image interpolating with the NEAREAST NEIGHBOUR method. Input and output are single channel F32. (NHWC)
+//! @cond Doxygen_Suppress
+/** Performs scale on a tensor by interpolating with the NEAREST NEIGHBOUR method. (NHWC)
*
* @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
- * @note Output tensor's depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH=16
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The spatial dimensions of the destination tensor must be passed at compile time using -DDST_WIDTH and -DDST_HEIGHT (e.g. -DDST_WIDTH=96, -DDST_HEIGHT=64)
+ * @note The tensor type (only "BUFFER" is supported) of the source tensor must be passed at compile time using -DSRC_TENSOR_TYPE (e.g. -DSRC_TENSOR_TYPE=BUFFER)
+ * @note The tensor type (only "BUFFER" is supported) of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. -DDST_TENSOR_TYPE=BUFFER)
+ * @note The data type of the source tensor must be passed at compile time using -DSRC_DATA_TYPE (e.g. -DSRC_DATA_TYPE=float)
+ * @note The data type of the destination tensor must be passed at compile time using -DDST_DATA_TYPE (e.g. -DDST_DATA_TYPE=float)
+ * @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2)
+ * @note The border value must be passed at compile time using -DCONSTANT_VALUE (e.g. -DCONSTANT_VALUE=0)
+ * @note In case of F32/F16, -DIS_FLOATING_POINT must be passed at compile time
+ * @note The scale value to apply on the source width must be passed at compile time using -DSCALE_X (e.g., -DSCALE_X=0.5)
+ * @note The scale value to apply on the source height must be passed at compile time using -DSCALE_Y (e.g., -DSCALE_Y=0.5)
+ * @note If the source tensor has more than 3 dimensions, -DBATCHED_EXECUTION must be passed at compile time
*
- * @param[in] in_ptr Pointer to the source image. Supported data types: U8/S16/F16/F32.
- * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in_stride_z Stride of the source image in Z dimension (in bytes)
- * @param[in] in_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p in_ptr
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] out_stride_z Stride of the destination image in Z dimension (in bytes)
- * @param[in] out_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S16/F16/F32.
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
+ //! @endcond
__kernel void scale_nearest_neighbour_nhwc(
- TENSOR4D_DECLARATION(in),
- TENSOR4D_DECLARATION(out))
+ TENSOR4D(src, SRC_TENSOR_TYPE),
+ TENSOR4D(dst, DST_TENSOR_TYPE))
{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT);
+ // All the tensor dimensions are passed at compile time.
+ // In case of dynamic tensor support, the following dimensions should be passed as function arguments.
+#define _ISRC_WIDTH SRC_WIDTH
+#define _ISRC_HEIGHT SRC_HEIGHT
+#define _IDST_WIDTH DST_WIDTH
+#define _IDST_HEIGHT DST_HEIGHT
+#define _ISCALE_X SCALE_X
+#define _ISCALE_Y SCALE_Y
+
+ const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM
+ const int xo = GET_SPATIAL_IDX(1, 1, 0); // WIDTH
+#if defined(BATCHED_EXECUTION)
+ const int yo = GET_SPATIAL_IDX(2, 1, 0) % _IDST_HEIGHT; // HEIGHT
+ const int bout = GET_SPATIAL_IDX(2, 1, 0) / _IDST_HEIGHT; // BATCH SIZE IDX
+#else // defined(BATCHED_EXECUTION)
+ const int yo = GET_SPATIAL_IDX(2, 1, 0); // HEIGHT
+ const int bout = 0; // BATCH SIZE IDX
+#endif // defined(BATCHED_EXECUTION)
#ifdef SAMPLING_POLICY_TOP_LEFT
- float new_x = get_global_id(1) * SCALE_X;
- float new_y = (get_global_id(2) % DEPTH_OUT) * SCALE_Y;
+ float xi_f = (xo * (float)SCALE_X);
+ float yi_f = (yo * (float)SCALE_Y);
#elif SAMPLING_POLICY_CENTER
- float new_x = (get_global_id(1) + 0.5f) * SCALE_X;
- float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * SCALE_Y;
-#else /* SAMPLING_POLICY */
+ float xi_f = ((xo + 0.5f) * (float)SCALE_X);
+ float yi_f = ((yo + 0.5f) * (float)SCALE_Y);
+#else // SAMPLING_POLICY
#error("Unsupported sampling policy");
-#endif /* SAMPLING_POLICY */
+#endif // SAMPLING_POLICY
+
#ifdef ALIGN_CORNERS
- new_x = round(new_x);
- new_y = round(new_y);
-#endif /* ALIGN_CORNERS */
- const float clamped_x = clamp(new_x, 0.0f, (float)SRC_WIDTH - 1);
- const float clamped_y = clamp(new_y, 0.0f, (float)SRC_HEIGHT - 1);
+ xi_f = round(xi_f);
+ yi_f = round(yi_f);
+#endif // ALIGN_CORNERS
+
+ const int xi0 = clamp((int)xi_f, 0, _ISRC_WIDTH - 1);
+ const int yi0 = clamp((int)yi_f, 0, _ISRC_HEIGHT - 1);
+
+ TILE(SRC_DATA_TYPE, 1, N0, in00);
+
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi0, xi0, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, false, in00);
+
+ TILE(uint, 1, 1, dst_indirect_y);
- *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT)));
+ // Calculate the destination indirect Y
+ dst_indirect_y[0].v = xo + (yo * (int)(_IDST_WIDTH)) + bout * (int)(_IDST_WIDTH * _IDST_HEIGHT);
+
+ bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DST_DATA_TYPE, 1, N0, PARTIAL_N0, DST_TENSOR_TYPE, dst, cout, dst_stride_y, x_cond, in00, dst_indirect_y);
}
-/** Performs scale on an image interpolating with the BILINEAR method. (NHWC)
+//! @cond Doxygen_Suppress
+/** Performs scale on a tensor by interpolating with the BILINEAR method. (NHWC)
*
- * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
* @note If border mode replicate is used, it should be passed as -DBORDER_MODE_REPLICATE
- * @note Output tensor's depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH=16
- * @note The value to be used at the edges of the images shoud be given as a preprocessor argument using -DCONSTANT_VALUE=value.
+ * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The spatial dimensions of the destination tensor must be passed at compile time using -DDST_WIDTH and -DDST_HEIGHT (e.g. -DDST_WIDTH=96, -DDST_HEIGHT=64)
+ * @note The tensor type (only "BUFFER" is supported) of the source tensor must be passed at compile time using -DSRC_TENSOR_TYPE (e.g. -DSRC_TENSOR_TYPE=BUFFER)
+ * @note The tensor type (only "BUFFER" is supported) of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. -DDST_TENSOR_TYPE=BUFFER)
+ * @note The data type of the source tensor must be passed at compile time using -DSRC_DATA_TYPE (e.g. -DSRC_DATA_TYPE=float)
+ * @note The data type of the destination tensor must be passed at compile time using -DDST_DATA_TYPE (e.g. -DDST_DATA_TYPE=float)
+ * @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2)
+ * @note The border value must be passed at compile time using -DCONSTANT_VALUE (e.g. -DCONSTANT_VALUE=0)
+ * @note In case of F32/F16, -DIS_FLOATING_POINT must be passed at compile time
+ * @note The scale value to apply on the source width must be passed at compile time using -DSCALE_X (e.g., -DSCALE_X=0.5)
+ * @note The scale value to apply on the source height must be passed at compile time using -DSCALE_Y (e.g., -DSCALE_Y=0.5)
+ * @note If the source tensor has more than 3 dimensions, -DBATCHED_EXECUTION must be passed at compile time
*
- * @param[in] in_ptr Pointer to the source image. Supported data types: U8/S16/F16/F32.
- * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in_stride_z Stride of the source image in Z dimension (in bytes)
- * @param[in] in_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p in_ptr
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] out_stride_z Stride of the destination image in Z dimension (in bytes)
- * @param[in] out_step_z dst_stride_y * number of elements along Z processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @note In case of QASYMM8, the following extra information must be passed at compile time:
+ * - The source offset e.g. -DOFFSET=4
+ * - The source scale e.g. -DSCALE=4
*
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S16/F16/F32.
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
+ //! @endcond
__kernel void scale_bilinear_nhwc(
- TENSOR4D_DECLARATION(in),
- TENSOR4D_DECLARATION(out))
+ TENSOR4D(src, SRC_TENSOR_TYPE),
+ TENSOR4D(dst, DST_TENSOR_TYPE))
{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT);
+ // All the tensor dimensions are passed at compile time.
+ // In case of dynamic tensor support, the following dimensions should be passed as function arguments.
+#define _ISRC_WIDTH SRC_WIDTH
+#define _ISRC_HEIGHT SRC_HEIGHT
+#define _IDST_WIDTH DST_WIDTH
+#define _IDST_HEIGHT DST_HEIGHT
+#define _ISCALE_X SCALE_X
+#define _ISCALE_Y SCALE_Y
+
+ const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM
+ const int xo = GET_SPATIAL_IDX(1, 1, 0); // WIDTH
+#if defined(BATCHED_EXECUTION)
+ const int yo = GET_SPATIAL_IDX(2, 1, 0) % _IDST_HEIGHT; // HEIGHT
+ const int bout = GET_SPATIAL_IDX(2, 1, 0) / _IDST_HEIGHT; // BATCH SIZE IDX
+#else // defined(BATCHED_EXECUTION)
+ const int yo = GET_SPATIAL_IDX(2, 1, 0); // HEIGHT
+ const int bout = 0; // BATCH SIZE IDX
+#endif // defined(BATCHED_EXECUTION)
#ifdef SAMPLING_POLICY_TOP_LEFT
- const float new_x = get_global_id(1) * SCALE_X;
- const float new_y = (get_global_id(2) % DEPTH_OUT) * SCALE_Y;
+ float xi_f = (xo * (float)SCALE_X);
+ float yi_f = (yo * (float)SCALE_Y);
#elif SAMPLING_POLICY_CENTER
- const float new_x = (get_global_id(1) + 0.5f) * SCALE_X - 0.5f;
- const float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * SCALE_Y - 0.5f;
-#else /* SAMPLING_POLICY */
+ float xi_f = ((xo + 0.5f) * (float)SCALE_X - 0.5f);
+ float yi_f = ((yo + 0.5f) * (float)SCALE_Y - 0.5f);
+#else // SAMPLING_POLICY
#error("Unsupported sampling policy");
-#endif /* SAMPLING_POLICY */
+#endif // SAMPLING_POLICY
+
+ const int xi = (int)floor(xi_f);
+ const int yi = (int)floor(yi_f);
- const float new_xf = floor(new_x);
- const float new_yf = floor(new_y);
- const float clamped_x = clamp(new_xf, 0.0f, SRC_WIDTH - 1.f);
- const float clamped_x1 = clamp(new_xf + 1, 0.0f, SRC_WIDTH - 1.f);
- const float clamped_y = clamp(new_yf, 0.0f, SRC_HEIGHT - 1.f);
- const float clamped_y1 = clamp(new_yf + 1, 0.0f, SRC_HEIGHT - 1.f);
+ TILE(SRC_DATA_TYPE, 1, N0, in00);
+ TILE(SRC_DATA_TYPE, 1, N0, in01);
+ TILE(SRC_DATA_TYPE, 1, N0, in10);
+ TILE(SRC_DATA_TYPE, 1, N0, in11);
-#if defined(OFFSET) && defined(SCALE)
-#define IN_DATA_TYPE int
-#else // defined(OFFSET) && defined(SCALE)
-#define IN_DATA_TYPE float
-#endif // defined(OFFSET) && defined(SCALE)
+ // Initialize the tiles to CONSTANT_VALUE
+ in00[0].v = CONSTANT_VALUE;
+ in01[0].v = CONSTANT_VALUE;
+ in10[0].v = CONSTANT_VALUE;
+ in11[0].v = CONSTANT_VALUE;
#ifndef BORDER_MODE_REPLICATE
- const bool check_x = (0.f <= new_xf && new_xf < (float)SRC_WIDTH);
- const bool check_x1 = (-1.f <= new_xf && new_xf < SRC_WIDTH - 1.f);
- const bool check_y = (0.f <= new_yf && new_yf < (float)SRC_HEIGHT);
- const bool check_y1 = (-1.f <= new_yf && new_yf < SRC_HEIGHT - 1.f);
-
- const IN_DATA_TYPE ins_0 = select((IN_DATA_TYPE)(CONSTANT_VALUE), (IN_DATA_TYPE)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y),
- (get_global_id(2) / DEPTH_OUT)))),
- check_x && check_y);
- const IN_DATA_TYPE ins_1 = select((IN_DATA_TYPE)(CONSTANT_VALUE), (IN_DATA_TYPE)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y),
- (get_global_id(2) / DEPTH_OUT)))),
- check_x1 && check_y);
- const IN_DATA_TYPE ins_2 = select((IN_DATA_TYPE)(CONSTANT_VALUE), (IN_DATA_TYPE)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1),
- (get_global_id(2) / DEPTH_OUT)))),
- check_x && check_y1);
- const IN_DATA_TYPE ins_3 = select((IN_DATA_TYPE)(CONSTANT_VALUE), (IN_DATA_TYPE)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1),
- (get_global_id(2) / DEPTH_OUT)))),
- check_x1 && check_y1);
- VEC_DATA_TYPE(IN_DATA_TYPE, 4)
- ins = (VEC_DATA_TYPE(IN_DATA_TYPE, 4))(ins_0, ins_1, ins_2, ins_3);
-#else /* BORDER_MODE_REPLICATE */
- VEC_DATA_TYPE(IN_DATA_TYPE, 4)
- ins = (VEC_DATA_TYPE(IN_DATA_TYPE, 4))(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
- *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
- *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))),
- *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))));
-#endif /* BORDER_MODE_REPLICATE */
-
- const float a = new_x - new_xf;
- const float b = 1.f - a;
- const float a1 = new_y - new_yf;
- const float b1 = 1.f - a1;
-
-#if defined(OFFSET) && defined(SCALE)
- const float4 insf32 = convert_float4(ins - (int4)OFFSET) * (float4)SCALE;
- const float fr = ((insf32.s0 * b * b1) + (insf32.s1 * a * b1) + (insf32.s2 * b * a1) + (insf32.s3 * a * a1));
- DATA_TYPE res = CONVERT_SAT(convert_int_sat_rtp(fr / SCALE) + OFFSET, DATA_TYPE);
-
- *((__global DATA_TYPE *)out.ptr) = res;
-#else // defined(OFFSET) && defined(SCALE)
- const float fr = ((ins.s0 * b * b1) + (ins.s1 * a * b1) + (ins.s2 * b * a1) + (ins.s3 * a * a1));
-
- *((__global DATA_TYPE *)out.ptr) = CONVERT(fr, DATA_TYPE);
-#endif // defined(OFFSET) && defined(SCALE)
-}
-#endif /* defined(DEPTH_OUT) */
\ No newline at end of file
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi, xi, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, true, in00);
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi, xi + 1, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, true, in01);
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi + 1, xi, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, true, in10);
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi + 1, xi + 1, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, true, in11);
+#else // BORDER_MODE_REPLICATE
+ const int xi0 = clamp(xi, 0, _ISRC_WIDTH - 1);
+ const int yi0 = clamp(yi, 0, _ISRC_HEIGHT - 1);
+ const int xi1 = clamp(xi + 1, 0, _ISRC_WIDTH - 1);
+ const int yi1 = clamp(yi + 1, 0, _ISRC_HEIGHT - 1);
+
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi0, xi0, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, false, in00);
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi0, xi1, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, false, in01);
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi1, xi0, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, false, in10);
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi1, xi1, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, false, in11);
+#endif // BORDER_MODE_REPLICATE
+
+ TILE(DST_DATA_TYPE, 1, N0, out);
+
+#if defined(IS_FLOATING_POINT)
+ const SRC_DATA_TYPE a = (SRC_DATA_TYPE)(xi_f - (float)xi);
+ const SRC_DATA_TYPE b = (SRC_DATA_TYPE)(1.f - a);
+ const SRC_DATA_TYPE a1 = (SRC_DATA_TYPE)(yi_f - (float)yi);
+ const SRC_DATA_TYPE b1 = (SRC_DATA_TYPE)(1.f - a1);
+
+ // Calculate the output
+ out[0].v = ((in00[0].v * b * b1) + (in01[0].v * a * b1) + (in10[0].v * b * a1) + (in11[0].v * a * a1));
+#else // defined(IS_FLOATING_POINT)
+ TILE(float, 1, N0, out_f);
+ TILE(float, 1, N0, in00_f);
+ TILE(float, 1, N0, in01_f);
+ TILE(float, 1, N0, in10_f);
+ TILE(float, 1, N0, in11_f);
+
+ const float a = (xi_f - (float)xi);
+ const float b = (1.f - a);
+ const float a1 = (yi_f - (float)yi);
+ const float b1 = (1.f - a1);
+
+ // Dequantize
+ LOOP_UNROLLING(int, n0, 0, 1, N0,
+ {
+ in00_f[0].s[n0] = ((float)in00[0].s[n0] - (float)OFFSET) * (float)SCALE;
+ in01_f[0].s[n0] = ((float)in01[0].s[n0] - (float)OFFSET) * (float)SCALE;
+ in10_f[0].s[n0] = ((float)in10[0].s[n0] - (float)OFFSET) * (float)SCALE;
+ in11_f[0].s[n0] = ((float)in11[0].s[n0] - (float)OFFSET) * (float)SCALE;
+ })
+
+ // Calculate the output in the floating-point domain
+ out_f[0].v = ((in00_f[0].v * b * b1) + (in01_f[0].v * a * b1) + (in10_f[0].v * b * a1) + (in11_f[0].v * a * a1));
+
+ // Quantize
+ LOOP_UNROLLING(int, n0, 0, 1, N0,
+ {
+ out[0].s[n0] = CONVERT_SAT(out_f[0].s[n0] / (float)SCALE + (float)OFFSET, DST_DATA_TYPE);
+ })
+#endif // defined(IS_FLOATING_POINT)
+
+ TILE(uint, 1, 1, dst_indirect_y);
+
+ // Calculate the destination indirect Y
+ dst_indirect_y[0].v = xo + (yo * (int)(_IDST_WIDTH)) + bout * (int)(_IDST_WIDTH * _IDST_HEIGHT);
+
+ bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DST_DATA_TYPE, 1, N0, PARTIAL_N0, DST_TENSOR_TYPE, dst, cout, dst_stride_y, x_cond, out, dst_indirect_y);
+}
\ No newline at end of file
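
A brief note on the quantized path added above: scale_bilinear_nhwc dequantizes the four neighbours with (value - OFFSET) * SCALE, blends them in the floating-point domain and requantizes with saturation. The following is a scalar sketch of that sequence (illustrative only; the real kernel vectorizes it over N0 channels with LOOP_UNROLLING and CONVERT_SAT):

// Scalar model of the dequantize -> bilinear blend -> requantize sequence. Illustration only.
// a  = horizontal fraction (xi_f - xi), a1 = vertical fraction (yi_f - yi).
uchar bilinear_qasymm8(uchar p00, uchar p01, uchar p10, uchar p11,
                       float a, float a1, float offset, float scale)
{
    const float b  = 1.f - a;
    const float b1 = 1.f - a1;

    // Dequantize the four neighbours using the source tensor's uniform quantization parameters
    const float f00 = ((float)p00 - offset) * scale;
    const float f01 = ((float)p01 - offset) * scale;
    const float f10 = ((float)p10 - offset) * scale;
    const float f11 = ((float)p11 - offset) * scale;

    // Bilinear blend in the floating-point domain (same weights as the kernel above)
    const float out_f = (f00 * b * b1) + (f01 * a * b1) + (f10 * b * a1) + (f11 * a * a1);

    // Requantize with saturation
    return convert_uchar_sat(out_f / scale + offset);
}
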
diff --git a/src/gpu/cl/kernels/ClScaleKernel.cpp b/src/gpu/cl/kernels/ClScaleKernel.cpp
index 9307f7d4fb..d63c0e1754 100644
--- a/src/gpu/cl/kernels/ClScaleKernel.cpp
+++ b/src/gpu/cl/kernels/ClScaleKernel.cpp
@@ -114,33 +114,82 @@ void ClScaleKernel::configure(const CLCompileContext &compile_context, ITensorIn
// Create kernel
const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+ const int idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
const unsigned int src_width = src->dimension(idx_width);
const unsigned int src_height = src->dimension(idx_height);
+ const unsigned int src_channel = src->dimension(idx_channel);
const unsigned int dst_width = dst->dimension(idx_width);
- const unsigned int vec_size = adjust_vec_size(is_nhwc ? 1 : 4, dst_width);
- const unsigned int vec_size_leftover = (dst_width % vec_size);
+ const unsigned int dst_height = dst->dimension(idx_height);
+ const unsigned int dst_channels = dst->dimension(idx_channel);
+ unsigned int vec_size = 0;
+ unsigned int vec_size_leftover = 0;
CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
- build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(info.constant_border_value, src->data_type()));
- build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src_width));
- build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src_height));
- build_opts.add_option("-DSCALE_X=" + float_to_string_with_full_precision(scale_x));
- build_opts.add_option("-DSCALE_Y=" + float_to_string_with_full_precision(scale_y));
-
- build_opts.add_option_if(info.border_mode == BorderMode::REPLICATE, "-DBORDER_MODE_REPLICATE");
- build_opts.add_option_if(info.border_mode == BorderMode::CONSTANT, "-DBORDER_MODE_CONSTANT");
- build_opts.add_option_if(!is_nhwc, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size));
- build_opts.add_option_if(!is_nhwc, "-DVEC_SIZE_LEFTOVER=" + ((vec_size_leftover == 0) ? support::cpp11::to_string(vec_size) : support::cpp11::to_string(vec_size_leftover)));
- build_opts.add_option_if(is_nhwc, "-DDEPTH_OUT=" + support::cpp11::to_string(dst->dimension(2)));
- build_opts.add_option_if_else(info.sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER", "-DSAMPLING_POLICY_TOP_LEFT");
- build_opts.add_option_if(info.align_corners, "-DALIGN_CORNERS");
- if(is_qasymm_bilinear)
+ if(_data_layout == DataLayout::NHWC)
{
- const UniformQuantizationInfo qinfo = src->quantization_info().uniform();
- build_opts.add_option("-DSCALE=" + support::cpp11::to_string(qinfo.scale));
- build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(qinfo.offset));
+ vec_size = adjust_vec_size(src->data_type() == DataType::F32 ? 4 : 8, dst_channels);
+ vec_size_leftover = dst_channels % vec_size;
+ build_opts.add_option("-DSRC_TENSOR_TYPE=BUFFER");
+ build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src_width));
+ build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src_height));
+ build_opts.add_option("-DSRC_CHANNELS=" + support::cpp11::to_string(src_channel));
+ build_opts.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
+ build_opts.add_option("-DDST_TENSOR_TYPE=BUFFER");
+ build_opts.add_option("-DDST_WIDTH=" + support::cpp11::to_string(dst_width));
+ build_opts.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst_height));
+ build_opts.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst_channels));
+ build_opts.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type()));
+ build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(info.constant_border_value, src->data_type()));
+ build_opts.add_option("-DSCALE_X=" + float_to_string_with_full_precision(scale_x));
+ build_opts.add_option("-DSCALE_Y=" + float_to_string_with_full_precision(scale_y));
+ build_opts.add_option("-DN0=" + support::cpp11::to_string(vec_size));
+ build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(vec_size_leftover));
+ build_opts.add_option_if(src->num_dimensions() > 3, "-DBATCHED_EXECUTION");
+ build_opts.add_option_if(info.border_mode == BorderMode::REPLICATE, "-DBORDER_MODE_REPLICATE");
+ build_opts.add_option_if(info.border_mode == BorderMode::CONSTANT, "-DBORDER_MODE_CONSTANT");
+ build_opts.add_option_if(info.align_corners, "-DALIGN_CORNERS");
+ build_opts.add_option_if(is_data_type_float(src->data_type()), "-DIS_FLOATING_POINT");
+ build_opts.add_option_if_else(info.sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER", "-DSAMPLING_POLICY_TOP_LEFT");
+ if(is_qasymm_bilinear)
+ {
+ const UniformQuantizationInfo qinfo = src->quantization_info().uniform();
+ build_opts.add_option("-DSCALE=" + support::cpp11::to_string(qinfo.scale));
+ build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(qinfo.offset));
+ }
+ else
+ {
+ build_opts.add_option("-DSCALE=" + support::cpp11::to_string(1));
+ build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(0));
+ }
}
+ else if(_data_layout == DataLayout::NCHW)
+ {
+ vec_size = adjust_vec_size(4, dst_width);
+ vec_size_leftover = dst_width % vec_size;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
+ build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(info.constant_border_value, src->data_type()));
+ build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src_width));
+ build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src_height));
+ build_opts.add_option("-DSCALE_X=" + float_to_string_with_full_precision(scale_x));
+ build_opts.add_option("-DSCALE_Y=" + float_to_string_with_full_precision(scale_y));
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size));
+ build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + ((vec_size_leftover == 0) ? support::cpp11::to_string(vec_size) : support::cpp11::to_string(vec_size_leftover)));
+ build_opts.add_option_if(info.border_mode == BorderMode::REPLICATE, "-DBORDER_MODE_REPLICATE");
+ build_opts.add_option_if(info.border_mode == BorderMode::CONSTANT, "-DBORDER_MODE_CONSTANT");
+ build_opts.add_option_if(info.align_corners, "-DALIGN_CORNERS");
+ build_opts.add_option_if_else(info.sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER", "-DSAMPLING_POLICY_TOP_LEFT");
+ if(is_qasymm_bilinear)
+ {
+ const UniformQuantizationInfo qinfo = src->quantization_info().uniform();
+ build_opts.add_option("-DSCALE=" + support::cpp11::to_string(qinfo.scale));
+ build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(qinfo.offset));
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported data layout");
+ }
+
std::string interpolation_name = string_from_interpolation_policy(interpolation_policy_to_use);
std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower);
std::string kernel_name = "scale_" + interpolation_name + "_";