From 7bc1a778520f3a8477d88819faeaca8ff77859fa Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Wed, 8 Sep 2021 17:14:19 +0100 Subject: Optimize ClScaleKernel on NHWC (f32/f16/int8) The new kernel performs the computation on multiples elements. The OpenCL kernel has been re-implemented using the new TILE macros Resolves COMPMID-4803,COMPMID-4804 Change-Id: Iac8fead65e21b64567a05dbc4fbaa61d362443f9 Signed-off-by: Gian Marco Iodice Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6235 Tested-by: Arm Jenkins Reviewed-by: Giorgio Arena Comments-Addressed: Arm Jenkins --- src/core/CL/cl_kernels/nhwc/scale.cl | 345 +++++++++++++++++++++++------------ src/gpu/cl/kernels/ClScaleKernel.cpp | 89 +++++++-- 2 files changed, 293 insertions(+), 141 deletions(-) diff --git a/src/core/CL/cl_kernels/nhwc/scale.cl b/src/core/CL/cl_kernels/nhwc/scale.cl index 69cbbcd5f3..21579aed9f 100644 --- a/src/core/CL/cl_kernels/nhwc/scale.cl +++ b/src/core/CL/cl_kernels/nhwc/scale.cl @@ -22,154 +22,257 @@ * SOFTWARE. */ #include "helpers.h" +#include "tile_helpers.h" -#if defined(DEPTH_OUT) -/** Performs scale on an image interpolating with the NEAREAST NEIGHBOUR method. Input and output are single channel F32. (NHWC) +//! @cond Doxygen_Suppress +/** Performs scale on a tensor by interpolating with the NEAREAST NEIGHBOUR method. (NHWC) * * @note Sampling policy to used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT - * @note Output tensor's depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH=16 + * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64) + * @note The spatial dimensions of the destination tensor must be passed at compile time using -DDST_WIDTH and -DDST_HEIGHT (e.g. -DDST_WIDTH=96, -DDST_HEIGHT=64) + * @note The tensor type ("BUFFER" only is supported) of the source tensor must be passed at compile time using -DSRC_TENSOR_TYPE (e.g. -DSRC_TENSOR_TYPE=BUFFER) + * @note The tensor type ("BUFFER" only is supported) of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. -DDST_TENSOR_TYPE=BUFFER) + * @note The data type of the source tensor must be passed at compile time using -DSRC_DATA_TYPE (e.g. -DSRC_DATA_TYPE=float) + * @note The data type of the destination tensor must be passed at compile time using -DDST_DATA_TYPE (e.g. -DDST_DATA_TYPE=float) + * @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2) + * @note The border value value must be passed at compile time using -DCONSTANT_VALUE (e.g. -DCONSTANT_VALUE=0) + * @note In case of F32/F16, -DIS_FLOATING_POINT must be passed at compile time + * @note The scale value to apply on the source width must be passed at compile using -DSCALE_X (e.g., -DSCALE_X=0.5) + * @note The scale value to apply on the source height must be passed at compile using -DSCALE_Y (e.g., -DSCALE_Y=0.5) + * @note If the source tensor has more than 3 dimensions, -DBATCHED_EXECUTION must be passed at compile time * - * @param[in] in_ptr Pointer to the source image. Supported data types: U8/S16/F16/F32. - * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in_stride_z Stride of the source image in Z dimension (in bytes) - * @param[in] in_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p in_ptr - * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] out_stride_z Stride of the destination image in Z dimension (in bytes) - * @param[in] out_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S16/F16/F32. + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor */ + //! @endcond __kernel void scale_nearest_neighbour_nhwc( - TENSOR4D_DECLARATION(in), - TENSOR4D_DECLARATION(out)) + TENSOR4D(src, SRC_TENSOR_TYPE), + TENSOR4D(dst, DST_TENSOR_TYPE)) { - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT); + // All the tensor dimensions are passed at compile time. + // In case of dynamic tensor support, the following dimensions should be passed as function argument. +#define _ISRC_WIDTH SRC_WIDTH +#define _ISRC_HEIGHT SRC_HEIGHT +#define _IDST_WIDTH DST_WIDTH +#define _IDST_HEIGHT DST_HEIGHT +#define _ISCALE_X SCALE_X +#define _ISCALE_Y SCALE_Y + + const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM + const int xo = GET_SPATIAL_IDX(1, 1, 0); // WIDTH +#if defined(BATCHED_EXECUTION) + const int yo = GET_SPATIAL_IDX(2, 1, 0) % _IDST_HEIGHT; // HEIGHT + const int bout = GET_SPATIAL_IDX(2, 1, 0) / _IDST_HEIGHT; // BATCH SIZE IDX +#else // defined(BATCHED_EXECUTION) + const int yo = GET_SPATIAL_IDX(2, 1, 0); // HEIGHT + const int bout = 0; // BATCH SIZE IDX +#endif // defined(BATCHED_EXECUTION) #ifdef SAMPLING_POLICY_TOP_LEFT - float new_x = get_global_id(1) * SCALE_X; - float new_y = (get_global_id(2) % DEPTH_OUT) * SCALE_Y; + float xi_f = (xo * (float)SCALE_X); + float yi_f = (yo * (float)SCALE_Y); #elif SAMPLING_POLICY_CENTER - float new_x = (get_global_id(1) + 0.5f) * SCALE_X; - float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * SCALE_Y; -#else /* SAMPLING_POLICY */ + float xi_f = ((xo + 0.5f) * (float)SCALE_X); + float yi_f = ((yo + 0.5f) * (float)SCALE_Y); +#else // SAMPLING_POLICY #error("Unsupported sampling policy"); -#endif /* SAMPLING_POLICY */ +#endif // SAMPLING_POLICY + #ifdef ALIGN_CORNERS - new_x = round(new_x); - new_y = round(new_y); -#endif /* ALIGN_CORNERS */ - const float clamped_x = clamp(new_x, 0.0f, (float)SRC_WIDTH - 1); - const float clamped_y = clamp(new_y, 0.0f, (float)SRC_HEIGHT - 1); + xi_f = round(xi_f); + yi_f = round(yi_f); +#endif // ALIGN_CORNERS + + const int xi0 = clamp((int)xi_f, 0, _ISRC_WIDTH - 1); + const int yi0 = clamp((int)yi_f, 0, _ISRC_HEIGHT - 1); + + TILE(SRC_DATA_TYPE, 1, N0, in00); + + T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi0, xi0, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, false, in00); + + TILE(uint, 1, 1, dst_indirect_y); - *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))); + // Calculate the destination indirect Y + dst_indirect_y[0].v = xo + (yo * (int)(_IDST_WIDTH)) + bout * (int)(_IDST_WIDTH * _IDST_HEIGHT); + + bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0; + + T_STORE_INDIRECT_WIDTH_SELECT(DST_DATA_TYPE, 1, N0, PARTIAL_N0, DST_TENSOR_TYPE, dst, cout, dst_stride_y, x_cond, in00, dst_indirect_y); } -/** Performs scale on an image interpolating with the BILINEAR method. (NHWC) +//! @cond Doxygen_Suppress +/** Performs scale on a tensor by interpolating with the BILINEAR method. (NHWC) * - * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT * @note If border mode replicate is used, is should be passed as -DBORDER_MODE_REPLICATE - * @note Output tensor's depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH=16 - * @note The value to be used at the edges of the images shoud be given as a preprocessor argument using -DCONSTANT_VALUE=value. + * @note Sampling policy to used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT + * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64) + * @note The spatial dimensions of the destination tensor must be passed at compile time using -DDST_WIDTH and -DDST_HEIGHT (e.g. -DDST_WIDTH=96, -DDST_HEIGHT=64) + * @note The tensor type ("BUFFER" only is supported) of the source tensor must be passed at compile time using -DSRC_TENSOR_TYPE (e.g. -DSRC_TENSOR_TYPE=BUFFER) + * @note The tensor type ("BUFFER" only is supported) of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. -DDST_TENSOR_TYPE=BUFFER) + * @note The data type of the source tensor must be passed at compile time using -DSRC_DATA_TYPE (e.g. -DSRC_DATA_TYPE=float) + * @note The data type of the destination tensor must be passed at compile time using -DDST_DATA_TYPE (e.g. -DDST_DATA_TYPE=float) + * @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2) + * @note The border value value must be passed at compile time using -DCONSTANT_VALUE (e.g. -DCONSTANT_VALUE=0) + * @note In case of F32/F16, -DIS_FLOATING_POINT must be passed at compile time + * @note The scale value to apply on the source width must be passed at compile using -DSCALE_X (e.g., -DSCALE_X=0.5) + * @note The scale value to apply on the source height must be passed at compile using -DSCALE_Y (e.g., -DSCALE_Y=0.5) + * @note If the source tensor has more than 3 dimensions, -DBATCHED_EXECUTION must be passed at compile time * - * @param[in] in_ptr Pointer to the source image. Supported data types: U8/S16/F16/F32. - * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in_stride_z Stride of the source image in Z dimension (in bytes) - * @param[in] in_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p in_ptr - * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] out_stride_z Stride of the destination image in Z dimension (in bytes) - * @param[in] out_step_z dst_stride_y * number of elements along Z processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image + * @note In case of QASYMM8, the following extra information must be passed at compile time: + * - The source offset e.g. -DOFFSET=4 + * - The source scale e.g. -DSCALE=4 * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S16/F16/F32. + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor */ + //! @endcond __kernel void scale_bilinear_nhwc( - TENSOR4D_DECLARATION(in), - TENSOR4D_DECLARATION(out)) + TENSOR4D(src, SRC_TENSOR_TYPE), + TENSOR4D(dst, DST_TENSOR_TYPE)) { - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT); + // All the tensor dimensions are passed at compile time. + // In case of dynamic tensor support, the following dimensions should be passed as function argument. +#define _ISRC_WIDTH SRC_WIDTH +#define _ISRC_HEIGHT SRC_HEIGHT +#define _IDST_WIDTH DST_WIDTH +#define _IDST_HEIGHT DST_HEIGHT +#define _ISCALE_X SCALE_X +#define _ISCALE_Y SCALE_Y + + const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM + const int xo = GET_SPATIAL_IDX(1, 1, 0); // WIDTH +#if defined(BATCHED_EXECUTION) + const int yo = GET_SPATIAL_IDX(2, 1, 0) % _IDST_HEIGHT; // HEIGHT + const int bout = GET_SPATIAL_IDX(2, 1, 0) / _IDST_HEIGHT; // BATCH SIZE IDX +#else // defined(BATCHED_EXECUTION) + const int yo = GET_SPATIAL_IDX(2, 1, 0); // HEIGHT + const int bout = 0; // BATCH SIZE IDX +#endif // defined(BATCHED_EXECUTION) #ifdef SAMPLING_POLICY_TOP_LEFT - const float new_x = get_global_id(1) * SCALE_X; - const float new_y = (get_global_id(2) % DEPTH_OUT) * SCALE_Y; + float xi_f = (xo * (float)SCALE_X); + float yi_f = (yo * (float)SCALE_Y); #elif SAMPLING_POLICY_CENTER - const float new_x = (get_global_id(1) + 0.5f) * SCALE_X - 0.5f; - const float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * SCALE_Y - 0.5f; -#else /* SAMPLING_POLICY */ + float xi_f = ((xo + 0.5f) * (float)SCALE_X - 0.5f); + float yi_f = ((yo + 0.5f) * (float)SCALE_Y - 0.5f); +#else // SAMPLING_POLICY #error("Unsupported sampling policy"); -#endif /* SAMPLING_POLICY */ +#endif // SAMPLING_POLICY + + const int xi = (int)floor(xi_f); + const int yi = (int)floor(yi_f); - const float new_xf = floor(new_x); - const float new_yf = floor(new_y); - const float clamped_x = clamp(new_xf, 0.0f, SRC_WIDTH - 1.f); - const float clamped_x1 = clamp(new_xf + 1, 0.0f, SRC_WIDTH - 1.f); - const float clamped_y = clamp(new_yf, 0.0f, SRC_HEIGHT - 1.f); - const float clamped_y1 = clamp(new_yf + 1, 0.0f, SRC_HEIGHT - 1.f); + TILE(SRC_DATA_TYPE, 1, N0, in00); + TILE(SRC_DATA_TYPE, 1, N0, in01); + TILE(SRC_DATA_TYPE, 1, N0, in10); + TILE(SRC_DATA_TYPE, 1, N0, in11); -#if defined(OFFSET) && defined(SCALE) -#define IN_DATA_TYPE int -#else // defined(OFFSET) && defined(SCALE) -#define IN_DATA_TYPE float -#endif // defined(OFFSET) && defined(SCALE) + // Initialize the tiles to CONSTANT_VALUE + in00[0].v = CONSTANT_VALUE; + in01[0].v = CONSTANT_VALUE; + in10[0].v = CONSTANT_VALUE; + in11[0].v = CONSTANT_VALUE; #ifndef BORDER_MODE_REPLICATE - const bool check_x = (0.f <= new_xf && new_xf < (float)SRC_WIDTH); - const bool check_x1 = (-1.f <= new_xf && new_xf < SRC_WIDTH - 1.f); - const bool check_y = (0.f <= new_yf && new_yf < (float)SRC_HEIGHT); - const bool check_y1 = (-1.f <= new_yf && new_yf < SRC_HEIGHT - 1.f); - - const IN_DATA_TYPE ins_0 = select((IN_DATA_TYPE)(CONSTANT_VALUE), (IN_DATA_TYPE)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), - (get_global_id(2) / DEPTH_OUT)))), - check_x && check_y); - const IN_DATA_TYPE ins_1 = select((IN_DATA_TYPE)(CONSTANT_VALUE), (IN_DATA_TYPE)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y), - (get_global_id(2) / DEPTH_OUT)))), - check_x1 && check_y); - const IN_DATA_TYPE ins_2 = select((IN_DATA_TYPE)(CONSTANT_VALUE), (IN_DATA_TYPE)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1), - (get_global_id(2) / DEPTH_OUT)))), - check_x && check_y1); - const IN_DATA_TYPE ins_3 = select((IN_DATA_TYPE)(CONSTANT_VALUE), (IN_DATA_TYPE)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), - (get_global_id(2) / DEPTH_OUT)))), - check_x1 && check_y1); - VEC_DATA_TYPE(IN_DATA_TYPE, 4) - ins = (VEC_DATA_TYPE(IN_DATA_TYPE, 4))(ins_0, ins_1, ins_2, ins_3); -#else /* BORDER_MODE_REPLICATE */ - VEC_DATA_TYPE(IN_DATA_TYPE, 4) - ins = (VEC_DATA_TYPE(IN_DATA_TYPE, 4))(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))), - *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))), - *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))), - *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT)))); -#endif /* BORDER_MODE_REPLICATE */ - - const float a = new_x - new_xf; - const float b = 1.f - a; - const float a1 = new_y - new_yf; - const float b1 = 1.f - a1; - -#if defined(OFFSET) && defined(SCALE) - const float4 insf32 = convert_float4(ins - (int4)OFFSET) * (float4)SCALE; - const float fr = ((insf32.s0 * b * b1) + (insf32.s1 * a * b1) + (insf32.s2 * b * a1) + (insf32.s3 * a * a1)); - DATA_TYPE res = CONVERT_SAT(convert_int_sat_rtp(fr / SCALE) + OFFSET, DATA_TYPE); - - *((__global DATA_TYPE *)out.ptr) = res; -#else // defined(OFFSET) && defined(SCALE) - const float fr = ((ins.s0 * b * b1) + (ins.s1 * a * b1) + (ins.s2 * b * a1) + (ins.s3 * a * a1)); - - *((__global DATA_TYPE *)out.ptr) = CONVERT(fr, DATA_TYPE); -#endif // defined(OFFSET) && defined(SCALE) -} -#endif /* defined(DEPTH_OUT) */ \ No newline at end of file + T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi, xi, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, true, in00); + T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi, xi + 1, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, true, in01); + T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi + 1, xi, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, true, in10); + T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi + 1, xi + 1, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, true, in11); +#else // BORDER_MODE_REPLICATE + const int xi0 = clamp(xi, 0, _ISRC_WIDTH - 1); + const int yi0 = clamp(yi, 0, _ISRC_HEIGHT - 1); + const int xi1 = clamp(xi + 1, 0, _ISRC_WIDTH - 1); + const int yi1 = clamp(yi + 1, 0, _ISRC_HEIGHT - 1); + + T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi0, xi0, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, false, in00); + T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi0, xi1, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, false, in01); + T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi1, xi0, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, false, in10); + T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi1, xi1, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, false, in11); +#endif // BORDER_MODE_REPLICATE + + TILE(DST_DATA_TYPE, 1, N0, out); + +#if defined(IS_FLOATING_POINT) + const SRC_DATA_TYPE a = (SRC_DATA_TYPE)(xi_f - (float)xi); + const SRC_DATA_TYPE b = (SRC_DATA_TYPE)(1.f - a); + const SRC_DATA_TYPE a1 = (SRC_DATA_TYPE)(yi_f - (float)yi); + const SRC_DATA_TYPE b1 = (SRC_DATA_TYPE)(1.f - a1); + + // Calculate the output + out[0].v = ((in00[0].v * b * b1) + (in01[0].v * a * b1) + (in10[0].v * b * a1) + (in11[0].v * a * a1)); +#else // defined(IS_FLOATING_POINT) + TILE(float, 1, N0, out_f); + TILE(float, 1, N0, in00_f); + TILE(float, 1, N0, in01_f); + TILE(float, 1, N0, in10_f); + TILE(float, 1, N0, in11_f); + + const float a = (xi_f - (float)xi); + const float b = (1.f - a); + const float a1 = (yi_f - (float)yi); + const float b1 = (1.f - a1); + + // Dequantize + LOOP_UNROLLING(int, n0, 0, 1, N0, + { + in00_f[0].s[n0] = ((float)in00[0].s[n0] - (float)OFFSET) * (float)SCALE; + in01_f[0].s[n0] = ((float)in01[0].s[n0] - (float)OFFSET) * (float)SCALE; + in10_f[0].s[n0] = ((float)in10[0].s[n0] - (float)OFFSET) * (float)SCALE; + in11_f[0].s[n0] = ((float)in11[0].s[n0] - (float)OFFSET) * (float)SCALE; + }) + + // Calculate the output in the floating-point domain + out_f[0].v = ((in00_f[0].v * b * b1) + (in01_f[0].v * a * b1) + (in10_f[0].v * b * a1) + (in11_f[0].v * a * a1)); + + // Quantize + LOOP_UNROLLING(int, n0, 0, 1, N0, + { + out[0].s[n0] = CONVERT_SAT(out_f[0].s[n0] / (float)SCALE + (float)OFFSET, DST_DATA_TYPE); + }) +#endif // defined(IS_FLOATING_POINT) + + TILE(uint, 1, 1, dst_indirect_y); + + // Calculate the destination indirect Y + dst_indirect_y[0].v = xo + (yo * (int)(_IDST_WIDTH)) + bout * (int)(_IDST_WIDTH * _IDST_HEIGHT); + + bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0; + + T_STORE_INDIRECT_WIDTH_SELECT(DST_DATA_TYPE, 1, N0, PARTIAL_N0, DST_TENSOR_TYPE, dst, cout, dst_stride_y, x_cond, out, dst_indirect_y); +} \ No newline at end of file diff --git a/src/gpu/cl/kernels/ClScaleKernel.cpp b/src/gpu/cl/kernels/ClScaleKernel.cpp index 9307f7d4fb..d63c0e1754 100644 --- a/src/gpu/cl/kernels/ClScaleKernel.cpp +++ b/src/gpu/cl/kernels/ClScaleKernel.cpp @@ -114,33 +114,82 @@ void ClScaleKernel::configure(const CLCompileContext &compile_context, ITensorIn // Create kernel const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); + const int idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); const unsigned int src_width = src->dimension(idx_width); const unsigned int src_height = src->dimension(idx_height); + const unsigned int src_channel = src->dimension(idx_channel); const unsigned int dst_width = dst->dimension(idx_width); - const unsigned int vec_size = adjust_vec_size(is_nhwc ? 1 : 4, dst_width); - const unsigned int vec_size_leftover = (dst_width % vec_size); + const unsigned int dst_height = dst->dimension(idx_height); + const unsigned int dst_channels = dst->dimension(idx_channel); + unsigned int vec_size = 0; + unsigned int vec_size_leftover = 0; CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); - build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(info.constant_border_value, src->data_type())); - build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src_width)); - build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src_height)); - build_opts.add_option("-DSCALE_X=" + float_to_string_with_full_precision(scale_x)); - build_opts.add_option("-DSCALE_Y=" + float_to_string_with_full_precision(scale_y)); - - build_opts.add_option_if(info.border_mode == BorderMode::REPLICATE, "-DBORDER_MODE_REPLICATE"); - build_opts.add_option_if(info.border_mode == BorderMode::CONSTANT, "-DBORDER_MODE_CONSTANT"); - build_opts.add_option_if(!is_nhwc, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size)); - build_opts.add_option_if(!is_nhwc, "-DVEC_SIZE_LEFTOVER=" + ((vec_size_leftover == 0) ? support::cpp11::to_string(vec_size) : support::cpp11::to_string(vec_size_leftover))); - build_opts.add_option_if(is_nhwc, "-DDEPTH_OUT=" + support::cpp11::to_string(dst->dimension(2))); - build_opts.add_option_if_else(info.sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER", "-DSAMPLING_POLICY_TOP_LEFT"); - build_opts.add_option_if(info.align_corners, "-DALIGN_CORNERS"); - if(is_qasymm_bilinear) + if(_data_layout == DataLayout::NHWC) { - const UniformQuantizationInfo qinfo = src->quantization_info().uniform(); - build_opts.add_option("-DSCALE=" + support::cpp11::to_string(qinfo.scale)); - build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(qinfo.offset)); + vec_size = adjust_vec_size(src->data_type() == DataType::F32 ? 4 : 8, dst_channels); + vec_size_leftover = dst_channels % vec_size; + build_opts.add_option("-DSRC_TENSOR_TYPE=BUFFER"); + build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src_width)); + build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src_height)); + build_opts.add_option("-DSRC_CHANNELS=" + support::cpp11::to_string(src_channel)); + build_opts.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); + build_opts.add_option("-DDST_TENSOR_TYPE=BUFFER"); + build_opts.add_option("-DDST_WIDTH=" + support::cpp11::to_string(dst_width)); + build_opts.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst_height)); + build_opts.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst_channels)); + build_opts.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); + build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(info.constant_border_value, src->data_type())); + build_opts.add_option("-DSCALE_X=" + float_to_string_with_full_precision(scale_x)); + build_opts.add_option("-DSCALE_Y=" + float_to_string_with_full_precision(scale_y)); + build_opts.add_option("-DN0=" + support::cpp11::to_string(vec_size)); + build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(vec_size_leftover)); + build_opts.add_option_if(src->num_dimensions() > 3, "-DBATCHED_EXECUTION"); + build_opts.add_option_if(info.border_mode == BorderMode::REPLICATE, "-DBORDER_MODE_REPLICATE"); + build_opts.add_option_if(info.border_mode == BorderMode::CONSTANT, "-DBORDER_MODE_CONSTANT"); + build_opts.add_option_if(info.align_corners, "-DALIGN_CORNERS"); + build_opts.add_option_if(is_data_type_float(src->data_type()), "-DIS_FLOATING_POINT"); + build_opts.add_option_if_else(info.sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER", "-DSAMPLING_POLICY_TOP_LEFT"); + if(is_qasymm_bilinear) + { + const UniformQuantizationInfo qinfo = src->quantization_info().uniform(); + build_opts.add_option("-DSCALE=" + support::cpp11::to_string(qinfo.scale)); + build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(qinfo.offset)); + } + else + { + build_opts.add_option("-DSCALE=" + support::cpp11::to_string(1)); + build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(0)); + } } + else if(_data_layout == DataLayout::NCHW) + { + vec_size = adjust_vec_size(4, dst_width); + vec_size_leftover = dst_width % vec_size; + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); + build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(info.constant_border_value, src->data_type())); + build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src_width)); + build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src_height)); + build_opts.add_option("-DSCALE_X=" + float_to_string_with_full_precision(scale_x)); + build_opts.add_option("-DSCALE_Y=" + float_to_string_with_full_precision(scale_y)); + build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + ((vec_size_leftover == 0) ? support::cpp11::to_string(vec_size) : support::cpp11::to_string(vec_size_leftover))); + build_opts.add_option_if(info.border_mode == BorderMode::REPLICATE, "-DBORDER_MODE_REPLICATE"); + build_opts.add_option_if(info.border_mode == BorderMode::CONSTANT, "-DBORDER_MODE_CONSTANT"); + build_opts.add_option_if(info.align_corners, "-DALIGN_CORNERS"); + build_opts.add_option_if_else(info.sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER", "-DSAMPLING_POLICY_TOP_LEFT"); + if(is_qasymm_bilinear) + { + const UniformQuantizationInfo qinfo = src->quantization_info().uniform(); + build_opts.add_option("-DSCALE=" + support::cpp11::to_string(qinfo.scale)); + build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(qinfo.offset)); + } + } + else + { + ARM_COMPUTE_ERROR_ON("Unsupported data layout"); + } + std::string interpolation_name = string_from_interpolation_policy(interpolation_policy_to_use); std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower); std::string kernel_name = "scale_" + interpolation_name + "_"; -- cgit v1.2.1