From 17975a61c5d7cbdc37c11d38e23eab8afa43f27c Mon Sep 17 00:00:00 2001 From: Adnan AlSinan Date: Mon, 8 Nov 2021 17:46:39 +0000 Subject: Improve start-up time for ClScale - Add macro guard for different kernels in scale.cl - Rework TENSOR4D to the new format - Pass scale_x and scale_y at runtime Resolves COMPMID-4886 Signed-off-by: Adnan AlSinan Change-Id: Ib904a703d511fb8260618057ac92e5ea9efeee2b Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6619 Reviewed-by: Gian Marco Iodice Tested-by: Arm Jenkins Comments-Addressed: Arm Jenkins --- src/core/CL/ICLKernel.cpp | 27 +++++ src/core/CL/ICLKernel.h | 18 ++++ src/core/CL/cl_kernels/nhwc/scale.cl | 196 +++++++++++++++------------------- src/core/CL/cl_kernels/tile_helpers.h | 26 +++++ src/gpu/cl/kernels/ClScaleKernel.cpp | 22 ++-- 5 files changed, 170 insertions(+), 119 deletions(-) diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp index 9ba17d0e03..eb750cbd34 100644 --- a/src/core/CL/ICLKernel.cpp +++ b/src/core/CL/ICLKernel.cpp @@ -116,6 +116,33 @@ void ICLKernel::add_tensor_argument(unsigned &idx, const ICLTensor *tensor, cons ARM_COMPUTE_UNUSED(idx_start); } +void ICLKernel::add_4d_tensor_nhwc_argument(unsigned int &idx, const ICLTensor *tensor) +{ + ARM_COMPUTE_ERROR_ON(tensor == nullptr); + + const ITensorInfo *info = tensor->info(); + ARM_COMPUTE_ERROR_ON(info == nullptr); + const Strides &strides = info->strides_in_bytes(); + + // Tensor poniter + _kernel.setArg(idx++, tensor->cl_buffer()); + + // Add stride_y, stride_z and stride_w + _kernel.setArg(idx++, strides[1]); + _kernel.setArg(idx++, strides[2]); + _kernel.setArg(idx++, strides[3]); + + // Tensor dimensions + _kernel.setArg(idx++, info->dimension(0)); + _kernel.setArg(idx++, info->dimension(1)); + _kernel.setArg(idx++, info->dimension(2)); + _kernel.setArg(idx++, info->dimension(3)); + + // Offset of first element + unsigned int offset_first_element = info->offset_first_element_in_bytes(); + _kernel.setArg(idx++, offset_first_element); +} + #ifndef DOXYGEN_SKIP_THIS template void ICLKernel::add_tensor_argument<1>(unsigned &idx, const ICLTensor *tensor, const Window &window); template void ICLKernel::add_tensor_argument<2>(unsigned &idx, const ICLTensor *tensor, const Window &window); diff --git a/src/core/CL/ICLKernel.h b/src/core/CL/ICLKernel.h index 3b3217d1d8..a7c979ef45 100644 --- a/src/core/CL/ICLKernel.h +++ b/src/core/CL/ICLKernel.h @@ -225,6 +225,24 @@ public: { add_tensor_argument<4>(idx, tensor, window); } + + /** Add the passed NHWC 4D tensor's parameters to the object's kernel's arguments by passing strides, dimensions and the offset to the first valid element in bytes. + * + * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set. + * @param[in] tensor Tensor to set as an argument of the object's kernel. + */ + void add_4d_tensor_nhwc_argument(unsigned int &idx, const ICLTensor *tensor); + + /** Returns the number of arguments enqueued per NHWC 4D Tensor object. + * + * @return The number of arguments enqueued per NHWC 4D Tensor object. + */ + constexpr static unsigned int num_arguments_per_4d_tensor_nhwc() + { + constexpr unsigned int no_args_per_4d_tensor_nhwc = 9u; + return no_args_per_4d_tensor_nhwc; + } + /** Returns the number of arguments enqueued per 1D array object. * * @return The number of arguments enqueues per 1D array object. diff --git a/src/core/CL/cl_kernels/nhwc/scale.cl b/src/core/CL/cl_kernels/nhwc/scale.cl index 21579aed9f..bccfd6543a 100644 --- a/src/core/CL/cl_kernels/nhwc/scale.cl +++ b/src/core/CL/cl_kernels/nhwc/scale.cl @@ -24,12 +24,11 @@ #include "helpers.h" #include "tile_helpers.h" +#if defined(SCALE_NEAREST_NEIGHBOUR) //! @cond Doxygen_Suppress /** Performs scale on a tensor by interpolating with the NEAREAST NEIGHBOUR method. (NHWC) * * @note Sampling policy to used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT - * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64) - * @note The spatial dimensions of the destination tensor must be passed at compile time using -DDST_WIDTH and -DDST_HEIGHT (e.g. -DDST_WIDTH=96, -DDST_HEIGHT=64) * @note The tensor type ("BUFFER" only is supported) of the source tensor must be passed at compile time using -DSRC_TENSOR_TYPE (e.g. -DSRC_TENSOR_TYPE=BUFFER) * @note The tensor type ("BUFFER" only is supported) of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. -DDST_TENSOR_TYPE=BUFFER) * @note The data type of the source tensor must be passed at compile time using -DSRC_DATA_TYPE (e.g. -DSRC_DATA_TYPE=float) @@ -37,61 +36,52 @@ * @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2) * @note The border value value must be passed at compile time using -DCONSTANT_VALUE (e.g. -DCONSTANT_VALUE=0) * @note In case of F32/F16, -DIS_FLOATING_POINT must be passed at compile time - * @note The scale value to apply on the source width must be passed at compile using -DSCALE_X (e.g., -DSCALE_X=0.5) - * @note The scale value to apply on the source height must be passed at compile using -DSCALE_Y (e.g., -DSCALE_Y=0.5) * @note If the source tensor has more than 3 dimensions, -DBATCHED_EXECUTION must be passed at compile time * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S16/F16/F32. - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S16/F16/F32. + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_c The size of the channels dimension of the source tensor + * @param[in] src_w The size of the width dimension of the source tensor + * @param[in] src_h The size of the height dimension of the source tensor + * @param[in] src_n The size of the batches dimension of the source tensor + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: U8/S16/F16/F32. + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] dst_c The size of the channels dimension of the destination tensor + * @param[in] dst_w The size of the width dimension of the destination tensor + * @param[in] dst_h The size of the height dimension of the destination tensor + * @param[in] dst_n The size of the batches dimension of the destination tensor + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] scale_x The scale value to apply on the source width + * @param[in] scale_y The scale value to apply on the source height */ - //! @endcond +//! @endcond __kernel void scale_nearest_neighbour_nhwc( - TENSOR4D(src, SRC_TENSOR_TYPE), - TENSOR4D(dst, DST_TENSOR_TYPE)) + TENSOR4D_T(src, SRC_TENSOR_TYPE), + TENSOR4D_T(dst, DST_TENSOR_TYPE), + const float scale_x, + const float scale_y) { - // All the tensor dimensions are passed at compile time. - // In case of dynamic tensor support, the following dimensions should be passed as function argument. -#define _ISRC_WIDTH SRC_WIDTH -#define _ISRC_HEIGHT SRC_HEIGHT -#define _IDST_WIDTH DST_WIDTH -#define _IDST_HEIGHT DST_HEIGHT -#define _ISCALE_X SCALE_X -#define _ISCALE_Y SCALE_Y - const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM const int xo = GET_SPATIAL_IDX(1, 1, 0); // WIDTH #if defined(BATCHED_EXECUTION) - const int yo = GET_SPATIAL_IDX(2, 1, 0) % _IDST_HEIGHT; // HEIGHT - const int bout = GET_SPATIAL_IDX(2, 1, 0) / _IDST_HEIGHT; // BATCH SIZE IDX -#else // defined(BATCHED_EXECUTION) + const int yo = GET_SPATIAL_IDX(2, 1, 0) % dst_h; // HEIGHT + const int bout = GET_SPATIAL_IDX(2, 1, 0) / dst_h; // BATCH SIZE IDX +#else // defined(BATCHED_EXECUTION) const int yo = GET_SPATIAL_IDX(2, 1, 0); // HEIGHT - const int bout = 0; // BATCH SIZE IDX -#endif // defined(BATCHED_EXECUTION) + const int bout = 0; // BATCH SIZE IDX +#endif // defined(BATCHED_EXECUTION) #ifdef SAMPLING_POLICY_TOP_LEFT - float xi_f = (xo * (float)SCALE_X); - float yi_f = (yo * (float)SCALE_Y); + float xi_f = (xo * scale_x); + float yi_f = (yo * scale_y); #elif SAMPLING_POLICY_CENTER - float xi_f = ((xo + 0.5f) * (float)SCALE_X); - float yi_f = ((yo + 0.5f) * (float)SCALE_Y); + float xi_f = ((xo + 0.5f) * scale_x); + float yi_f = ((yo + 0.5f) * scale_y); #else // SAMPLING_POLICY #error("Unsupported sampling policy"); #endif // SAMPLING_POLICY @@ -101,30 +91,30 @@ __kernel void scale_nearest_neighbour_nhwc( yi_f = round(yi_f); #endif // ALIGN_CORNERS - const int xi0 = clamp((int)xi_f, 0, _ISRC_WIDTH - 1); - const int yi0 = clamp((int)yi_f, 0, _ISRC_HEIGHT - 1); + const int xi0 = clamp((int)xi_f, 0, (int)src_w - 1); + const int yi0 = clamp((int)yi_f, 0, (int)src_h - 1); TILE(SRC_DATA_TYPE, 1, N0, in00); - T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi0, xi0, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, false, in00); + T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi0, xi0, cout, src_w, src_h, 1, 1, false, in00); TILE(uint, 1, 1, dst_indirect_y); // Calculate the destination indirect Y - dst_indirect_y[0].v = xo + (yo * (int)(_IDST_WIDTH)) + bout * (int)(_IDST_WIDTH * _IDST_HEIGHT); + dst_indirect_y[0].v = xo + (yo * (int)(dst_w)) + bout * (int)(dst_w * dst_h); bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0; T_STORE_INDIRECT_WIDTH_SELECT(DST_DATA_TYPE, 1, N0, PARTIAL_N0, DST_TENSOR_TYPE, dst, cout, dst_stride_y, x_cond, in00, dst_indirect_y); } +#endif /* SCALE_NEAREST_NEIGHBOUR */ +#if defined(SCALE_BILINEAR) //! @cond Doxygen_Suppress /** Performs scale on a tensor by interpolating with the BILINEAR method. (NHWC) * * @note If border mode replicate is used, is should be passed as -DBORDER_MODE_REPLICATE * @note Sampling policy to used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT - * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64) - * @note The spatial dimensions of the destination tensor must be passed at compile time using -DDST_WIDTH and -DDST_HEIGHT (e.g. -DDST_WIDTH=96, -DDST_HEIGHT=64) * @note The tensor type ("BUFFER" only is supported) of the source tensor must be passed at compile time using -DSRC_TENSOR_TYPE (e.g. -DSRC_TENSOR_TYPE=BUFFER) * @note The tensor type ("BUFFER" only is supported) of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. -DDST_TENSOR_TYPE=BUFFER) * @note The data type of the source tensor must be passed at compile time using -DSRC_DATA_TYPE (e.g. -DSRC_DATA_TYPE=float) @@ -132,65 +122,56 @@ __kernel void scale_nearest_neighbour_nhwc( * @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2) * @note The border value value must be passed at compile time using -DCONSTANT_VALUE (e.g. -DCONSTANT_VALUE=0) * @note In case of F32/F16, -DIS_FLOATING_POINT must be passed at compile time - * @note The scale value to apply on the source width must be passed at compile using -DSCALE_X (e.g., -DSCALE_X=0.5) - * @note The scale value to apply on the source height must be passed at compile using -DSCALE_Y (e.g., -DSCALE_Y=0.5) * @note If the source tensor has more than 3 dimensions, -DBATCHED_EXECUTION must be passed at compile time * * @note In case of QASYMM8, the following extra information must be passed at compile time: * - The source offset e.g. -DOFFSET=4 * - The source scale e.g. -DSCALE=4 * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S16/F16/F32. - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S16/F16/F32. + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_c The size of the channels dimension of the source tensor + * @param[in] src_w The size of the width dimension of the source tensor + * @param[in] src_h The size of the height dimension of the source tensor + * @param[in] src_n The size of the batches dimension of the source tensor + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: U8/S16/F16/F32. + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] dst_c The size of the channels dimension of the destination tensor + * @param[in] dst_w The size of the width dimension of the destination tensor + * @param[in] dst_h The size of the height dimension of the destination tensor + * @param[in] dst_n The size of the batches dimension of the destination tensor + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] scale_x The scale value to apply on the source width + * @param[in] scale_y The scale value to apply on the source height */ - //! @endcond +//! @endcond __kernel void scale_bilinear_nhwc( - TENSOR4D(src, SRC_TENSOR_TYPE), - TENSOR4D(dst, DST_TENSOR_TYPE)) + TENSOR4D_T(src, SRC_TENSOR_TYPE), + TENSOR4D_T(dst, DST_TENSOR_TYPE), + const float scale_x, + const float scale_y) { - // All the tensor dimensions are passed at compile time. - // In case of dynamic tensor support, the following dimensions should be passed as function argument. -#define _ISRC_WIDTH SRC_WIDTH -#define _ISRC_HEIGHT SRC_HEIGHT -#define _IDST_WIDTH DST_WIDTH -#define _IDST_HEIGHT DST_HEIGHT -#define _ISCALE_X SCALE_X -#define _ISCALE_Y SCALE_Y - const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM const int xo = GET_SPATIAL_IDX(1, 1, 0); // WIDTH #if defined(BATCHED_EXECUTION) - const int yo = GET_SPATIAL_IDX(2, 1, 0) % _IDST_HEIGHT; // HEIGHT - const int bout = GET_SPATIAL_IDX(2, 1, 0) / _IDST_HEIGHT; // BATCH SIZE IDX -#else // defined(BATCHED_EXECUTION) + const int yo = GET_SPATIAL_IDX(2, 1, 0) % dst_h; // HEIGHT + const int bout = GET_SPATIAL_IDX(2, 1, 0) / dst_h; // BATCH SIZE IDX +#else // defined(BATCHED_EXECUTION) const int yo = GET_SPATIAL_IDX(2, 1, 0); // HEIGHT const int bout = 0; // BATCH SIZE IDX -#endif // defined(BATCHED_EXECUTION) +#endif // defined(BATCHED_EXECUTION) #ifdef SAMPLING_POLICY_TOP_LEFT - float xi_f = (xo * (float)SCALE_X); - float yi_f = (yo * (float)SCALE_Y); + float xi_f = (xo * scale_x); + float yi_f = (yo * scale_y); #elif SAMPLING_POLICY_CENTER - float xi_f = ((xo + 0.5f) * (float)SCALE_X - 0.5f); - float yi_f = ((yo + 0.5f) * (float)SCALE_Y - 0.5f); + float xi_f = ((xo + 0.5f) * scale_x - 0.5f); + float yi_f = ((yo + 0.5f) * scale_y - 0.5f); #else // SAMPLING_POLICY #error("Unsupported sampling policy"); #endif // SAMPLING_POLICY @@ -210,20 +191,20 @@ __kernel void scale_bilinear_nhwc( in11[0].v = CONSTANT_VALUE; #ifndef BORDER_MODE_REPLICATE - T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi, xi, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, true, in00); - T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi, xi + 1, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, true, in01); - T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi + 1, xi, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, true, in10); - T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi + 1, xi + 1, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, true, in11); + T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi, xi, cout, src_w, src_h, 1, 1, true, in00); + T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi, xi + 1, cout, src_w, src_h, 1, 1, true, in01); + T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi + 1, xi, cout, src_w, src_h, 1, 1, true, in10); + T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi + 1, xi + 1, cout, src_w, src_h, 1, 1, true, in11); #else // BORDER_MODE_REPLICATE - const int xi0 = clamp(xi, 0, _ISRC_WIDTH - 1); - const int yi0 = clamp(yi, 0, _ISRC_HEIGHT - 1); - const int xi1 = clamp(xi + 1, 0, _ISRC_WIDTH - 1); - const int yi1 = clamp(yi + 1, 0, _ISRC_HEIGHT - 1); + const int xi0 = clamp(xi, 0, (int)src_w - 1); + const int yi0 = clamp(yi, 0, (int)src_h - 1); + const int xi1 = clamp(xi + 1, 0, (int)src_w - 1); + const int yi1 = clamp(yi + 1, 0, (int)src_h - 1); - T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi0, xi0, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, false, in00); - T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi0, xi1, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, false, in01); - T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi1, xi0, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, false, in10); - T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi1, xi1, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, false, in11); + T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi0, xi0, cout, src_w, src_h, 1, 1, false, in00); + T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi0, xi1, cout, src_w, src_h, 1, 1, false, in01); + T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi1, xi0, cout, src_w, src_h, 1, 1, false, in10); + T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi1, xi1, cout, src_w, src_h, 1, 1, false, in11); #endif // BORDER_MODE_REPLICATE TILE(DST_DATA_TYPE, 1, N0, out); @@ -270,9 +251,10 @@ __kernel void scale_bilinear_nhwc( TILE(uint, 1, 1, dst_indirect_y); // Calculate the destination indirect Y - dst_indirect_y[0].v = xo + (yo * (int)(_IDST_WIDTH)) + bout * (int)(_IDST_WIDTH * _IDST_HEIGHT); + dst_indirect_y[0].v = xo + (yo * (int)(dst_w)) + bout * (int)(dst_w * dst_h); bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0; T_STORE_INDIRECT_WIDTH_SELECT(DST_DATA_TYPE, 1, N0, PARTIAL_N0, DST_TENSOR_TYPE, dst, cout, dst_stride_y, x_cond, out, dst_indirect_y); -} \ No newline at end of file +} +#endif /* SCALE_BILINEAR */ \ No newline at end of file diff --git a/src/core/CL/cl_kernels/tile_helpers.h b/src/core/CL/cl_kernels/tile_helpers.h index f36f273e1d..cc20616867 100644 --- a/src/core/CL/cl_kernels/tile_helpers.h +++ b/src/core/CL/cl_kernels/tile_helpers.h @@ -104,6 +104,32 @@ #define TENSOR4D_STR(name, type) TENSOR4D_##type(name) #define TENSOR4D(name, type) TENSOR4D_STR(name, type) +#define TENSOR4D_T_IMAGE(name) \ + __read_only image2d_t name##_img, \ + __global uchar *name##_ptr, \ + uint name##_stride_y, \ + uint name##_stride_z, \ + uint name##_stride_w, \ + uint name##_c, \ + uint name##_w, \ + uint name##_h, \ + uint name##_n, \ + uint name##_offset_first_element_in_bytes + +#define TENSOR4D_T_BUFFER(name) \ + __global uchar *name##_ptr, \ + uint name##_stride_y, \ + uint name##_stride_z, \ + uint name##_stride_w, \ + uint name##_c, \ + uint name##_w, \ + uint name##_h, \ + uint name##_n, \ + uint name##_offset_first_element_in_bytes + +#define TENSOR4D_T_STR(name, type) TENSOR4D_T_##type(name) +#define TENSOR4D_T(name, type) TENSOR4D_T_STR(name, type) + #if !defined(UNROLL_WITH_PRAGMA) #define UNROLL_INCR(idx, step, macro) idx += (step); (macro) diff --git a/src/gpu/cl/kernels/ClScaleKernel.cpp b/src/gpu/cl/kernels/ClScaleKernel.cpp index d63c0e1754..6f16adc657 100644 --- a/src/gpu/cl/kernels/ClScaleKernel.cpp +++ b/src/gpu/cl/kernels/ClScaleKernel.cpp @@ -117,9 +117,7 @@ void ClScaleKernel::configure(const CLCompileContext &compile_context, ITensorIn const int idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); const unsigned int src_width = src->dimension(idx_width); const unsigned int src_height = src->dimension(idx_height); - const unsigned int src_channel = src->dimension(idx_channel); const unsigned int dst_width = dst->dimension(idx_width); - const unsigned int dst_height = dst->dimension(idx_height); const unsigned int dst_channels = dst->dimension(idx_channel); unsigned int vec_size = 0; unsigned int vec_size_leftover = 0; @@ -130,20 +128,13 @@ void ClScaleKernel::configure(const CLCompileContext &compile_context, ITensorIn vec_size = adjust_vec_size(src->data_type() == DataType::F32 ? 4 : 8, dst_channels); vec_size_leftover = dst_channels % vec_size; build_opts.add_option("-DSRC_TENSOR_TYPE=BUFFER"); - build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src_width)); - build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src_height)); - build_opts.add_option("-DSRC_CHANNELS=" + support::cpp11::to_string(src_channel)); build_opts.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); build_opts.add_option("-DDST_TENSOR_TYPE=BUFFER"); - build_opts.add_option("-DDST_WIDTH=" + support::cpp11::to_string(dst_width)); - build_opts.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst_height)); - build_opts.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst_channels)); build_opts.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(info.constant_border_value, src->data_type())); - build_opts.add_option("-DSCALE_X=" + float_to_string_with_full_precision(scale_x)); - build_opts.add_option("-DSCALE_Y=" + float_to_string_with_full_precision(scale_y)); build_opts.add_option("-DN0=" + support::cpp11::to_string(vec_size)); build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(vec_size_leftover)); + build_opts.add_option("-DSCALE_" + string_from_interpolation_policy(interpolation_policy_to_use)); build_opts.add_option_if(src->num_dimensions() > 3, "-DBATCHED_EXECUTION"); build_opts.add_option_if(info.border_mode == BorderMode::REPLICATE, "-DBORDER_MODE_REPLICATE"); build_opts.add_option_if(info.border_mode == BorderMode::CONSTANT, "-DBORDER_MODE_CONSTANT"); @@ -203,6 +194,13 @@ void ClScaleKernel::configure(const CLCompileContext &compile_context, ITensorIn ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); + // Pass scale kernel arguments + if(is_nhwc) + { + unsigned int idx = 2 * num_arguments_per_4d_tensor_nhwc(); + _kernel.setArg(idx++, scale_x); + _kernel.setArg(idx++, scale_y); + } // Set config_id for enabling LWS tuning _config_id = "scale_"; _config_id += (info.border_mode == BorderMode::REPLICATE ? "Bord_rep" : ""); @@ -248,8 +246,8 @@ void ClScaleKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comma Window slice = collapsed.first_slice_window_4D(); unsigned int idx = 0; - add_4D_tensor_argument(idx, src, slice); - add_4D_tensor_argument(idx, dst, slice); + add_4d_tensor_nhwc_argument(idx, src); + add_4d_tensor_nhwc_argument(idx, dst); enqueue(queue, *this, slice, lws_hint()); break; } -- cgit v1.2.1