author    Adnan AlSinan <adnan.alsinan@arm.com>  2021-11-08 17:46:39 +0000
committer Adnan AlSinan <adnan.alsinan@arm.com>  2021-11-09 14:42:08 +0000
commit    17975a61c5d7cbdc37c11d38e23eab8afa43f27c (patch)
tree      2b9a82377dc90399dfb1a2e9e70e2dabb9735987
parent    a3b13b500e61a02d1e73148ccd97d054ef7c22ec (diff)
download  ComputeLibrary-17975a61c5d7cbdc37c11d38e23eab8afa43f27c.tar.gz
Improve start-up time for ClScale
- Add macro guard for different kernels in scale.cl
- Rework TENSOR4D to the new format
- Pass scale_x and scale_y at runtime

Resolves COMPMID-4886

Signed-off-by: Adnan AlSinan <adnan.alsinan@arm.com>
Change-Id: Ib904a703d511fb8260618057ac92e5ea9efeee2b
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6619
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r--  src/core/CL/ICLKernel.cpp              |  27
-rw-r--r--  src/core/CL/ICLKernel.h                |  18
-rw-r--r--  src/core/CL/cl_kernels/nhwc/scale.cl   | 196
-rw-r--r--  src/core/CL/cl_kernels/tile_helpers.h  |  26
-rw-r--r--  src/gpu/cl/kernels/ClScaleKernel.cpp   |  22
5 files changed, 170 insertions, 119 deletions
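Why this cuts start-up time: the expensive step when a kernel is first configured is the clBuildProgram() call, and every distinct set of -D build options forces a fresh build. Baking SRC_WIDTH/SRC_HEIGHT, DST_WIDTH/DST_HEIGHT, SCALE_X and SCALE_Y into the options therefore meant one compilation per tensor configuration. A minimal host-side sketch with the plain OpenCL C API (illustrative only, not the actual ComputeLibrary code; the exact option set shown is an assumption):

#include <CL/cl.h>

/* After this patch the option string is shape-independent, so one built
 * program can be cached and reused across all tensor shapes and scale
 * factors; only cheap clSetKernelArg() calls vary per configuration. */
cl_int build_scale_program(cl_program program)
{
    const char *options =
        "-DSRC_TENSOR_TYPE=BUFFER -DDST_TENSOR_TYPE=BUFFER "
        "-DSRC_DATA_TYPE=float -DDST_DATA_TYPE=float "
        "-DN0=4 -DPARTIAL_N0=0 -DSAMPLING_POLICY_TOP_LEFT "
        "-DCONSTANT_VALUE=0 -DIS_FLOATING_POINT -DSCALE_BILINEAR";
    /* Previously this string also carried -DSRC_WIDTH/-DSRC_HEIGHT,
     * -DDST_WIDTH/-DDST_HEIGHT, -DSCALE_X and -DSCALE_Y, forcing a
     * rebuild for every new configuration. */
    return clBuildProgram(program, 0, NULL, options, NULL, NULL);
}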
diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp
index 9ba17d0e03..eb750cbd34 100644
--- a/src/core/CL/ICLKernel.cpp
+++ b/src/core/CL/ICLKernel.cpp
@@ -116,6 +116,33 @@ void ICLKernel::add_tensor_argument(unsigned &idx, const ICLTensor *tensor, cons
ARM_COMPUTE_UNUSED(idx_start);
}
+void ICLKernel::add_4d_tensor_nhwc_argument(unsigned int &idx, const ICLTensor *tensor)
+{
+ ARM_COMPUTE_ERROR_ON(tensor == nullptr);
+
+ const ITensorInfo *info = tensor->info();
+ ARM_COMPUTE_ERROR_ON(info == nullptr);
+ const Strides &strides = info->strides_in_bytes();
+
+ // Tensor pointer
+ _kernel.setArg(idx++, tensor->cl_buffer());
+
+ // Add stride_y, stride_z and stride_w
+ _kernel.setArg<cl_uint>(idx++, strides[1]);
+ _kernel.setArg<cl_uint>(idx++, strides[2]);
+ _kernel.setArg<cl_uint>(idx++, strides[3]);
+
+ // Tensor dimensions
+ _kernel.setArg<cl_uint>(idx++, info->dimension(0));
+ _kernel.setArg<cl_uint>(idx++, info->dimension(1));
+ _kernel.setArg<cl_uint>(idx++, info->dimension(2));
+ _kernel.setArg<cl_uint>(idx++, info->dimension(3));
+
+ // Offset of first element
+ unsigned int offset_first_element = info->offset_first_element_in_bytes();
+ _kernel.setArg<cl_uint>(idx++, offset_first_element);
+}
+
#ifndef DOXYGEN_SKIP_THIS
template void ICLKernel::add_tensor_argument<1>(unsigned &idx, const ICLTensor *tensor, const Window &window);
template void ICLKernel::add_tensor_argument<2>(unsigned &idx, const ICLTensor *tensor, const Window &window);
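For reference, the nine arguments set by add_4d_tensor_nhwc_argument() land on the device side in exactly this order; a stand-alone OpenCL C sketch of the receiving signature (demo kernel name is invented, the parameter list matches the TENSOR4D_T_BUFFER expansion added to tile_helpers.h later in this patch):

__kernel void nhwc_argument_layout_demo(
    __global uchar *src_ptr,                /* arg 0: cl_buffer()            */
    uint src_stride_y,                      /* arg 1: strides_in_bytes()[1]  */
    uint src_stride_z,                      /* arg 2: strides_in_bytes()[2]  */
    uint src_stride_w,                      /* arg 3: strides_in_bytes()[3]  */
    uint src_c,                             /* arg 4: dimension(0), channels */
    uint src_w,                             /* arg 5: dimension(1), width    */
    uint src_h,                             /* arg 6: dimension(2), height   */
    uint src_n,                             /* arg 7: dimension(3), batches  */
    uint src_offset_first_element_in_bytes) /* arg 8: first-element offset   */
{
    /* Body intentionally empty: this only documents the argument layout
     * that add_4d_tensor_nhwc_argument() produces on the host side. */
}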
diff --git a/src/core/CL/ICLKernel.h b/src/core/CL/ICLKernel.h
index 3b3217d1d8..a7c979ef45 100644
--- a/src/core/CL/ICLKernel.h
+++ b/src/core/CL/ICLKernel.h
@@ -225,6 +225,24 @@ public:
{
add_tensor_argument<4>(idx, tensor, window);
}
+
+ /** Add the passed NHWC 4D tensor's parameters to the object's kernel's arguments by passing strides, dimensions and the offset to the first valid element in bytes.
+ *
+ * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
+ * @param[in] tensor Tensor to set as an argument of the object's kernel.
+ */
+ void add_4d_tensor_nhwc_argument(unsigned int &idx, const ICLTensor *tensor);
+
+ /** Returns the number of arguments enqueued per NHWC 4D Tensor object.
+ *
+ * @return The number of arguments enqueued per NHWC 4D Tensor object.
+ */
+ constexpr static unsigned int num_arguments_per_4d_tensor_nhwc()
+ {
+ constexpr unsigned int no_args_per_4d_tensor_nhwc = 9u;
+ return no_args_per_4d_tensor_nhwc;
+ }
+
/** Returns the number of arguments enqueued per 1D array object.
*
 * @return The number of arguments enqueued per 1D array object.
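A quick sketch of where the constant 9 comes from, and of the argument indices it implies for a kernel that takes two NHWC 4D tensors, as ClScaleKernel::configure() computes further down (enum names are invented for the sketch):

/* Mirrors num_arguments_per_4d_tensor_nhwc() above. */
enum scale_kernel_arg_indices {
    NHWC_4D_TENSOR_ARGS = 1 + 3 + 4 + 1,       /* ptr, strides y/z/w, dims c/w/h/n, offset -> 9 */
    SCALE_X_ARG_IDX = 2 * NHWC_4D_TENSOR_ARGS, /* 18: first argument after src and dst          */
    SCALE_Y_ARG_IDX = SCALE_X_ARG_IDX + 1      /* 19                                            */
};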
diff --git a/src/core/CL/cl_kernels/nhwc/scale.cl b/src/core/CL/cl_kernels/nhwc/scale.cl
index 21579aed9f..bccfd6543a 100644
--- a/src/core/CL/cl_kernels/nhwc/scale.cl
+++ b/src/core/CL/cl_kernels/nhwc/scale.cl
@@ -24,12 +24,11 @@
#include "helpers.h"
#include "tile_helpers.h"
+#if defined(SCALE_NEAREST_NEIGHBOUR)
//! @cond Doxygen_Suppress
/** Performs scale on a tensor by interpolating with the NEAREST NEIGHBOUR method. (NHWC)
*
* @note The sampling policy to use is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
- * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
- * @note The spatial dimensions of the destination tensor must be passed at compile time using -DDST_WIDTH and -DDST_HEIGHT (e.g. -DDST_WIDTH=96, -DDST_HEIGHT=64)
* @note The tensor type ("BUFFER" only is supported) of the source tensor must be passed at compile time using -DSRC_TENSOR_TYPE (e.g. -DSRC_TENSOR_TYPE=BUFFER)
* @note The tensor type ("BUFFER" only is supported) of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. -DDST_TENSOR_TYPE=BUFFER)
* @note The data type of the source tensor must be passed at compile time using -DSRC_DATA_TYPE (e.g. -DSRC_DATA_TYPE=float)
@@ -37,61 +36,52 @@
* @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2)
* @note The border value must be passed at compile time using -DCONSTANT_VALUE (e.g. -DCONSTANT_VALUE=0)
* @note In case of F32/F16, -DIS_FLOATING_POINT must be passed at compile time
- * @note The scale value to apply on the source width must be passed at compile using -DSCALE_X (e.g., -DSCALE_X=0.5)
- * @note The scale value to apply on the source height must be passed at compile using -DSCALE_Y (e.g., -DSCALE_Y=0.5)
* @note If the source tensor has more than 3 dimensions, -DBATCHED_EXECUTION must be passed at compile time
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S16/F16/F32.
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S16/F16/F32.
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_c The size of the channels dimension of the source tensor
+ * @param[in] src_w The size of the width dimension of the source tensor
+ * @param[in] src_h The size of the height dimension of the source tensor
+ * @param[in] src_n The size of the batches dimension of the source tensor
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: U8/S16/F16/F32.
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_c The size of the channels dimension of the destination tensor
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the batches dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] scale_x The scale value to apply on the source width
+ * @param[in] scale_y The scale value to apply on the source height
*/
- //! @endcond
+//! @endcond
__kernel void scale_nearest_neighbour_nhwc(
- TENSOR4D(src, SRC_TENSOR_TYPE),
- TENSOR4D(dst, DST_TENSOR_TYPE))
+ TENSOR4D_T(src, SRC_TENSOR_TYPE),
+ TENSOR4D_T(dst, DST_TENSOR_TYPE),
+ const float scale_x,
+ const float scale_y)
{
- // All the tensor dimensions are passed at compile time.
- // In case of dynamic tensor support, the following dimensions should be passed as function argument.
-#define _ISRC_WIDTH SRC_WIDTH
-#define _ISRC_HEIGHT SRC_HEIGHT
-#define _IDST_WIDTH DST_WIDTH
-#define _IDST_HEIGHT DST_HEIGHT
-#define _ISCALE_X SCALE_X
-#define _ISCALE_Y SCALE_Y
-
const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM
const int xo = GET_SPATIAL_IDX(1, 1, 0); // WIDTH
#if defined(BATCHED_EXECUTION)
- const int yo = GET_SPATIAL_IDX(2, 1, 0) % _IDST_HEIGHT; // HEIGHT
- const int bout = GET_SPATIAL_IDX(2, 1, 0) / _IDST_HEIGHT; // BATCH SIZE IDX
-#else // defined(BATCHED_EXECUTION)
+ const int yo = GET_SPATIAL_IDX(2, 1, 0) % dst_h; // HEIGHT
+ const int bout = GET_SPATIAL_IDX(2, 1, 0) / dst_h; // BATCH SIZE IDX
+#else // defined(BATCHED_EXECUTION)
const int yo = GET_SPATIAL_IDX(2, 1, 0); // HEIGHT
- const int bout = 0; // BATCH SIZE IDX
-#endif // defined(BATCHED_EXECUTION)
+ const int bout = 0; // BATCH SIZE IDX
+#endif // defined(BATCHED_EXECUTION)
#ifdef SAMPLING_POLICY_TOP_LEFT
- float xi_f = (xo * (float)SCALE_X);
- float yi_f = (yo * (float)SCALE_Y);
+ float xi_f = (xo * scale_x);
+ float yi_f = (yo * scale_y);
#elif SAMPLING_POLICY_CENTER
- float xi_f = ((xo + 0.5f) * (float)SCALE_X);
- float yi_f = ((yo + 0.5f) * (float)SCALE_Y);
+ float xi_f = ((xo + 0.5f) * scale_x);
+ float yi_f = ((yo + 0.5f) * scale_y);
#else // SAMPLING_POLICY
#error("Unsupported sampling policy");
#endif // SAMPLING_POLICY
@@ -101,30 +91,30 @@ __kernel void scale_nearest_neighbour_nhwc(
yi_f = round(yi_f);
#endif // ALIGN_CORNERS
- const int xi0 = clamp((int)xi_f, 0, _ISRC_WIDTH - 1);
- const int yi0 = clamp((int)yi_f, 0, _ISRC_HEIGHT - 1);
+ const int xi0 = clamp((int)xi_f, 0, (int)src_w - 1);
+ const int yi0 = clamp((int)yi_f, 0, (int)src_h - 1);
TILE(SRC_DATA_TYPE, 1, N0, in00);
- T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi0, xi0, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, false, in00);
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi0, xi0, cout, src_w, src_h, 1, 1, false, in00);
TILE(uint, 1, 1, dst_indirect_y);
// Calculate the destination indirect Y
- dst_indirect_y[0].v = xo + (yo * (int)(_IDST_WIDTH)) + bout * (int)(_IDST_WIDTH * _IDST_HEIGHT);
+ dst_indirect_y[0].v = xo + (yo * (int)(dst_w)) + bout * (int)(dst_w * dst_h);
bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
T_STORE_INDIRECT_WIDTH_SELECT(DST_DATA_TYPE, 1, N0, PARTIAL_N0, DST_TENSOR_TYPE, dst, cout, dst_stride_y, x_cond, in00, dst_indirect_y);
}
+#endif /* SCALE_NEAREST_NEIGHBOUR */
+#if defined(SCALE_BILINEAR)
//! @cond Doxygen_Suppress
/** Performs scale on a tensor by interpolating with the BILINEAR method. (NHWC)
*
* @note If border mode replicate is used, it should be passed as -DBORDER_MODE_REPLICATE
* @note The sampling policy to use is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
- * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
- * @note The spatial dimensions of the destination tensor must be passed at compile time using -DDST_WIDTH and -DDST_HEIGHT (e.g. -DDST_WIDTH=96, -DDST_HEIGHT=64)
* @note The tensor type ("BUFFER" only is supported) of the source tensor must be passed at compile time using -DSRC_TENSOR_TYPE (e.g. -DSRC_TENSOR_TYPE=BUFFER)
* @note The tensor type ("BUFFER" only is supported) of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. -DDST_TENSOR_TYPE=BUFFER)
* @note The data type of the source tensor must be passed at compile time using -DSRC_DATA_TYPE (e.g. -DSRC_DATA_TYPE=float)
@@ -132,65 +122,56 @@ __kernel void scale_nearest_neighbour_nhwc(
* @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2)
* @note The border value must be passed at compile time using -DCONSTANT_VALUE (e.g. -DCONSTANT_VALUE=0)
* @note In case of F32/F16, -DIS_FLOATING_POINT must be passed at compile time
- * @note The scale value to apply on the source width must be passed at compile using -DSCALE_X (e.g., -DSCALE_X=0.5)
- * @note The scale value to apply on the source height must be passed at compile using -DSCALE_Y (e.g., -DSCALE_Y=0.5)
* @note If the source tensor has more than 3 dimensions, -DBATCHED_EXECUTION must be passed at compile time
*
* @note In case of QASYMM8, the following extra information must be passed at compile time:
* - The source offset e.g. -DOFFSET=4
* - The source scale e.g. -DSCALE=4
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S16/F16/F32.
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S16/F16/F32.
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_c The size of the channels dimension of the source tensor
+ * @param[in] src_w The size of the width dimension of the source tensor
+ * @param[in] src_h The size of the height dimension of the source tensor
+ * @param[in] src_n The size of the batches dimension of the source tensor
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: U8/S16/F16/F32.
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_c The size of the channels dimension of the destination tensor
+ * @param[in] dst_w The size of the width dimension of the destination tensor
+ * @param[in] dst_h The size of the height dimension of the destination tensor
+ * @param[in] dst_n The size of the batches dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] scale_x The scale value to apply on the source width
+ * @param[in] scale_y The scale value to apply on the source height
*/
- //! @endcond
+//! @endcond
__kernel void scale_bilinear_nhwc(
- TENSOR4D(src, SRC_TENSOR_TYPE),
- TENSOR4D(dst, DST_TENSOR_TYPE))
+ TENSOR4D_T(src, SRC_TENSOR_TYPE),
+ TENSOR4D_T(dst, DST_TENSOR_TYPE),
+ const float scale_x,
+ const float scale_y)
{
- // All the tensor dimensions are passed at compile time.
- // In case of dynamic tensor support, the following dimensions should be passed as function argument.
-#define _ISRC_WIDTH SRC_WIDTH
-#define _ISRC_HEIGHT SRC_HEIGHT
-#define _IDST_WIDTH DST_WIDTH
-#define _IDST_HEIGHT DST_HEIGHT
-#define _ISCALE_X SCALE_X
-#define _ISCALE_Y SCALE_Y
-
const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM
const int xo = GET_SPATIAL_IDX(1, 1, 0); // WIDTH
#if defined(BATCHED_EXECUTION)
- const int yo = GET_SPATIAL_IDX(2, 1, 0) % _IDST_HEIGHT; // HEIGHT
- const int bout = GET_SPATIAL_IDX(2, 1, 0) / _IDST_HEIGHT; // BATCH SIZE IDX
-#else // defined(BATCHED_EXECUTION)
+ const int yo = GET_SPATIAL_IDX(2, 1, 0) % dst_h; // HEIGHT
+ const int bout = GET_SPATIAL_IDX(2, 1, 0) / dst_h; // BATCH SIZE IDX
+#else // defined(BATCHED_EXECUTION)
const int yo = GET_SPATIAL_IDX(2, 1, 0); // HEIGHT
const int bout = 0; // BATCH SIZE IDX
-#endif // defined(BATCHED_EXECUTION)
+#endif // defined(BATCHED_EXECUTION)
#ifdef SAMPLING_POLICY_TOP_LEFT
- float xi_f = (xo * (float)SCALE_X);
- float yi_f = (yo * (float)SCALE_Y);
+ float xi_f = (xo * scale_x);
+ float yi_f = (yo * scale_y);
#elif SAMPLING_POLICY_CENTER
- float xi_f = ((xo + 0.5f) * (float)SCALE_X - 0.5f);
- float yi_f = ((yo + 0.5f) * (float)SCALE_Y - 0.5f);
+ float xi_f = ((xo + 0.5f) * scale_x - 0.5f);
+ float yi_f = ((yo + 0.5f) * scale_y - 0.5f);
#else // SAMPLING_POLICY
#error("Unsupported sampling policy");
#endif // SAMPLING_POLICY
@@ -210,20 +191,20 @@ __kernel void scale_bilinear_nhwc(
in11[0].v = CONSTANT_VALUE;
#ifndef BORDER_MODE_REPLICATE
- T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi, xi, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, true, in00);
- T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi, xi + 1, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, true, in01);
- T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi + 1, xi, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, true, in10);
- T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi + 1, xi + 1, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, true, in11);
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi, xi, cout, src_w, src_h, 1, 1, true, in00);
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi, xi + 1, cout, src_w, src_h, 1, 1, true, in01);
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi + 1, xi, cout, src_w, src_h, 1, 1, true, in10);
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi + 1, xi + 1, cout, src_w, src_h, 1, 1, true, in11);
#else // BORDER_MODE_REPLICATE
- const int xi0 = clamp(xi, 0, _ISRC_WIDTH - 1);
- const int yi0 = clamp(yi, 0, _ISRC_HEIGHT - 1);
- const int xi1 = clamp(xi + 1, 0, _ISRC_WIDTH - 1);
- const int yi1 = clamp(yi + 1, 0, _ISRC_HEIGHT - 1);
+ const int xi0 = clamp(xi, 0, (int)src_w - 1);
+ const int yi0 = clamp(yi, 0, (int)src_h - 1);
+ const int xi1 = clamp(xi + 1, 0, (int)src_w - 1);
+ const int yi1 = clamp(yi + 1, 0, (int)src_h - 1);
- T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi0, xi0, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, false, in00);
- T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi0, xi1, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, false, in01);
- T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi1, xi0, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, false, in10);
- T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi1, xi1, cout, _ISRC_WIDTH, _ISRC_HEIGHT, 1, 1, false, in11);
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi0, xi0, cout, src_w, src_h, 1, 1, false, in00);
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi0, xi1, cout, src_w, src_h, 1, 1, false, in01);
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi1, xi0, cout, src_w, src_h, 1, 1, false, in10);
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, 1, N0, SRC_TENSOR_TYPE, src, bout, yi1, xi1, cout, src_w, src_h, 1, 1, false, in11);
#endif // BORDER_MODE_REPLICATE
TILE(DST_DATA_TYPE, 1, N0, out);
@@ -270,9 +251,10 @@ __kernel void scale_bilinear_nhwc(
TILE(uint, 1, 1, dst_indirect_y);
// Calculate the destination indirect Y
- dst_indirect_y[0].v = xo + (yo * (int)(_IDST_WIDTH)) + bout * (int)(_IDST_WIDTH * _IDST_HEIGHT);
+ dst_indirect_y[0].v = xo + (yo * (int)(dst_w)) + bout * (int)(dst_w * dst_h);
bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
T_STORE_INDIRECT_WIDTH_SELECT(DST_DATA_TYPE, 1, N0, PARTIAL_N0, DST_TENSOR_TYPE, dst, cout, dst_stride_y, x_cond, out, dst_indirect_y);
-}
\ No newline at end of file
+}
+#endif /* SCALE_BILINEAR */
\ No newline at end of file
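The effect of the new guards, reduced to a stand-alone sketch: the host now defines exactly one of SCALE_NEAREST_NEIGHBOUR or SCALE_BILINEAR (via the "-DSCALE_" + string_from_interpolation_policy(...) option added in ClScaleKernel.cpp below), so the OpenCL front-end only parses and compiles the kernel that will actually run instead of both:

/* demo.cl -- built with either -DSCALE_NEAREST_NEIGHBOUR or -DSCALE_BILINEAR */
#if defined(SCALE_NEAREST_NEIGHBOUR)
__kernel void nearest_demo(void) {}  /* present only in NEAREST builds  */
#endif /* SCALE_NEAREST_NEIGHBOUR */

#if defined(SCALE_BILINEAR)
__kernel void bilinear_demo(void) {} /* present only in BILINEAR builds */
#endif /* SCALE_BILINEAR */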
diff --git a/src/core/CL/cl_kernels/tile_helpers.h b/src/core/CL/cl_kernels/tile_helpers.h
index f36f273e1d..cc20616867 100644
--- a/src/core/CL/cl_kernels/tile_helpers.h
+++ b/src/core/CL/cl_kernels/tile_helpers.h
@@ -104,6 +104,32 @@
#define TENSOR4D_STR(name, type) TENSOR4D_##type(name)
#define TENSOR4D(name, type) TENSOR4D_STR(name, type)
+#define TENSOR4D_T_IMAGE(name) \
+ __read_only image2d_t name##_img, \
+ __global uchar *name##_ptr, \
+ uint name##_stride_y, \
+ uint name##_stride_z, \
+ uint name##_stride_w, \
+ uint name##_c, \
+ uint name##_w, \
+ uint name##_h, \
+ uint name##_n, \
+ uint name##_offset_first_element_in_bytes
+
+#define TENSOR4D_T_BUFFER(name) \
+ __global uchar *name##_ptr, \
+ uint name##_stride_y, \
+ uint name##_stride_z, \
+ uint name##_stride_w, \
+ uint name##_c, \
+ uint name##_w, \
+ uint name##_h, \
+ uint name##_n, \
+ uint name##_offset_first_element_in_bytes
+
+#define TENSOR4D_T_STR(name, type) TENSOR4D_T_##type(name)
+#define TENSOR4D_T(name, type) TENSOR4D_T_STR(name, type)
+
#if !defined(UNROLL_WITH_PRAGMA)
#define UNROLL_INCR(idx, step, macro) idx += (step); (macro)
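Why the extra TENSOR4D_T_STR hop exists: token pasting with ## suppresses macro-argument expansion, so a one-step macro would paste the literal token SRC_TENSOR_TYPE instead of its value BUFFER. A reduced stand-alone demonstration (macro names invented for the sketch):

#define DEMO_BUFFER(name) __global uchar *name##_ptr
#define DEMO_STR(name, type) DEMO_##type(name) /* pastes: type must already be expanded */
#define DEMO(name, type) DEMO_STR(name, type)  /* plain call site: expands type first   */

#define SRC_TENSOR_TYPE BUFFER
/* DEMO(src, SRC_TENSOR_TYPE) -> DEMO_STR(src, BUFFER)
 *                            -> DEMO_BUFFER(src)
 *                            -> __global uchar *src_ptr */
__kernel void indirection_demo(DEMO(src, SRC_TENSOR_TYPE))
{
    (void)src_ptr;
}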
diff --git a/src/gpu/cl/kernels/ClScaleKernel.cpp b/src/gpu/cl/kernels/ClScaleKernel.cpp
index d63c0e1754..6f16adc657 100644
--- a/src/gpu/cl/kernels/ClScaleKernel.cpp
+++ b/src/gpu/cl/kernels/ClScaleKernel.cpp
@@ -117,9 +117,7 @@ void ClScaleKernel::configure(const CLCompileContext &compile_context, ITensorIn
const int idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
const unsigned int src_width = src->dimension(idx_width);
const unsigned int src_height = src->dimension(idx_height);
- const unsigned int src_channel = src->dimension(idx_channel);
const unsigned int dst_width = dst->dimension(idx_width);
- const unsigned int dst_height = dst->dimension(idx_height);
const unsigned int dst_channels = dst->dimension(idx_channel);
unsigned int vec_size = 0;
unsigned int vec_size_leftover = 0;
@@ -130,20 +128,13 @@ void ClScaleKernel::configure(const CLCompileContext &compile_context, ITensorIn
vec_size = adjust_vec_size(src->data_type() == DataType::F32 ? 4 : 8, dst_channels);
vec_size_leftover = dst_channels % vec_size;
build_opts.add_option("-DSRC_TENSOR_TYPE=BUFFER");
- build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src_width));
- build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src_height));
- build_opts.add_option("-DSRC_CHANNELS=" + support::cpp11::to_string(src_channel));
build_opts.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
build_opts.add_option("-DDST_TENSOR_TYPE=BUFFER");
- build_opts.add_option("-DDST_WIDTH=" + support::cpp11::to_string(dst_width));
- build_opts.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst_height));
- build_opts.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst_channels));
build_opts.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type()));
build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(info.constant_border_value, src->data_type()));
- build_opts.add_option("-DSCALE_X=" + float_to_string_with_full_precision(scale_x));
- build_opts.add_option("-DSCALE_Y=" + float_to_string_with_full_precision(scale_y));
build_opts.add_option("-DN0=" + support::cpp11::to_string(vec_size));
build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(vec_size_leftover));
+ build_opts.add_option("-DSCALE_" + string_from_interpolation_policy(interpolation_policy_to_use));
build_opts.add_option_if(src->num_dimensions() > 3, "-DBATCHED_EXECUTION");
build_opts.add_option_if(info.border_mode == BorderMode::REPLICATE, "-DBORDER_MODE_REPLICATE");
build_opts.add_option_if(info.border_mode == BorderMode::CONSTANT, "-DBORDER_MODE_CONSTANT");
@@ -203,6 +194,13 @@ void ClScaleKernel::configure(const CLCompileContext &compile_context, ITensorIn
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+ // Pass scale kernel arguments
+ if(is_nhwc)
+ {
+ unsigned int idx = 2 * num_arguments_per_4d_tensor_nhwc();
+ _kernel.setArg<cl_float>(idx++, scale_x);
+ _kernel.setArg<cl_float>(idx++, scale_y);
+ }
// Set config_id for enabling LWS tuning
_config_id = "scale_";
_config_id += (info.border_mode == BorderMode::REPLICATE ? "Bord_rep" : "");
@@ -248,8 +246,8 @@ void ClScaleKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comma
Window slice = collapsed.first_slice_window_4D();
unsigned int idx = 0;
- add_4D_tensor_argument(idx, src, slice);
- add_4D_tensor_argument(idx, dst, slice);
+ add_4d_tensor_nhwc_argument(idx, src);
+ add_4d_tensor_nhwc_argument(idx, dst);
enqueue(queue, *this, slice, lws_hint());
break;
}
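Putting the host side together: the two scale floats are pinned once at configure time at indices 18 and 19 (they are constant for a configured kernel), while run_op() refreshes the eighteen tensor arguments before every enqueue. A simplified sketch with the plain OpenCL C API (hypothetical helper, not the actual ComputeLibrary code):

#include <CL/cl.h>

void set_scale_factors(cl_kernel kernel, cl_float scale_x, cl_float scale_y)
{
    /* Matches idx = 2 * num_arguments_per_4d_tensor_nhwc() in
     * ClScaleKernel::configure(): src occupies args 0..8, dst args 9..17. */
    clSetKernelArg(kernel, 18, sizeof(cl_float), &scale_x);
    clSetKernelArg(kernel, 19, sizeof(cl_float), &scale_y);
}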