diff options
Diffstat (limited to 'src/core/CL/cl_kernels')
-rw-r--r-- | src/core/CL/cl_kernels/helpers.h | 28 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nhwc/direct_convolution.cl | 42 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl | 24 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl | 21 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nhwc/indirect_convolution.cl | 40 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nhwc/scale.cl | 64 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nhwc/transposed_convolution.cl | 14 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/tile_helpers.h | 86 |
8 files changed, 207 insertions, 112 deletions
diff --git a/src/core/CL/cl_kernels/helpers.h b/src/core/CL/cl_kernels/helpers.h index 298edc244f..c5d94ccb04 100644 --- a/src/core/CL/cl_kernels/helpers.h +++ b/src/core/CL/cl_kernels/helpers.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2022 Arm Limited. + * Copyright (c) 2016-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -438,6 +438,16 @@ #define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord))); #endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) +#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values)); +#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567)); +#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF)); + +#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) +#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values)); +#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567)); +#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF)); +#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) + /** Utility macro to read a 2D OpenCL image object. * * @note Coordinates are not normalized @@ -454,6 +464,22 @@ #define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord) #define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) +/** Utility macro to write a 2D OpenCL image object. + * + * @note Coordinates are not normalized + * + * @param[in] data_type Data type + * @param[in] n0 Number of pixel to write. Only 1,2 and 4 is supported + * @param[in] img OpenCL image object + * @param[in] x_coord The x coordinate for the top-left pixel + * @param[in] y_coord The y coordinate for the top-left pixel + * @param[in] values Values to write + * + * @{ + */ +#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values) +#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) + #define VSTORE_STR(size) vstore##size #define VSTORE(size) VSTORE_STR(size) diff --git a/src/core/CL/cl_kernels/nhwc/direct_convolution.cl b/src/core/CL/cl_kernels/nhwc/direct_convolution.cl index 4693a1fbcd..81ceeb8846 100644 --- a/src/core/CL/cl_kernels/nhwc/direct_convolution.cl +++ b/src/core/CL/cl_kernels/nhwc/direct_convolution.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -66,36 +66,36 @@ * - The weights offset e.g. -DWEI_OFFSET=4 * - The quantized zero value e.g. -DZERO_VALUE=4 * + * @param[in] src_img (Not supported) Read only cl_image object for the source tensor. Included when SRC_TENSOR_TYPE=IMAGE * @param[in] src_ptr Pointer to the source tensor. Supported data type: F16/F32/QASYMM8 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_c The size of the channels dimension of the source tensor + * @param[in] src_w The size of the width dimension of the source tensor + * @param[in] src_h The size of the height dimension of the source tensor + * @param[in] src_n The size of the batches dimension of the source tensor * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_img (Not supported) Write only cl_image object for the destination tensor. Included when DST_TENSOR_TYPE=IMAGE * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_c The size of the channels dimension of the destination tensor + * @param[in] dst_w The size of the width dimension of the destination tensor + * @param[in] dst_h The size of the height dimension of the destination tensor + * @param[in] dst_n The size of the batches dimension of the destination tensor * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] wei_img (Optional) Read only cl_image object for the weights tensor. Included when WEI_TENSOR_TYPE=IMAGE * @param[in] wei_ptr Pointer to the weights tensor. Supported data type: same as @p src_ptr - * @param[in] wei_stride_x Stride of the weights tensor in X dimension (in bytes) - * @param[in] wei_step_x wei_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] wei_stride_y Stride of the weights tensor in Y dimension (in bytes) - * @param[in] wei_step_y wei_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] wei_stride_z Stride of the weights tensor in Z dimension (in bytes) - * @param[in] wei_step_z wei_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] wei_stride_w Stride of the weights tensor in W dimension (in bytes) - * @param[in] wei_step_w wei_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] wei_offset_first_element_in_bytes The offset of the first element in the bias matrix + * @param[in] wei_c The size of the channels dimension of the weights tensor + * @param[in] wei_w The size of the width dimension of the weights tensor + * @param[in] wei_h The size of the height dimension of the weights tensor + * @param[in] wei_n The size of the batches dimension of the weights tensor + * @param[in] wei_offset_first_element_in_bytes The offset of the first element in the weights matrix * @param[in] bia_ptr (Optional) Pointer to the bias tensor Supported data type: same as @p src_ptr (if F32/F16) or S32 (if QASYMM8/QASYMM8_SIGNED) * @param[in] bia_stride_x (Optional) Stride of the bias tensor in X dimension (in bytes) * @param[in] bia_step_x (Optional) bia_stride_x * number of elements along X processed per workitem(in bytes) @@ -103,9 +103,9 @@ */ //! @endcond __kernel void direct_convolution_nhwc( - TENSOR4D_T(src, SRC_TENSOR_TYPE), - TENSOR4D_T(dst, DST_TENSOR_TYPE), - TENSOR4D_T(wei, WEI_TENSOR_TYPE) + TENSOR4D_RO_T(src, SRC_TENSOR_TYPE), + TENSOR4D_WO_T(dst, DST_TENSOR_TYPE), + TENSOR4D_RO_T(wei, WEI_TENSOR_TYPE) #if defined(HAS_BIAS) , VECTOR_DECLARATION(bia) @@ -292,4 +292,4 @@ __kernel void direct_convolution_nhwc( #undef _IDST_HEIGHT #undef _IDST_CHANNELS #undef _IY_MULTIPLIER -}
\ No newline at end of file +} diff --git a/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl b/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl index 345469063a..dcbae220b6 100644 --- a/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl +++ b/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -54,6 +54,7 @@ * @note The number of rows to read from the src tensor must be passed at compile time using -DM0_A (e.g., -DM0_A=3). M0_A must be equal to WEI_WIDTH + (M0 - 1) * @note The number of columns to read from the src tensor must be passed at compile time using -DN0_A. It can either be 1 (for DEPTH_MULTIPLIER > 1) or N0 (for DEPTH_MULTIPLIER == 1) * + * @param[in] src_img (Not supported) Read only cl_image object for the source tensor. Included when SRC_TENSOR_TYPE=IMAGE * @param[in] src_ptr Pointer to the source tensor. Supported data type: F16/F32 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) @@ -63,6 +64,7 @@ * @param[in] src_h The size of the height dimension of the source tensor * @param[in] src_n The size of the batches dimension of the source tensor * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_img (Not supported) Write only cl_image object for the destination tensor. Included when DST_TENSOR_TYPE=IMAGE * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as @p src_ptr * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) @@ -72,16 +74,16 @@ * @param[in] dst_h The size of the height dimension of the destination tensor * @param[in] dst_n The size of the batches dimension of the destination tensor * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] wei_img (Optional) Read only cl_image object for the weights tensor. Included when WEI_TENSOR_TYPE=IMAGE * @param[in] wei_ptr Pointer to the weights tensor. Supported data type: same as @p src_ptr - * @param[in] wei_stride_x Stride of the weights tensor in X dimension (in bytes) - * @param[in] wei_step_x wei_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] wei_stride_y Stride of the weights tensor in Y dimension (in bytes) - * @param[in] wei_step_y wei_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] wei_stride_z Stride of the weights tensor in Z dimension (in bytes) - * @param[in] wei_step_z wei_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] wei_stride_w Stride of the weights tensor in W dimension (in bytes) - * @param[in] wei_step_w wei_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] wei_offset_first_element_in_bytes The offset of the first element in the bias matrix + * @param[in] wei_c The size of the channels dimension of the weights tensor + * @param[in] wei_w The size of the width dimension of the weights tensor + * @param[in] wei_h The size of the height dimension of the weights tensor + * @param[in] wei_n The size of the batches dimension of the weights tensor + * @param[in] wei_offset_first_element_in_bytes The offset of the first element in the weigts matrix * @param[in] bia_ptr (Optional) Pointer to the bias tensor Supported data type: same as @p src_ptr * @param[in] bia_stride_x (Optional) Stride of the bias tensor in X dimension (in bytes) * @param[in] bia_step_x (Optional) bia_stride_x * number of elements along X processed per workitem(in bytes) @@ -89,9 +91,9 @@ */ //! @endcond __kernel void dwc_native_fp_nhwc( - TENSOR4D_T(src, SRC_TENSOR_TYPE), - TENSOR4D_T(dst, DST_TENSOR_TYPE), - TENSOR4D(wei, WEI_TENSOR_TYPE) + TENSOR4D_RO_T(src, SRC_TENSOR_TYPE), + TENSOR4D_WO_T(dst, DST_TENSOR_TYPE), + TENSOR4D_RO_T(wei, WEI_TENSOR_TYPE) #if defined(HAS_BIAS) , VECTOR_DECLARATION(bia) @@ -206,4 +208,4 @@ __kernel void dwc_native_fp_nhwc( } #endif // defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) // *INDENT-ON* -// clang-format on
\ No newline at end of file +// clang-format on diff --git a/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl b/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl index e2ffd444dd..2d255e5b61 100644 --- a/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl +++ b/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -77,6 +77,7 @@ * @note The number of rows to read from the src tensor must be passed at compile time using -DM0_A (e.g., -DM0_A=3). M0_A must be equal to WEI_WIDTH + (M0 - 1) * @note The number of columns to read from the src tensor must be passed at compile time using -DN0_A. It can either be 1 (for DEPTH_MULTIPLIER > 1) or N0 (for DEPTH_MULTIPLIER == 1) * + * @param[in] src_img (Not supported) Read only cl_image object for the source tensor. Included when SRC_TENSOR_TYPE=IMAGE * @param[in] src_ptr Pointer to the source tensor. Supported data type: QSYMM8/QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) @@ -86,6 +87,7 @@ * @param[in] src_h The size of the height dimension of the source tensor * @param[in] src_n The size of the batches dimension of the source tensor * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_img (Not supported) Write only cl_image object for the destination tensor. Included when DST_TENSOR_TYPE=IMAGE * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as @p src_ptr * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) @@ -95,14 +97,15 @@ * @param[in] dst_h The size of the height dimension of the destination tensor * @param[in] dst_n The size of the batches dimension of the destination tensor * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] wei_img (Not supported) Read only cl_image object for the weights tensor. Included when WEI_TENSOR_TYPE=IMAGE * @param[in] wei_ptr Pointer to the weights tensor. Supported data type: same as @p src_ptr - * @param[in] wei_stride_x Stride of the weights tensor in X dimension (in bytes) - * @param[in] wei_step_x wei_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] wei_stride_y Stride of the weights tensor in Y dimension (in bytes) - * @param[in] wei_step_y wei_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] wei_stride_z Stride of the weights tensor in Z dimension (in bytes) - * @param[in] wei_step_z wei_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] wei_stride_w Stride of the weights tensor in W dimension (in bytes) + * @param[in] wei_c The size of the channels dimension of the weights tensor + * @param[in] wei_w The size of the width dimension of the weights tensor + * @param[in] wei_h The size of the height dimension of the weights tensor + * @param[in] wei_n The size of the batches dimension of the weights tensor * @param[in] wei_step_w wei_stride_w * number of elements along W processed per workitem(in bytes) * @param[in] wei_offset_first_element_in_bytes The offset of the first element in the weights tensor * @param[in] dst_multipliers_ptr Pointer to the destination multipliers tensor for the per-channel quantization. Supported data type: S32 @@ -120,9 +123,9 @@ */ //! @endcond __kernel void dwc_native_quantized_nhwc( - TENSOR4D_T(src, SRC_TENSOR_TYPE), - TENSOR4D_T(dst, DST_TENSOR_TYPE), - TENSOR4D(wei, WEI_TENSOR_TYPE), + TENSOR4D_RO_T(src, SRC_TENSOR_TYPE), + TENSOR4D_WO_T(dst, DST_TENSOR_TYPE), + TENSOR4D_RO_T(wei, WEI_TENSOR_TYPE), VECTOR_DECLARATION(dst_multipliers), VECTOR_DECLARATION(dst_shifts) #if defined(HAS_BIAS) @@ -269,4 +272,4 @@ __kernel void dwc_native_quantized_nhwc( } #endif // defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) // *INDENT-ON* -// clang-format on
\ No newline at end of file +// clang-format on diff --git a/src/core/CL/cl_kernels/nhwc/indirect_convolution.cl b/src/core/CL/cl_kernels/nhwc/indirect_convolution.cl index c88f0034c5..aa719bfef0 100644 --- a/src/core/CL/cl_kernels/nhwc/indirect_convolution.cl +++ b/src/core/CL/cl_kernels/nhwc/indirect_convolution.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -41,7 +41,7 @@ * @note The number of M0 rows (width*height) to process must be passed at compile time using -DM0 (e.g. -DM0=2) * - M0 = 1, 2, 3, 4, 5, 6, 7, and 8 * - * @param[out] dst_img (Not supported) CLImage object to the destination tensor (DST_TENSOR_TYPE=IMAGE only) + * @param[out] dst_img (Not supported) Write only cl_image object for the destination tensor. Included when DST_TENSOR_TYPE=IMAGE * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: INT32 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) @@ -54,7 +54,7 @@ */ //! @endcond __kernel void indirect_convolution_address_precalculation( - TENSOR4D_T(dst, DST_TENSOR_TYPE)) + TENSOR4D_WO_T(dst, DST_TENSOR_TYPE)) { const int x = get_global_id(0); const int y = get_global_id(1); @@ -123,7 +123,7 @@ __kernel void indirect_convolution_address_precalculation( * - N0 = 2, 3, 4, 8, 16 * - K0 = 2, 3, 4, 8, 16 (only 4, 8 and 16 if WEI_TENSOR_TYPE=IMAGE) * - * @param[in] src_img (Not supported) CLImage object to the source tensor (SRC_TENSOR_TYPE=IMAGE only) + * @param[in] src_img (Not supported) Read only cl_image object for the source tensor. Included when SRC_TENSOR_TYPE=IMAGE * @param[in] src_ptr Pointer to the source tensor. Supported data type: F16/F32 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) @@ -133,7 +133,7 @@ __kernel void indirect_convolution_address_precalculation( * @param[in] src_h The size of the height dimension of the source tensor * @param[in] src_n The size of the batches dimension of the source tensor * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] off_img (Not supported) CLImage object to the indirect buffer tensor (OFF_TENSOR_TYPE=IMAGE only) + * @param[in] off_img (Not supported) Read only cl_image object for the indirect buffer tensor. Included when OFF_TENSOR_TYPE=IMAGE * @param[in] off_ptr Pointer to the indirect buffer tensor. Supported data type: INT32 * @param[in] off_stride_y Stride of the indirect buffer tensor in Y dimension (in bytes) * @param[in] off_stride_z Stride of the indirect buffer tensor in Z dimension (in bytes) @@ -143,8 +143,8 @@ __kernel void indirect_convolution_address_precalculation( * @param[in] off_h The size of the height dimension of the indirect buffer tensor * @param[in] off_n The size of the batches dimension of the indirect buffer tensor * @param[in] off_offset_first_element_in_bytes The offset of the first element in the indirect buffer tensor - * @param[out] dst_img (Not supported) CLImage object to the destination tensor (DST_TENSOR_TYPE=IMAGE only) - * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as the input tensor + * @param[out] dst_img (Not supported) Write only cl_image object for the destination tensor. Included when DST_TENSOR_TYPE=IMAGE + * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as @p src_ptr * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) @@ -153,8 +153,8 @@ __kernel void indirect_convolution_address_precalculation( * @param[in] dst_h The size of the height dimension of the destination tensor * @param[in] dst_n The size of the batches dimension of the destination tensor * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[out] wei_img (Optional) CLImage object to the destination tensor (WEI_TENSOR_TYPE=IMAGE only) - * @param[out] wei_ptr Pointer to the weights tensor. Supported data type: same as the input tensor + * @param[out] wei_img (Optional) Read only cl_image object for the weights tensor. Included when WEI_TENSOR_TYPE=IMAGE + * @param[out] wei_ptr Pointer to the weights tensor. Supported data type: same as @p src_ptr * @param[in] wei_stride_y Stride of the weights tensor in Y dimension (in bytes) * @param[in] wei_stride_z Stride of the weights tensor in Z dimension (in bytes) * @param[in] wei_stride_w Stride of the weights tensor in W dimension (in bytes) @@ -163,23 +163,17 @@ __kernel void indirect_convolution_address_precalculation( * @param[in] wei_h The size of the height dimension of the weights tensor * @param[in] wei_n The size of the batches dimension of the weights tensor * @param[in] wei_offset_first_element_in_bytes The offset of the first element in the weights tensor - * @param[out] bia_img (Not supported) CLImage object to the destination tensor (BIA_TENSOR_TYPE=IMAGE only) - * @param[out] bia_ptr (Optional) Pointer to the bias tensor. Supported data type: same as the input tensor - * @param[in] bia_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes) - * @param[in] bia_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes) - * @param[in] bia_stride_w (Optional) Stride of the bias tensor in W dimension (in bytes) - * @param[in] bia_c (Optional) The size of the channels dimension of the bias tensor - * @param[in] bia_w (Optional) The size of the width dimension of the bias tensor - * @param[in] bia_h (Optional) The size of the height dimension of the bias tensor - * @param[in] bia_n (Optional) The size of the batches dimension of the bias tensor - * @param[in] bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor + * @param[in] bia_ptr (Optional) Pointer to the bias tensor Supported data type: same as @p src_ptr + * @param[in] bia_stride_x (Optional) Stride of the bias tensor in X dimension (in bytes) + * @param[in] bia_step_x (Optional) bia_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix */ //! @endcond __kernel void indirect_convolution_nhwc( - TENSOR4D_T(src, SRC_TENSOR_TYPE), - TENSOR4D_T(off, OFF_TENSOR_TYPE), - TENSOR4D_T(dst, DST_TENSOR_TYPE), - TENSOR4D_T(wei, WEI_TENSOR_TYPE) + TENSOR4D_RO_T(src, SRC_TENSOR_TYPE), + TENSOR4D_RO_T(off, OFF_TENSOR_TYPE), + TENSOR4D_WO_T(dst, DST_TENSOR_TYPE), + TENSOR4D_RO_T(wei, WEI_TENSOR_TYPE) #if defined(HAS_BIAS) , VECTOR_DECLARATION(bia) diff --git a/src/core/CL/cl_kernels/nhwc/scale.cl b/src/core/CL/cl_kernels/nhwc/scale.cl index f6a3e0971b..e071b0f192 100644 --- a/src/core/CL/cl_kernels/nhwc/scale.cl +++ b/src/core/CL/cl_kernels/nhwc/scale.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2022 Arm Limited. + * Copyright (c) 2016-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -61,8 +61,8 @@ */ //! @endcond __kernel void scale_nearest_neighbour_nhwc( - TENSOR4D_T(src, SRC_TENSOR_TYPE), - TENSOR4D_T(dst, DST_TENSOR_TYPE), + TENSOR4D_RO_T(src, SRC_TENSOR_TYPE), + TENSOR4D_WO_T(dst, DST_TENSOR_TYPE), const float scale_x, const float scale_y) { @@ -128,31 +128,33 @@ __kernel void scale_nearest_neighbour_nhwc( * - The source offset e.g. -DOFFSET=4 * - The source scale e.g. -DSCALE=4 * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S16/F16/F32. - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_c The size of the channels dimension of the source tensor - * @param[in] src_w The size of the width dimension of the source tensor - * @param[in] src_h The size of the height dimension of the source tensor - * @param[in] src_n The size of the batches dimension of the source tensor - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: U8/S16/F16/F32. - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] dst_c The size of the channels dimension of the destination tensor - * @param[in] dst_w The size of the width dimension of the destination tensor - * @param[in] dst_h The size of the height dimension of the destination tensor - * @param[in] dst_n The size of the batches dimension of the destination tensor - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] scale_x The scale value to apply on the source width - * @param[in] scale_y The scale value to apply on the source height + * @param[in] src_img (Not supported) Read only cl_image object for the source tensor. Included when SRC_TENSOR_TYPE=IMAGE + * @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S16/F16/F32. + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_c The size of the channels dimension of the source tensor + * @param[in] src_w The size of the width dimension of the source tensor + * @param[in] src_h The size of the height dimension of the source tensor + * @param[in] src_n The size of the batches dimension of the source tensor + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_img (Not supported) Write only cl_image object for the destination tensor. Included when DST_TENSOR_TYPE=IMAGE + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: U8/S16/F16/F32. + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] dst_c The size of the channels dimension of the destination tensor + * @param[in] dst_w The size of the width dimension of the destination tensor + * @param[in] dst_h The size of the height dimension of the destination tensor + * @param[in] dst_n The size of the batches dimension of the destination tensor + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] scale_x The scale value to apply on the source width + * @param[in] scale_y The scale value to apply on the source height */ //! @endcond __kernel void scale_bilinear_nhwc( - TENSOR4D_T(src, SRC_TENSOR_TYPE), - TENSOR4D_T(dst, DST_TENSOR_TYPE), + TENSOR4D_RO_T(src, SRC_TENSOR_TYPE), + TENSOR4D_WO_T(dst, DST_TENSOR_TYPE), const float scale_x, const float scale_y) { @@ -224,11 +226,11 @@ __kernel void scale_bilinear_nhwc( const float a1 = (yi_f - (float)yi); const float b1 = (1.f - a1); - out[0].v = CONVERT_SAT((CONVERT(in00[0].v, VEC_DATA_TYPE(float, N0)) * b * b1) + - (CONVERT(in01[0].v, VEC_DATA_TYPE(float, N0)) * a * b1) + - (CONVERT(in10[0].v, VEC_DATA_TYPE(float, N0)) * b * a1) + - (CONVERT(in11[0].v, VEC_DATA_TYPE(float, N0)) * a * a1), - VEC_DATA_TYPE(DST_DATA_TYPE, N0)); + out[0].v = CONVERT_SAT((CONVERT(in00[0].v, VEC_DATA_TYPE(float, N0)) * b * b1) + + (CONVERT(in01[0].v, VEC_DATA_TYPE(float, N0)) * a * b1) + + (CONVERT(in10[0].v, VEC_DATA_TYPE(float, N0)) * b * a1) + + (CONVERT(in11[0].v, VEC_DATA_TYPE(float, N0)) * a * a1), + VEC_DATA_TYPE(DST_DATA_TYPE, N0)); #endif // defined(IS_FLOATING_POINT) TILE(uint, 1, 1, dst_indirect_y); @@ -240,4 +242,4 @@ __kernel void scale_bilinear_nhwc( T_STORE_INDIRECT_WIDTH_SELECT(DST_DATA_TYPE, 1, N0, PARTIAL_N0, DST_TENSOR_TYPE, dst, cout, dst_stride_y, x_cond, out, dst_indirect_y); } -#endif /* SCALE_BILINEAR */
\ No newline at end of file +#endif /* SCALE_BILINEAR */ diff --git a/src/core/CL/cl_kernels/nhwc/transposed_convolution.cl b/src/core/CL/cl_kernels/nhwc/transposed_convolution.cl index fe6182fc95..1393537283 100644 --- a/src/core/CL/cl_kernels/nhwc/transposed_convolution.cl +++ b/src/core/CL/cl_kernels/nhwc/transposed_convolution.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -64,7 +64,7 @@ * - The weights offset e.g. -DWEI_OFFSET=4 * - The quantized zero value e.g. -DZERO_VALUE=4 * - * + * @param[in] src_img (Not supported) Read only cl_image object for the source tensor. Included when SRC_TENSOR_TYPE=IMAGE * @param[in] src_ptr Pointer to the source tensor. Supported data type: F16/F32 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) @@ -73,6 +73,7 @@ * @param[in] src_w The size of the width dimension of the source tensor * @param[in] src_h The size of the height dimension of the source tensor * @param[in] src_n The size of the batches dimension of the source tensor + * @param[out] dst_img (Not supported) Write only cl_image object for the destination tensor. Included when DST_TENSOR_TYPE=IMAGE * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as @p src_ptr * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) @@ -82,6 +83,7 @@ * @param[in] dst_h The size of the height dimension of the destination tensor * @param[in] dst_n The size of the batches dimension of the destination tensor * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] wei_img (Not supported) Read only cl_image object for the weights tensor. Included when WEI_TENSOR_TYPE=IMAGE * @param[in] wei_ptr Pointer to the weights tensor. Supported data type: same as @p src_ptr * @param[in] wei_stride_y Stride of the weights tensor in Y dimension (in bytes) * @param[in] wei_stride_z Stride of the weights tensor in Z dimension (in bytes) @@ -98,9 +100,9 @@ */ //! @endcond __kernel void transposed_convolution_nhwc( - TENSOR4D_T(src, SRC_TENSOR_TYPE), - TENSOR4D_T(dst, DST_TENSOR_TYPE), - TENSOR4D_T(wei, WEI_TENSOR_TYPE) + TENSOR4D_RO_T(src, SRC_TENSOR_TYPE), + TENSOR4D_WO_T(dst, DST_TENSOR_TYPE), + TENSOR4D_RO_T(wei, WEI_TENSOR_TYPE) #if defined(HAS_BIAS) , VECTOR_DECLARATION(bia) @@ -292,4 +294,4 @@ __kernel void transposed_convolution_nhwc( #undef _IDST_HEIGHT #undef _IDST_CHANNELS #undef _IY_MULTIPLIER -}
\ No newline at end of file +} diff --git a/src/core/CL/cl_kernels/tile_helpers.h b/src/core/CL/cl_kernels/tile_helpers.h index acc174d04f..507e172dfb 100644 --- a/src/core/CL/cl_kernels/tile_helpers.h +++ b/src/core/CL/cl_kernels/tile_helpers.h @@ -130,8 +130,44 @@ uint name##_offset_first_element_in_bytes #define TENSOR4D_T_STR(name, type) TENSOR4D_T_##type(name) + +/** Legacy tensor 4D arguments + * + * @param[in] name Tensor name. The tensor name is the prefix of the tensor components + * @param[in] type Tensor type (BUFFER or IMAGE) + */ #define TENSOR4D_T(name, type) TENSOR4D_T_STR(name, type) +#define TENSOR4D_RO_T_IMAGE(name) \ + __read_only image2d_t name##_img, \ + TENSOR4D_T_BUFFER(name) + +#define TENSOR4D_RO_T_BUFFER(name) TENSOR4D_T_BUFFER(name) + +#define TENSOR4D_RO_T_STR(name, type) TENSOR4D_RO_T_##type(name) + +/** Read-Only (RO) tensor 4D. + * + * @param[in] name Tensor name. The tensor name is the prefix of the tensor components + * @param[in] type Tensor type (BUFFER or IMAGE) + */ +#define TENSOR4D_RO_T(name, type) TENSOR4D_RO_T_STR(name, type) + +#define TENSOR4D_WO_T_IMAGE(name) \ + __write_only image2d_t name##_img, \ + TENSOR4D_T_BUFFER(name) + +#define TENSOR4D_WO_T_BUFFER(name) TENSOR4D_T_BUFFER(name) + +#define TENSOR4D_WO_T_STR(name, type) TENSOR4D_WO_T_##type(name) + +/** Write-Only (WO) tensor 4D. + * + * @param[in] name Tensor name. The tensor name is the prefix of the tensor components + * @param[in] type Tensor type (BUFFER or IMAGE) + */ +#define TENSOR4D_WO_T(name, type) TENSOR4D_WO_T_STR(name, type) + #define TENSOR3D_T_IMAGE(name) \ __read_only image2d_t name##_img, \ __global uchar *name##_ptr, \ @@ -457,6 +493,25 @@ (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (Y) * (STRIDE_Y))) #define V_LOAD_IMAGE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y) READ_IMAGE2D(DATA_TYPE, CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(WIDTH), TENSOR##_img, (X) / 4, (Y)) +/** Store a vector in global memory (tensor) + * + * @param[in] DATA_TYPE Data type + * @param[in] WIDTH Number of dst columns + * @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). + * In case of cl_image, only WIDTH multiples of 4 are supported (4, 8, 16) + * @param[in] TENSOR Tensor basename + * @param[in] X Starting X position + * @param[in] Y Starting Y position + * @param[in] STRIDE_Y Stride Y (in bytes) + * @param[in] VALUES Values to store in memory + */ +#define V_STORE(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES) V_STORE_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES) +#define V_STORE_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES) V_STORE_##TENSOR_TYPE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES) +#define V_STORE_BUFFER(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES) \ + VSTORE(WIDTH) \ + (VALUES, 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (Y) * (STRIDE_Y))) +#define V_STORE_IMAGE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES) WRITE_IMAGE2D(DATA_TYPE, CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(WIDTH), TENSOR##_img, (X) / 4, (Y), VALUES) + /** Load a tile from global memory (tensor) * * @param[in] DATA_TYPE Data type @@ -658,7 +713,8 @@ * @param[in] DATA_TYPE Data type * @param[in] TILE_AREA Number of elements to load from Y (height) dimension * Number of elements to load from X (width) dimension * @param[in] TILE_CHANNELS Number of elements to load from C (channel) dimension - * @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). Currently BUFFER only is supported + * @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). + * When TENSOR_TYPE=IMAGE, the if condition for the out-of-bound check can be skipped * In case of cl_image, only TILE_CHANNELS multiples of 4 are supported (4, 8, 16) * @param[in] TENSOR Tensor basename * @param[in] C Starting C index @@ -667,15 +723,25 @@ * 16 is the maximum indirect buffer size. * @param[out] dst Output tile */ -#define T_LOAD2D_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) \ - ({ \ - LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \ - { \ - if(yi[0].s[_i] >= 0) \ - { \ - dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[0].s[_i], STRIDE_Y); \ - } \ - }) \ +#define T_LOAD2D_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) T_LOAD2D_INDIRECT_STR(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) +#define T_LOAD2D_INDIRECT_STR(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) T_LOAD2D_INDIRECT_##TENSOR_TYPE(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) +#define T_LOAD2D_INDIRECT_BUFFER(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) \ + ({ \ + LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \ + { \ + if(yi[0].s[_i] >= 0) \ + { \ + dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[0].s[_i], STRIDE_Y); \ + } \ + }) \ + }) + +#define T_LOAD2D_INDIRECT_IMAGE(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) \ + ({ \ + LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \ + { \ + dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[0].s[_i], STRIDE_Y); \ + }) \ }) /** Load a tile from global memory (tensor) when the tensor is stored using a NDHWC layout using indirect X, Y and Z coordinates |