From 3cce35dcad8bc8f53a1e6613f719af9ab04feda6 Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Fri, 30 Dec 2022 16:07:45 +0000 Subject: Extend cl image support to input and output tensors - Add support for texture image to input and output of direct convolution - Extend T_LOAD2D_INDIRECT macro to read values from cl image storages Resolves COMPMID-5715 Signed-off-by: Gian Marco Iodice Change-Id: Idb0410f53f6d0763cd9e39895a7cbf9bc826d33a Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8904 Comments-Addressed: Arm Jenkins Reviewed-by: Viet-Hoa Do Tested-by: Arm Jenkins Benchmark: Arm Jenkins --- arm_compute/core/KernelDescriptors.h | 4 +- src/core/CL/CLUtils.cpp | 18 ++++- src/core/CL/CLUtils.h | 12 ++- src/core/CL/cl_kernels/helpers.h | 28 ++++++- src/core/CL/cl_kernels/nhwc/direct_convolution.cl | 42 +++++------ src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl | 24 +++--- .../cl_kernels/nhwc/dwc_native_quantized_nhwc.cl | 21 +++--- .../CL/cl_kernels/nhwc/indirect_convolution.cl | 40 +++++----- src/core/CL/cl_kernels/nhwc/scale.cl | 64 ++++++++-------- .../CL/cl_kernels/nhwc/transposed_convolution.cl | 14 ++-- src/core/CL/cl_kernels/tile_helpers.h | 86 +++++++++++++++++++--- .../CLDepthwiseConvolutionLayerNativeKernel.cpp | 9 ++- .../runtime/gpu/cl/ClKernelRuntime.cpp | 8 +- src/gpu/cl/kernels/ClDirectConv2dKernel.cpp | 67 ++++++++++++++--- src/gpu/cl/kernels/ClDirectConv2dKernel.h | 6 +- .../kernels/ClGemmMatrixMultiplyReshapedKernel.cpp | 4 +- .../ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp | 4 +- ...GemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp | 4 +- src/gpu/cl/kernels/ClIndirectConv2dKernel.cpp | 4 +- 19 files changed, 312 insertions(+), 147 deletions(-) diff --git a/arm_compute/core/KernelDescriptors.h b/arm_compute/core/KernelDescriptors.h index 4a64032b14..19ac254c04 100644 --- a/arm_compute/core/KernelDescriptors.h +++ b/arm_compute/core/KernelDescriptors.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022 Arm Limited. + * Copyright (c) 2019-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -117,6 +117,8 @@ struct DirectConvComputeKernelInfo int32_t n0{ 1 }; /**< Number of columns to be processed by the kernel */ int32_t k0{ 1 }; /**< Number of partial accumulations to be processed in a single iteration by the kernel */ bool export_weights_to_cl_image{ false }; /**< Flag to export the weights to cl_image */ + bool export_output_to_cl_image{ false }; /**< Flag to export the output to cl_image */ + bool export_input_to_cl_image{ false }; /**< Flag to export the input to cl_image */ }; /** Descriptor used by the softmax kernels */ diff --git a/src/core/CL/CLUtils.cpp b/src/core/CL/CLUtils.cpp index 8f39c2d700..84cf88e099 100644 --- a/src/core/CL/CLUtils.cpp +++ b/src/core/CL/CLUtils.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022 Arm Limited. + * Copyright (c) 2020-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -32,7 +32,7 @@ namespace arm_compute { -cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer &buffer, const TensorShape &shape2d, DataType data_type, size_t image_row_pitch) +cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer &buffer, const TensorShape &shape2d, DataType data_type, size_t image_row_pitch, CLImage2DType type) { cl_channel_type cl_data_type; @@ -61,7 +61,17 @@ cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer desc.image_width = shape2d[0]; desc.image_height = shape2d[1]; - cl_image = clCreateImage(ctx(), CL_MEM_READ_ONLY, &format, &desc, nullptr, &err); + switch(type) + { + case CLImage2DType::ReadOnly: + cl_image = clCreateImage(ctx(), CL_MEM_READ_ONLY, &format, &desc, nullptr, &err); + break; + case CLImage2DType::WriteOnly: + cl_image = clCreateImage(ctx(), CL_MEM_WRITE_ONLY, &format, &desc, nullptr, &err); + break; + default: + ARM_COMPUTE_ERROR("Unsupported CLImage2DType"); + } ARM_COMPUTE_UNUSED(err); ARM_COMPUTE_ERROR_ON_MSG(err != CL_SUCCESS, "Error during the creation of CL image from buffer"); @@ -176,4 +186,4 @@ void PostOpCLKernelUtils::set_post_ops_cl_kernel_name(std::string &kernel_name, } } // namespace experimental -} // namespace arm_compute \ No newline at end of file +} // namespace arm_compute diff --git a/src/core/CL/CLUtils.h b/src/core/CL/CLUtils.h index d133e4fe6f..b31944c72f 100644 --- a/src/core/CL/CLUtils.h +++ b/src/core/CL/CLUtils.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021 Arm Limited. + * Copyright (c) 2020-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -34,6 +34,13 @@ class TensorShape; class CLBuildOptions; class ITensorInfo; +/** OpenCL Image2D types */ +enum class CLImage2DType +{ + ReadOnly, + WriteOnly +}; + /** Create a cl::Image2D object from an OpenCL buffer * * @note The following conditions are required to create a OpenCL image object from OpenCL buffer, @@ -49,10 +56,11 @@ class ITensorInfo; * @param[in] shape2d 2D tensor shape * @param[in] data_type DataType to use. Only supported: F32,F16 * @param[in] image_row_pitch Image row pitch (a.k.a. stride Y) to be used in the image2d object + * @param[in] image_type Image 2D type (@ref CLImage2DType) * * @return cl::Image2D object */ -cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer &buffer, const TensorShape &shape2d, DataType data_type, size_t image_row_pitch); +cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer &buffer, const TensorShape &shape2d, DataType data_type, size_t image_row_pitch, CLImage2DType image_type); namespace experimental { diff --git a/src/core/CL/cl_kernels/helpers.h b/src/core/CL/cl_kernels/helpers.h index 298edc244f..c5d94ccb04 100644 --- a/src/core/CL/cl_kernels/helpers.h +++ b/src/core/CL/cl_kernels/helpers.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2022 Arm Limited. + * Copyright (c) 2016-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -438,6 +438,16 @@ #define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord))); #endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) +#define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values)); +#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567)); +#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF)); + +#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) +#define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values)); +#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567)); +#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF)); +#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) + /** Utility macro to read a 2D OpenCL image object. * * @note Coordinates are not normalized @@ -454,6 +464,22 @@ #define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord) #define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) +/** Utility macro to write a 2D OpenCL image object. + * + * @note Coordinates are not normalized + * + * @param[in] data_type Data type + * @param[in] n0 Number of pixel to write. Only 1,2 and 4 is supported + * @param[in] img OpenCL image object + * @param[in] x_coord The x coordinate for the top-left pixel + * @param[in] y_coord The y coordinate for the top-left pixel + * @param[in] values Values to write + * + * @{ + */ +#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values) +#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) + #define VSTORE_STR(size) vstore##size #define VSTORE(size) VSTORE_STR(size) diff --git a/src/core/CL/cl_kernels/nhwc/direct_convolution.cl b/src/core/CL/cl_kernels/nhwc/direct_convolution.cl index 4693a1fbcd..81ceeb8846 100644 --- a/src/core/CL/cl_kernels/nhwc/direct_convolution.cl +++ b/src/core/CL/cl_kernels/nhwc/direct_convolution.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -66,36 +66,36 @@ * - The weights offset e.g. -DWEI_OFFSET=4 * - The quantized zero value e.g. -DZERO_VALUE=4 * + * @param[in] src_img (Not supported) Read only cl_image object for the source tensor. Included when SRC_TENSOR_TYPE=IMAGE * @param[in] src_ptr Pointer to the source tensor. Supported data type: F16/F32/QASYMM8 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_c The size of the channels dimension of the source tensor + * @param[in] src_w The size of the width dimension of the source tensor + * @param[in] src_h The size of the height dimension of the source tensor + * @param[in] src_n The size of the batches dimension of the source tensor * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_img (Not supported) Write only cl_image object for the destination tensor. Included when DST_TENSOR_TYPE=IMAGE * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_c The size of the channels dimension of the destination tensor + * @param[in] dst_w The size of the width dimension of the destination tensor + * @param[in] dst_h The size of the height dimension of the destination tensor + * @param[in] dst_n The size of the batches dimension of the destination tensor * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] wei_img (Optional) Read only cl_image object for the weights tensor. Included when WEI_TENSOR_TYPE=IMAGE * @param[in] wei_ptr Pointer to the weights tensor. Supported data type: same as @p src_ptr - * @param[in] wei_stride_x Stride of the weights tensor in X dimension (in bytes) - * @param[in] wei_step_x wei_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] wei_stride_y Stride of the weights tensor in Y dimension (in bytes) - * @param[in] wei_step_y wei_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] wei_stride_z Stride of the weights tensor in Z dimension (in bytes) - * @param[in] wei_step_z wei_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] wei_stride_w Stride of the weights tensor in W dimension (in bytes) - * @param[in] wei_step_w wei_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] wei_offset_first_element_in_bytes The offset of the first element in the bias matrix + * @param[in] wei_c The size of the channels dimension of the weights tensor + * @param[in] wei_w The size of the width dimension of the weights tensor + * @param[in] wei_h The size of the height dimension of the weights tensor + * @param[in] wei_n The size of the batches dimension of the weights tensor + * @param[in] wei_offset_first_element_in_bytes The offset of the first element in the weights matrix * @param[in] bia_ptr (Optional) Pointer to the bias tensor Supported data type: same as @p src_ptr (if F32/F16) or S32 (if QASYMM8/QASYMM8_SIGNED) * @param[in] bia_stride_x (Optional) Stride of the bias tensor in X dimension (in bytes) * @param[in] bia_step_x (Optional) bia_stride_x * number of elements along X processed per workitem(in bytes) @@ -103,9 +103,9 @@ */ //! @endcond __kernel void direct_convolution_nhwc( - TENSOR4D_T(src, SRC_TENSOR_TYPE), - TENSOR4D_T(dst, DST_TENSOR_TYPE), - TENSOR4D_T(wei, WEI_TENSOR_TYPE) + TENSOR4D_RO_T(src, SRC_TENSOR_TYPE), + TENSOR4D_WO_T(dst, DST_TENSOR_TYPE), + TENSOR4D_RO_T(wei, WEI_TENSOR_TYPE) #if defined(HAS_BIAS) , VECTOR_DECLARATION(bia) @@ -292,4 +292,4 @@ __kernel void direct_convolution_nhwc( #undef _IDST_HEIGHT #undef _IDST_CHANNELS #undef _IY_MULTIPLIER -} \ No newline at end of file +} diff --git a/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl b/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl index 345469063a..dcbae220b6 100644 --- a/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl +++ b/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -54,6 +54,7 @@ * @note The number of rows to read from the src tensor must be passed at compile time using -DM0_A (e.g., -DM0_A=3). M0_A must be equal to WEI_WIDTH + (M0 - 1) * @note The number of columns to read from the src tensor must be passed at compile time using -DN0_A. It can either be 1 (for DEPTH_MULTIPLIER > 1) or N0 (for DEPTH_MULTIPLIER == 1) * + * @param[in] src_img (Not supported) Read only cl_image object for the source tensor. Included when SRC_TENSOR_TYPE=IMAGE * @param[in] src_ptr Pointer to the source tensor. Supported data type: F16/F32 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) @@ -63,6 +64,7 @@ * @param[in] src_h The size of the height dimension of the source tensor * @param[in] src_n The size of the batches dimension of the source tensor * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_img (Not supported) Write only cl_image object for the destination tensor. Included when DST_TENSOR_TYPE=IMAGE * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as @p src_ptr * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) @@ -72,16 +74,16 @@ * @param[in] dst_h The size of the height dimension of the destination tensor * @param[in] dst_n The size of the batches dimension of the destination tensor * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] wei_img (Optional) Read only cl_image object for the weights tensor. Included when WEI_TENSOR_TYPE=IMAGE * @param[in] wei_ptr Pointer to the weights tensor. Supported data type: same as @p src_ptr - * @param[in] wei_stride_x Stride of the weights tensor in X dimension (in bytes) - * @param[in] wei_step_x wei_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] wei_stride_y Stride of the weights tensor in Y dimension (in bytes) - * @param[in] wei_step_y wei_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] wei_stride_z Stride of the weights tensor in Z dimension (in bytes) - * @param[in] wei_step_z wei_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] wei_stride_w Stride of the weights tensor in W dimension (in bytes) - * @param[in] wei_step_w wei_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] wei_offset_first_element_in_bytes The offset of the first element in the bias matrix + * @param[in] wei_c The size of the channels dimension of the weights tensor + * @param[in] wei_w The size of the width dimension of the weights tensor + * @param[in] wei_h The size of the height dimension of the weights tensor + * @param[in] wei_n The size of the batches dimension of the weights tensor + * @param[in] wei_offset_first_element_in_bytes The offset of the first element in the weigts matrix * @param[in] bia_ptr (Optional) Pointer to the bias tensor Supported data type: same as @p src_ptr * @param[in] bia_stride_x (Optional) Stride of the bias tensor in X dimension (in bytes) * @param[in] bia_step_x (Optional) bia_stride_x * number of elements along X processed per workitem(in bytes) @@ -89,9 +91,9 @@ */ //! @endcond __kernel void dwc_native_fp_nhwc( - TENSOR4D_T(src, SRC_TENSOR_TYPE), - TENSOR4D_T(dst, DST_TENSOR_TYPE), - TENSOR4D(wei, WEI_TENSOR_TYPE) + TENSOR4D_RO_T(src, SRC_TENSOR_TYPE), + TENSOR4D_WO_T(dst, DST_TENSOR_TYPE), + TENSOR4D_RO_T(wei, WEI_TENSOR_TYPE) #if defined(HAS_BIAS) , VECTOR_DECLARATION(bia) @@ -206,4 +208,4 @@ __kernel void dwc_native_fp_nhwc( } #endif // defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) // *INDENT-ON* -// clang-format on \ No newline at end of file +// clang-format on diff --git a/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl b/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl index e2ffd444dd..2d255e5b61 100644 --- a/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl +++ b/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -77,6 +77,7 @@ * @note The number of rows to read from the src tensor must be passed at compile time using -DM0_A (e.g., -DM0_A=3). M0_A must be equal to WEI_WIDTH + (M0 - 1) * @note The number of columns to read from the src tensor must be passed at compile time using -DN0_A. It can either be 1 (for DEPTH_MULTIPLIER > 1) or N0 (for DEPTH_MULTIPLIER == 1) * + * @param[in] src_img (Not supported) Read only cl_image object for the source tensor. Included when SRC_TENSOR_TYPE=IMAGE * @param[in] src_ptr Pointer to the source tensor. Supported data type: QSYMM8/QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) @@ -86,6 +87,7 @@ * @param[in] src_h The size of the height dimension of the source tensor * @param[in] src_n The size of the batches dimension of the source tensor * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_img (Not supported) Write only cl_image object for the destination tensor. Included when DST_TENSOR_TYPE=IMAGE * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as @p src_ptr * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) @@ -95,14 +97,15 @@ * @param[in] dst_h The size of the height dimension of the destination tensor * @param[in] dst_n The size of the batches dimension of the destination tensor * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] wei_img (Not supported) Read only cl_image object for the weights tensor. Included when WEI_TENSOR_TYPE=IMAGE * @param[in] wei_ptr Pointer to the weights tensor. Supported data type: same as @p src_ptr - * @param[in] wei_stride_x Stride of the weights tensor in X dimension (in bytes) - * @param[in] wei_step_x wei_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] wei_stride_y Stride of the weights tensor in Y dimension (in bytes) - * @param[in] wei_step_y wei_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] wei_stride_z Stride of the weights tensor in Z dimension (in bytes) - * @param[in] wei_step_z wei_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] wei_stride_w Stride of the weights tensor in W dimension (in bytes) + * @param[in] wei_c The size of the channels dimension of the weights tensor + * @param[in] wei_w The size of the width dimension of the weights tensor + * @param[in] wei_h The size of the height dimension of the weights tensor + * @param[in] wei_n The size of the batches dimension of the weights tensor * @param[in] wei_step_w wei_stride_w * number of elements along W processed per workitem(in bytes) * @param[in] wei_offset_first_element_in_bytes The offset of the first element in the weights tensor * @param[in] dst_multipliers_ptr Pointer to the destination multipliers tensor for the per-channel quantization. Supported data type: S32 @@ -120,9 +123,9 @@ */ //! @endcond __kernel void dwc_native_quantized_nhwc( - TENSOR4D_T(src, SRC_TENSOR_TYPE), - TENSOR4D_T(dst, DST_TENSOR_TYPE), - TENSOR4D(wei, WEI_TENSOR_TYPE), + TENSOR4D_RO_T(src, SRC_TENSOR_TYPE), + TENSOR4D_WO_T(dst, DST_TENSOR_TYPE), + TENSOR4D_RO_T(wei, WEI_TENSOR_TYPE), VECTOR_DECLARATION(dst_multipliers), VECTOR_DECLARATION(dst_shifts) #if defined(HAS_BIAS) @@ -269,4 +272,4 @@ __kernel void dwc_native_quantized_nhwc( } #endif // defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) // *INDENT-ON* -// clang-format on \ No newline at end of file +// clang-format on diff --git a/src/core/CL/cl_kernels/nhwc/indirect_convolution.cl b/src/core/CL/cl_kernels/nhwc/indirect_convolution.cl index c88f0034c5..aa719bfef0 100644 --- a/src/core/CL/cl_kernels/nhwc/indirect_convolution.cl +++ b/src/core/CL/cl_kernels/nhwc/indirect_convolution.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -41,7 +41,7 @@ * @note The number of M0 rows (width*height) to process must be passed at compile time using -DM0 (e.g. -DM0=2) * - M0 = 1, 2, 3, 4, 5, 6, 7, and 8 * - * @param[out] dst_img (Not supported) CLImage object to the destination tensor (DST_TENSOR_TYPE=IMAGE only) + * @param[out] dst_img (Not supported) Write only cl_image object for the destination tensor. Included when DST_TENSOR_TYPE=IMAGE * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: INT32 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) @@ -54,7 +54,7 @@ */ //! @endcond __kernel void indirect_convolution_address_precalculation( - TENSOR4D_T(dst, DST_TENSOR_TYPE)) + TENSOR4D_WO_T(dst, DST_TENSOR_TYPE)) { const int x = get_global_id(0); const int y = get_global_id(1); @@ -123,7 +123,7 @@ __kernel void indirect_convolution_address_precalculation( * - N0 = 2, 3, 4, 8, 16 * - K0 = 2, 3, 4, 8, 16 (only 4, 8 and 16 if WEI_TENSOR_TYPE=IMAGE) * - * @param[in] src_img (Not supported) CLImage object to the source tensor (SRC_TENSOR_TYPE=IMAGE only) + * @param[in] src_img (Not supported) Read only cl_image object for the source tensor. Included when SRC_TENSOR_TYPE=IMAGE * @param[in] src_ptr Pointer to the source tensor. Supported data type: F16/F32 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) @@ -133,7 +133,7 @@ __kernel void indirect_convolution_address_precalculation( * @param[in] src_h The size of the height dimension of the source tensor * @param[in] src_n The size of the batches dimension of the source tensor * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] off_img (Not supported) CLImage object to the indirect buffer tensor (OFF_TENSOR_TYPE=IMAGE only) + * @param[in] off_img (Not supported) Read only cl_image object for the indirect buffer tensor. Included when OFF_TENSOR_TYPE=IMAGE * @param[in] off_ptr Pointer to the indirect buffer tensor. Supported data type: INT32 * @param[in] off_stride_y Stride of the indirect buffer tensor in Y dimension (in bytes) * @param[in] off_stride_z Stride of the indirect buffer tensor in Z dimension (in bytes) @@ -143,8 +143,8 @@ __kernel void indirect_convolution_address_precalculation( * @param[in] off_h The size of the height dimension of the indirect buffer tensor * @param[in] off_n The size of the batches dimension of the indirect buffer tensor * @param[in] off_offset_first_element_in_bytes The offset of the first element in the indirect buffer tensor - * @param[out] dst_img (Not supported) CLImage object to the destination tensor (DST_TENSOR_TYPE=IMAGE only) - * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as the input tensor + * @param[out] dst_img (Not supported) Write only cl_image object for the destination tensor. Included when DST_TENSOR_TYPE=IMAGE + * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as @p src_ptr * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) @@ -153,8 +153,8 @@ __kernel void indirect_convolution_address_precalculation( * @param[in] dst_h The size of the height dimension of the destination tensor * @param[in] dst_n The size of the batches dimension of the destination tensor * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[out] wei_img (Optional) CLImage object to the destination tensor (WEI_TENSOR_TYPE=IMAGE only) - * @param[out] wei_ptr Pointer to the weights tensor. Supported data type: same as the input tensor + * @param[out] wei_img (Optional) Read only cl_image object for the weights tensor. Included when WEI_TENSOR_TYPE=IMAGE + * @param[out] wei_ptr Pointer to the weights tensor. Supported data type: same as @p src_ptr * @param[in] wei_stride_y Stride of the weights tensor in Y dimension (in bytes) * @param[in] wei_stride_z Stride of the weights tensor in Z dimension (in bytes) * @param[in] wei_stride_w Stride of the weights tensor in W dimension (in bytes) @@ -163,23 +163,17 @@ __kernel void indirect_convolution_address_precalculation( * @param[in] wei_h The size of the height dimension of the weights tensor * @param[in] wei_n The size of the batches dimension of the weights tensor * @param[in] wei_offset_first_element_in_bytes The offset of the first element in the weights tensor - * @param[out] bia_img (Not supported) CLImage object to the destination tensor (BIA_TENSOR_TYPE=IMAGE only) - * @param[out] bia_ptr (Optional) Pointer to the bias tensor. Supported data type: same as the input tensor - * @param[in] bia_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes) - * @param[in] bia_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes) - * @param[in] bia_stride_w (Optional) Stride of the bias tensor in W dimension (in bytes) - * @param[in] bia_c (Optional) The size of the channels dimension of the bias tensor - * @param[in] bia_w (Optional) The size of the width dimension of the bias tensor - * @param[in] bia_h (Optional) The size of the height dimension of the bias tensor - * @param[in] bia_n (Optional) The size of the batches dimension of the bias tensor - * @param[in] bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor + * @param[in] bia_ptr (Optional) Pointer to the bias tensor Supported data type: same as @p src_ptr + * @param[in] bia_stride_x (Optional) Stride of the bias tensor in X dimension (in bytes) + * @param[in] bia_step_x (Optional) bia_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix */ //! @endcond __kernel void indirect_convolution_nhwc( - TENSOR4D_T(src, SRC_TENSOR_TYPE), - TENSOR4D_T(off, OFF_TENSOR_TYPE), - TENSOR4D_T(dst, DST_TENSOR_TYPE), - TENSOR4D_T(wei, WEI_TENSOR_TYPE) + TENSOR4D_RO_T(src, SRC_TENSOR_TYPE), + TENSOR4D_RO_T(off, OFF_TENSOR_TYPE), + TENSOR4D_WO_T(dst, DST_TENSOR_TYPE), + TENSOR4D_RO_T(wei, WEI_TENSOR_TYPE) #if defined(HAS_BIAS) , VECTOR_DECLARATION(bia) diff --git a/src/core/CL/cl_kernels/nhwc/scale.cl b/src/core/CL/cl_kernels/nhwc/scale.cl index f6a3e0971b..e071b0f192 100644 --- a/src/core/CL/cl_kernels/nhwc/scale.cl +++ b/src/core/CL/cl_kernels/nhwc/scale.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2022 Arm Limited. + * Copyright (c) 2016-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -61,8 +61,8 @@ */ //! @endcond __kernel void scale_nearest_neighbour_nhwc( - TENSOR4D_T(src, SRC_TENSOR_TYPE), - TENSOR4D_T(dst, DST_TENSOR_TYPE), + TENSOR4D_RO_T(src, SRC_TENSOR_TYPE), + TENSOR4D_WO_T(dst, DST_TENSOR_TYPE), const float scale_x, const float scale_y) { @@ -128,31 +128,33 @@ __kernel void scale_nearest_neighbour_nhwc( * - The source offset e.g. -DOFFSET=4 * - The source scale e.g. -DSCALE=4 * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S16/F16/F32. - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_c The size of the channels dimension of the source tensor - * @param[in] src_w The size of the width dimension of the source tensor - * @param[in] src_h The size of the height dimension of the source tensor - * @param[in] src_n The size of the batches dimension of the source tensor - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: U8/S16/F16/F32. - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] dst_c The size of the channels dimension of the destination tensor - * @param[in] dst_w The size of the width dimension of the destination tensor - * @param[in] dst_h The size of the height dimension of the destination tensor - * @param[in] dst_n The size of the batches dimension of the destination tensor - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] scale_x The scale value to apply on the source width - * @param[in] scale_y The scale value to apply on the source height + * @param[in] src_img (Not supported) Read only cl_image object for the source tensor. Included when SRC_TENSOR_TYPE=IMAGE + * @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S16/F16/F32. + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_c The size of the channels dimension of the source tensor + * @param[in] src_w The size of the width dimension of the source tensor + * @param[in] src_h The size of the height dimension of the source tensor + * @param[in] src_n The size of the batches dimension of the source tensor + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_img (Not supported) Write only cl_image object for the destination tensor. Included when DST_TENSOR_TYPE=IMAGE + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: U8/S16/F16/F32. + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] dst_c The size of the channels dimension of the destination tensor + * @param[in] dst_w The size of the width dimension of the destination tensor + * @param[in] dst_h The size of the height dimension of the destination tensor + * @param[in] dst_n The size of the batches dimension of the destination tensor + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] scale_x The scale value to apply on the source width + * @param[in] scale_y The scale value to apply on the source height */ //! @endcond __kernel void scale_bilinear_nhwc( - TENSOR4D_T(src, SRC_TENSOR_TYPE), - TENSOR4D_T(dst, DST_TENSOR_TYPE), + TENSOR4D_RO_T(src, SRC_TENSOR_TYPE), + TENSOR4D_WO_T(dst, DST_TENSOR_TYPE), const float scale_x, const float scale_y) { @@ -224,11 +226,11 @@ __kernel void scale_bilinear_nhwc( const float a1 = (yi_f - (float)yi); const float b1 = (1.f - a1); - out[0].v = CONVERT_SAT((CONVERT(in00[0].v, VEC_DATA_TYPE(float, N0)) * b * b1) + - (CONVERT(in01[0].v, VEC_DATA_TYPE(float, N0)) * a * b1) + - (CONVERT(in10[0].v, VEC_DATA_TYPE(float, N0)) * b * a1) + - (CONVERT(in11[0].v, VEC_DATA_TYPE(float, N0)) * a * a1), - VEC_DATA_TYPE(DST_DATA_TYPE, N0)); + out[0].v = CONVERT_SAT((CONVERT(in00[0].v, VEC_DATA_TYPE(float, N0)) * b * b1) + + (CONVERT(in01[0].v, VEC_DATA_TYPE(float, N0)) * a * b1) + + (CONVERT(in10[0].v, VEC_DATA_TYPE(float, N0)) * b * a1) + + (CONVERT(in11[0].v, VEC_DATA_TYPE(float, N0)) * a * a1), + VEC_DATA_TYPE(DST_DATA_TYPE, N0)); #endif // defined(IS_FLOATING_POINT) TILE(uint, 1, 1, dst_indirect_y); @@ -240,4 +242,4 @@ __kernel void scale_bilinear_nhwc( T_STORE_INDIRECT_WIDTH_SELECT(DST_DATA_TYPE, 1, N0, PARTIAL_N0, DST_TENSOR_TYPE, dst, cout, dst_stride_y, x_cond, out, dst_indirect_y); } -#endif /* SCALE_BILINEAR */ \ No newline at end of file +#endif /* SCALE_BILINEAR */ diff --git a/src/core/CL/cl_kernels/nhwc/transposed_convolution.cl b/src/core/CL/cl_kernels/nhwc/transposed_convolution.cl index fe6182fc95..1393537283 100644 --- a/src/core/CL/cl_kernels/nhwc/transposed_convolution.cl +++ b/src/core/CL/cl_kernels/nhwc/transposed_convolution.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -64,7 +64,7 @@ * - The weights offset e.g. -DWEI_OFFSET=4 * - The quantized zero value e.g. -DZERO_VALUE=4 * - * + * @param[in] src_img (Not supported) Read only cl_image object for the source tensor. Included when SRC_TENSOR_TYPE=IMAGE * @param[in] src_ptr Pointer to the source tensor. Supported data type: F16/F32 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) @@ -73,6 +73,7 @@ * @param[in] src_w The size of the width dimension of the source tensor * @param[in] src_h The size of the height dimension of the source tensor * @param[in] src_n The size of the batches dimension of the source tensor + * @param[out] dst_img (Not supported) Write only cl_image object for the destination tensor. Included when DST_TENSOR_TYPE=IMAGE * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as @p src_ptr * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) @@ -82,6 +83,7 @@ * @param[in] dst_h The size of the height dimension of the destination tensor * @param[in] dst_n The size of the batches dimension of the destination tensor * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] wei_img (Not supported) Read only cl_image object for the weights tensor. Included when WEI_TENSOR_TYPE=IMAGE * @param[in] wei_ptr Pointer to the weights tensor. Supported data type: same as @p src_ptr * @param[in] wei_stride_y Stride of the weights tensor in Y dimension (in bytes) * @param[in] wei_stride_z Stride of the weights tensor in Z dimension (in bytes) @@ -98,9 +100,9 @@ */ //! @endcond __kernel void transposed_convolution_nhwc( - TENSOR4D_T(src, SRC_TENSOR_TYPE), - TENSOR4D_T(dst, DST_TENSOR_TYPE), - TENSOR4D_T(wei, WEI_TENSOR_TYPE) + TENSOR4D_RO_T(src, SRC_TENSOR_TYPE), + TENSOR4D_WO_T(dst, DST_TENSOR_TYPE), + TENSOR4D_RO_T(wei, WEI_TENSOR_TYPE) #if defined(HAS_BIAS) , VECTOR_DECLARATION(bia) @@ -292,4 +294,4 @@ __kernel void transposed_convolution_nhwc( #undef _IDST_HEIGHT #undef _IDST_CHANNELS #undef _IY_MULTIPLIER -} \ No newline at end of file +} diff --git a/src/core/CL/cl_kernels/tile_helpers.h b/src/core/CL/cl_kernels/tile_helpers.h index acc174d04f..507e172dfb 100644 --- a/src/core/CL/cl_kernels/tile_helpers.h +++ b/src/core/CL/cl_kernels/tile_helpers.h @@ -130,8 +130,44 @@ uint name##_offset_first_element_in_bytes #define TENSOR4D_T_STR(name, type) TENSOR4D_T_##type(name) + +/** Legacy tensor 4D arguments + * + * @param[in] name Tensor name. The tensor name is the prefix of the tensor components + * @param[in] type Tensor type (BUFFER or IMAGE) + */ #define TENSOR4D_T(name, type) TENSOR4D_T_STR(name, type) +#define TENSOR4D_RO_T_IMAGE(name) \ + __read_only image2d_t name##_img, \ + TENSOR4D_T_BUFFER(name) + +#define TENSOR4D_RO_T_BUFFER(name) TENSOR4D_T_BUFFER(name) + +#define TENSOR4D_RO_T_STR(name, type) TENSOR4D_RO_T_##type(name) + +/** Read-Only (RO) tensor 4D. + * + * @param[in] name Tensor name. The tensor name is the prefix of the tensor components + * @param[in] type Tensor type (BUFFER or IMAGE) + */ +#define TENSOR4D_RO_T(name, type) TENSOR4D_RO_T_STR(name, type) + +#define TENSOR4D_WO_T_IMAGE(name) \ + __write_only image2d_t name##_img, \ + TENSOR4D_T_BUFFER(name) + +#define TENSOR4D_WO_T_BUFFER(name) TENSOR4D_T_BUFFER(name) + +#define TENSOR4D_WO_T_STR(name, type) TENSOR4D_WO_T_##type(name) + +/** Write-Only (WO) tensor 4D. + * + * @param[in] name Tensor name. The tensor name is the prefix of the tensor components + * @param[in] type Tensor type (BUFFER or IMAGE) + */ +#define TENSOR4D_WO_T(name, type) TENSOR4D_WO_T_STR(name, type) + #define TENSOR3D_T_IMAGE(name) \ __read_only image2d_t name##_img, \ __global uchar *name##_ptr, \ @@ -457,6 +493,25 @@ (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (Y) * (STRIDE_Y))) #define V_LOAD_IMAGE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y) READ_IMAGE2D(DATA_TYPE, CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(WIDTH), TENSOR##_img, (X) / 4, (Y)) +/** Store a vector in global memory (tensor) + * + * @param[in] DATA_TYPE Data type + * @param[in] WIDTH Number of dst columns + * @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). + * In case of cl_image, only WIDTH multiples of 4 are supported (4, 8, 16) + * @param[in] TENSOR Tensor basename + * @param[in] X Starting X position + * @param[in] Y Starting Y position + * @param[in] STRIDE_Y Stride Y (in bytes) + * @param[in] VALUES Values to store in memory + */ +#define V_STORE(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES) V_STORE_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES) +#define V_STORE_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, VALUES) V_STORE_##TENSOR_TYPE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES) +#define V_STORE_BUFFER(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES) \ + VSTORE(WIDTH) \ + (VALUES, 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (Y) * (STRIDE_Y))) +#define V_STORE_IMAGE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y, VALUES) WRITE_IMAGE2D(DATA_TYPE, CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(WIDTH), TENSOR##_img, (X) / 4, (Y), VALUES) + /** Load a tile from global memory (tensor) * * @param[in] DATA_TYPE Data type @@ -658,7 +713,8 @@ * @param[in] DATA_TYPE Data type * @param[in] TILE_AREA Number of elements to load from Y (height) dimension * Number of elements to load from X (width) dimension * @param[in] TILE_CHANNELS Number of elements to load from C (channel) dimension - * @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). Currently BUFFER only is supported + * @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). + * When TENSOR_TYPE=IMAGE, the if condition for the out-of-bound check can be skipped * In case of cl_image, only TILE_CHANNELS multiples of 4 are supported (4, 8, 16) * @param[in] TENSOR Tensor basename * @param[in] C Starting C index @@ -667,15 +723,25 @@ * 16 is the maximum indirect buffer size. * @param[out] dst Output tile */ -#define T_LOAD2D_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) \ - ({ \ - LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \ - { \ - if(yi[0].s[_i] >= 0) \ - { \ - dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[0].s[_i], STRIDE_Y); \ - } \ - }) \ +#define T_LOAD2D_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) T_LOAD2D_INDIRECT_STR(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) +#define T_LOAD2D_INDIRECT_STR(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) T_LOAD2D_INDIRECT_##TENSOR_TYPE(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) +#define T_LOAD2D_INDIRECT_BUFFER(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) \ + ({ \ + LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \ + { \ + if(yi[0].s[_i] >= 0) \ + { \ + dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[0].s[_i], STRIDE_Y); \ + } \ + }) \ + }) + +#define T_LOAD2D_INDIRECT_IMAGE(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, STRIDE_Y, yi, dst) \ + ({ \ + LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \ + { \ + dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[0].s[_i], STRIDE_Y); \ + }) \ }) /** Load a tile from global memory (tensor) when the tensor is stored using a NDHWC layout using indirect X, Y and Z coordinates diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp index cded31936c..2d21a6eff0 100644 --- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp +++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022 Arm Limited. + * Copyright (c) 2019-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -377,7 +377,7 @@ void CLDepthwiseConvolutionLayerNativeKernel::run(const Window &window, cl::Comm const size_t image_h = _input->info()->dimension(1) * _input->info()->dimension(2) * _input->info()->dimension(3); const TensorShape shape2d(image_w, image_h); const size_t image_row_pitch = _input->info()->strides_in_bytes()[1]; - input_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), _input->cl_buffer(), shape2d, _input->info()->data_type(), image_row_pitch); + input_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), _input->cl_buffer(), shape2d, _input->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); } if(_export_weights_to_cl_image) @@ -386,7 +386,8 @@ void CLDepthwiseConvolutionLayerNativeKernel::run(const Window &window, cl::Comm const size_t image_h = _weights->info()->dimension(1) * _weights->info()->dimension(2) * _weights->info()->dimension(3); const TensorShape shape2d(image_w, image_h); const size_t image_row_pitch = _weights->info()->strides_in_bytes()[1]; - weights_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), _weights->cl_buffer(), shape2d, _weights->info()->data_type(), image_row_pitch); + weights_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), _weights->cl_buffer(), shape2d, _weights->info()->data_type(), image_row_pitch, + CLImage2DType::ReadOnly); } } @@ -401,7 +402,7 @@ void CLDepthwiseConvolutionLayerNativeKernel::run(const Window &window, cl::Comm { _kernel.setArg(idx++, weights_cl_image); } - add_4D_tensor_argument(idx, _weights, slice); + add_4d_tensor_nhwc_argument(idx, _weights); if(_is_quantized) { add_1D_tensor_argument(idx, _output_multipliers, slice); diff --git a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp index 93fbdfed63..022d4685fe 100644 --- a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp +++ b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -89,7 +89,7 @@ inline void ClKernelRuntime::add_tensor_argument(unsigned int &idx, const GpuKer { const TensorShape shape2d(tensor->info()->dimension(0) / 4, tensor->info()->dimension(1) * tensor->info()->dimension(2) * tensor->info()->dimension(3)); const size_t image_row_pitch = tensor->info()->strides_in_bytes()[1]; - cl::Image2D tensor_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), shape2d, tensor->info()->data_type(), image_row_pitch); + cl::Image2D tensor_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), shape2d, tensor->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); cl_images.push_back(tensor_image2d); _kernel.setArg(idx++, tensor_image2d); break; @@ -105,7 +105,7 @@ inline void ClKernelRuntime::add_tensor_argument(unsigned int &idx, const GpuKer { const TensorShape shape2d(tensor->info()->dimension(0) / 4, tensor->info()->dimension(1) * tensor->info()->dimension(2) * tensor->info()->dimension(3)); const size_t image_row_pitch = tensor->info()->strides_in_bytes()[1]; - cl::Image2D tensor_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), shape2d, tensor->info()->data_type(), image_row_pitch); + cl::Image2D tensor_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), shape2d, tensor->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); cl_images.push_back(tensor_image2d); _kernel.setArg(idx++, tensor_image2d); _kernel.setArg(idx++, static_cast(tensor->info()->strides_in_bytes()[2])); @@ -135,7 +135,7 @@ inline void ClKernelRuntime::add_tensor_argument(unsigned int &idx, const GpuKer const size_t image_stride_y = tensor->info()->strides_in_bytes()[1]; cl::Image2D tensor_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), - TensorShape(image_w, image_h), tensor->info()->data_type(), image_stride_y); + TensorShape(image_w, image_h), tensor->info()->data_type(), image_stride_y, CLImage2DType::ReadOnly); cl_images.push_back(tensor_image2d); _kernel.setArg(idx++, tensor_image2d); diff --git a/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp b/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp index d9271e24d9..b66163c805 100644 --- a/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp +++ b/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2022 Arm Limited. + * Copyright (c) 2017-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -64,6 +64,9 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != src->dimension(channel_idx), "Weights feature map dimension should match the respective src's one"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.export_input_to_cl_image == true, "Export to CLImage is not supported for the input tensor"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.export_output_to_cl_image == true, "Export to CLImage is not supported for the output tensor"); + if(data_layout == DataLayout::NCHW) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != weights->dimension(height_idx), "Weights should have same width and height"); @@ -210,14 +213,26 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT const unsigned int pad_left = conv_info.pad_left(); const unsigned int pad_top = conv_info.pad_top(); - _export_to_cl_image = desc.export_weights_to_cl_image; + _export_weights_to_cl_image = desc.export_weights_to_cl_image; + _export_input_to_cl_image = desc.export_input_to_cl_image; + _export_output_to_cl_image = desc.export_output_to_cl_image; // Update the padding for the weights tensor if we can export to cl_image - if(_export_to_cl_image) + if(_export_weights_to_cl_image) { gemm::update_padding_for_cl_image(weights); } + if(_export_output_to_cl_image) + { + gemm::update_padding_for_cl_image(dst); + } + + if(_export_input_to_cl_image) + { + gemm::update_padding_for_cl_image(src); + } + if(biases != nullptr) { build_options.add_option(std::string("-DHAS_BIAS")); @@ -241,7 +256,7 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT build_options.add_option("-cl-fast-relaxed-math"); } - build_options.add_option("-DSRC_TENSOR_TYPE=BUFFER"); + build_options.add_option_if_else(_export_input_to_cl_image, "-DSRC_TENSOR_TYPE=IMAGE", "-DSRC_TENSOR_TYPE=BUFFER"); build_options.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); build_options.add_option("-DSRC_CHANNELS=" + support::cpp11::to_string(src->dimension(0))); build_options.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(1))); @@ -249,9 +264,9 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT build_options.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst->dimension(0))); build_options.add_option("-DDST_WIDTH=" + support::cpp11::to_string(dst->dimension(1))); build_options.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(2))); - build_options.add_option("-DDST_TENSOR_TYPE=BUFFER"); + build_options.add_option_if_else(_export_output_to_cl_image, "-DDST_TENSOR_TYPE=IMAGE", "-DDST_TENSOR_TYPE=BUFFER"); build_options.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(dst_data_type)); - build_options.add_option_if_else(_export_to_cl_image, "-DWEI_TENSOR_TYPE=IMAGE", "-DWEI_TENSOR_TYPE=BUFFER"); + build_options.add_option_if_else(_export_weights_to_cl_image, "-DWEI_TENSOR_TYPE=IMAGE", "-DWEI_TENSOR_TYPE=BUFFER"); build_options.add_option("-DWEI_WIDTH=" + support::cpp11::to_string(weights->dimension(width_idx))); build_options.add_option("-DWEI_HEIGHT=" + support::cpp11::to_string(weights->dimension(height_idx))); build_options.add_option("-DWEI_DATA_TYPE=" + get_cl_type_from_data_type(weights->data_type())); @@ -307,7 +322,7 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT } else { - _export_to_cl_image = false; + _export_weights_to_cl_image = false; kernel_name << "direct_convolution_nchw"; build_options.add_option_if(biases != nullptr, std::string("-DHAS_BIAS")); @@ -399,8 +414,10 @@ void ClDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, cl if(_data_layout == DataLayout::NHWC) { cl::Image2D weights_cl_image; + cl::Image2D output_cl_image; + cl::Image2D input_cl_image; - if(_export_to_cl_image) + if(_export_weights_to_cl_image) { const size_t image_w = weights->info()->dimension(0) / 4; const size_t image_h = weights->info()->dimension(1) * weights->info()->dimension(2) * weights->info()->dimension(3); @@ -408,13 +425,43 @@ void ClDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, cl const size_t image_row_pitch = weights->info()->strides_in_bytes()[1]; // Export cl_buffer to cl_image - weights_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), weights->cl_buffer(), shape2d, weights->info()->data_type(), image_row_pitch); + weights_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), weights->cl_buffer(), shape2d, weights->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); + } + + if(_export_output_to_cl_image) + { + const size_t image_w = dst->info()->dimension(0) / 4; + const size_t image_h = dst->info()->dimension(1) * dst->info()->dimension(2) * dst->info()->dimension(3); + const TensorShape shape2d(image_w, image_h); + const size_t image_row_pitch = dst->info()->strides_in_bytes()[1]; + + // Export cl_buffer to cl_image + output_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), dst->cl_buffer(), shape2d, dst->info()->data_type(), image_row_pitch, CLImage2DType::WriteOnly); + } + + if(_export_input_to_cl_image) + { + const size_t image_w = src->info()->dimension(0) / 4; + const size_t image_h = src->info()->dimension(1) * src->info()->dimension(2) * src->info()->dimension(3); + const TensorShape shape2d(image_w, image_h); + const size_t image_row_pitch = src->info()->strides_in_bytes()[1]; + + // Export cl_buffer to cl_image + input_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), src->cl_buffer(), shape2d, src->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); } unsigned int idx = 0; + if(_export_input_to_cl_image) + { + _kernel.setArg(idx++, input_cl_image); + } add_4d_tensor_nhwc_argument(idx, src); + if(_export_output_to_cl_image) + { + _kernel.setArg(idx++, output_cl_image); + } add_4d_tensor_nhwc_argument(idx, dst); - if(_export_to_cl_image) + if(_export_weights_to_cl_image) { _kernel.setArg(idx++, weights_cl_image); } diff --git a/src/gpu/cl/kernels/ClDirectConv2dKernel.h b/src/gpu/cl/kernels/ClDirectConv2dKernel.h index 0cb8aebbe1..25171a0536 100644 --- a/src/gpu/cl/kernels/ClDirectConv2dKernel.h +++ b/src/gpu/cl/kernels/ClDirectConv2dKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2022 Arm Limited. + * Copyright (c) 2017-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -84,7 +84,9 @@ public: public: DataLayout _data_layout{}; PadStrideInfo _conv_info{}; - bool _export_to_cl_image{ false }; + bool _export_weights_to_cl_image{ false }; + bool _export_output_to_cl_image{ false }; + bool _export_input_to_cl_image{ false }; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp index 6a450b652b..f74a5d87af 100644 --- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -360,7 +360,7 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind const TensorShape shape2d(src1->info()->dimension(0) / 4, src1->info()->dimension(1) * src1->info()->dimension(2)); const size_t image_row_pitch = src1->info()->strides_in_bytes()[1]; - src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d, src1->info()->data_type(), image_row_pitch); + src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d, src1->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); } do diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp index a8bcf8d6a1..efd0a95eab 100644 --- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022 Arm Limited. + * Copyright (c) 2019-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -375,7 +375,7 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, con const TensorShape shape2d(src1->info()->dimension(0) / 4, src1->info()->dimension(1) * src1->info()->dimension(2)); const size_t image_row_pitch = src1->info()->strides_in_bytes()[1]; - src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d, src1->info()->data_type(), image_row_pitch); + src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d, src1->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); } do diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp index fe46913517..f252afb06a 100644 --- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -325,7 +325,7 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack &tensors, const TensorShape shape2d(src1->info()->dimension(0) / 4, src1->info()->dimension(1) * src1->info()->dimension(2)); const size_t image_row_pitch = src1->info()->strides_in_bytes()[1]; - src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d, src1->info()->data_type(), image_row_pitch); + src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d, src1->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); } Window slice = window.first_slice_window_3D(); diff --git a/src/gpu/cl/kernels/ClIndirectConv2dKernel.cpp b/src/gpu/cl/kernels/ClIndirectConv2dKernel.cpp index 3448377cb5..b8915cc406 100644 --- a/src/gpu/cl/kernels/ClIndirectConv2dKernel.cpp +++ b/src/gpu/cl/kernels/ClIndirectConv2dKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -282,7 +282,7 @@ void ClIndirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, const size_t image_row_pitch = weights->info()->strides_in_bytes()[1]; // Export cl_buffer to cl_image - weights_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), weights->cl_buffer(), shape2d, weights->info()->data_type(), image_row_pitch); + weights_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), weights->cl_buffer(), shape2d, weights->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); } unsigned int idx = 0; -- cgit v1.2.1