From e3a849af3d9e108704c6ce162f377398300d990d Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Wed, 10 Jun 2020 17:59:30 +0100 Subject: COMPMID-3320: Add cl_image support for GEMMReshaped T_NT COMPMID-3321: Add cl_image support for GEMMReshaped NT_T - Added support for cl_image in CLGEMMMatrixMultiplyReshapedKernel (both NT and T kernels) - Extended the tests for the validating rhs_info.export_to_cl_image = true - Added utility macros in OpenCL to load data from a OpenCL image object - Updated doxygen documentation in CLGEMMMatrixMultiplyReshapedKernel.h - Updated doxygen documentation in CLGEMMReshapeRHSMatrixKernel.h Change-Id: I953b10e4ef205d1b76dcbc366e5a91fd5a8e1d5c Signed-off-by: Gian Marco Iodice Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3329 Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins Reviewed-by: Georgios Pinitas --- .../kernels/CLGEMMMatrixMultiplyReshapedKernel.h | 59 ++++++++++++++++++---- .../core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h | 45 ++++++++++++++--- arm_compute/core/Types.h | 4 +- 3 files changed, 89 insertions(+), 19 deletions(-) (limited to 'arm_compute') diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h index ee8e57fa8c..aeedd50e0b 100644 --- a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h +++ b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h @@ -55,19 +55,30 @@ public: * Mixed precision combines different floating precisions during the computation, in particular, F32 for the accumulations and F16 for the * multiplications. i.e. float c = (half)a * (half)b * - * @param[in] input0 Input tensor containing the LHS reshaped matrix. Data type supported: F16/F32. The number of dimensions for the LHS matrix must be less or equal than 4 + * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will fetch the RHS data using the OpenCL read_image built-in function. + * Reading from the OpenCL image object can increase the performance. However, since the OpenCL image object is created importing the OpenCL buffer, + * the following conditions are required: + * -# rhs_info.n0 can only be 4, 8 and 16 + * -# rhs_info.k0 can only be 4, 8 and 16 + * -# Data type can only be F32 + * -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension + * -# The stride Y for the input1 should satisfy the OpenCL pitch alignment requirement + * -# input1 width should be less or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4) + * -# input1 (height * depth) should be less or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT + * + * @param[in] input0 Input tensor containing the LHS reshaped matrix. Data type supported: F16/F32 (only F32 if rhs_info.export_to_cl_image = true). The number of dimensions for the LHS matrix must be less or equal than 4 * @param[in] input1 Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less or equal than 3 * @param[in] input2 Input tensor containing the bias matrix. Data type supported: same as @p input0. * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0 * @param[in] alpha Weight of the matrix product * @param[in] beta Weight of the matrix bias - * @param[in] lhs_info LHS matrix information used for reshaping the input0 tensor. Only the following values are supported: + * @param[in] lhs_info LHS matrix information used for reshaping the input0 tensor. Only the following values are supported: * lhs_info.m0: 2,3,4,5,6,7,8 * lhs_info.k0: 2,3,4,8,16 * lhs_info.transpose: false * @param[in] rhs_info RHS matrix information used for reshaping the input1 tensor. Only the following values are supported: - * rhs_info.n0: 2,3,4,8,16 - * rhs_info.k0: 2,3,4,8,16 + * rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true) + * rhs_info.k0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true) * rhs_info.transpose: true * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices * @@ -82,8 +93,19 @@ public: * Mixed precision combines different floating precisions during the computation, in particular, F32 for the accumulations and F16 for the * multiplications. i.e. float c = (half)a * (half)b * + * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will fetch the RHS data using the OpenCL read_image built-in function. + * Reading from the OpenCL image object can increase the performance. However, since the OpenCL image object is created importing the OpenCL buffer, + * the following conditions are required: + * -# rhs_info.n0 can only be 4, 8 and 16 + * -# rhs_info.k0 can only be 4, 8 and 16 + * -# Data type can only be F32 + * -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension + * -# The stride Y for the input1 should satisfy the OpenCL pitch alignment requirement + * -# input1 width should be less or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4) + * -# input1 (height * depth) should be less or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT + * * @param[in] compile_context The compile context to be used. - * @param[in] input0 Input tensor containing the LHS reshaped matrix. Data type supported: F16/F32. The number of dimensions for the LHS matrix must be less or equal than 4 + * @param[in] input0 Input tensor containing the LHS reshaped matrix. Data type supported: F16/F32 (only F32 if rhs_info.export_to_cl_image = true). The number of dimensions for the LHS matrix must be less or equal than 4 * @param[in] input1 Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less or equal than 3 * @param[in] input2 Input tensor containing the bias matrix. Data type supported: same as @p input0. * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0 @@ -94,8 +116,8 @@ public: * lhs_info.k0: 2,3,4,8,16 * lhs_info.transpose: false * @param[in] rhs_info RHS matrix information used for reshaping the input1 tensor. Only the following values are supported: - * rhs_info.n0: 2,3,4,8,16 - * rhs_info.k0: 2,3,4,8,16 + * rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true) + * rhs_info.k0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true) * rhs_info.transpose: true * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices * @@ -107,7 +129,22 @@ public: const GEMMKernelInfo &gemm_info); /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMMatrixMultiplyReshapedKernel * - * @param[in] input0 Input tensor containing the LHS reshaped matrix. Data type supported: F16/F32. The number of dimensions for the LHS matrix must be less or equal than 4 + * @note The F16 computation also supports mixed precision through the gemm_info.fp_mixed_precision flag. + * Mixed precision combines different floating precisions during the computation, in particular, F32 for the accumulations and F16 for the + * multiplications. i.e. float c = (half)a * (half)b + * + * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will fetch the RHS data using the OpenCL read_image built-in function. + * Reading from the OpenCL image object can increase the performance. However, since the OpenCL image object is created importing the OpenCL buffer, + * the following conditions are required: + * -# rhs_info.n0 can only be 4, 8 and 16 + * -# rhs_info.k0 can only be 4, 8 and 16 + * -# Data type can only be F32 + * -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension + * -# The stride Y for the input1 should satisfy the OpenCL pitch alignment requirement + * -# input1 width should be less or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4) + * -# input1 (height * depth) should be less or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT + * + * @param[in] input0 Input tensor containing the LHS reshaped matrix. Data type supported: F16/F32 (only F32 if rhs_info.export_to_cl_image = true). The number of dimensions for the LHS matrix must be less or equal than 4 * @param[in] input1 Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less or equal than 3 * @param[in] input2 Input tensor info containing the bias matrix. Data type supported: same as @p input0. * @param[in] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0 @@ -118,8 +155,8 @@ public: * lhs_info.k0: 2,3,4,8,16 * lhs_info.transpose: false * @param[in] rhs_info RHS matrix information used for reshaping the input1 tensor. Only the following values are supported: - * rhs_info.n0: 2,3,4,8,16 - * rhs_info.k0: 2,3,4,8,16 + * rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true) + * rhs_info.k0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true) * rhs_info.transpose: true * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices * @@ -141,10 +178,10 @@ private: ICLTensor *_output; bool _slide_matrix_b; bool _reinterpret_output_as_3d; - unsigned int _k; bool _use_dummy_work_items; bool _add_bias; bool _broadcast_bias; + bool _export_to_cl_image; }; } // namespace arm_compute #endif /*ARM_COMPUTE_CLGEMMMATRIXMULTIPLYRESHAPEDKERNEL_H*/ \ No newline at end of file diff --git a/arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h b/arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h index 0e6352bdbb..59a4aaa912 100644 --- a/arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h +++ b/arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h @@ -47,40 +47,73 @@ public: /** Allow instances of this class to be moved */ CLGEMMReshapeRHSMatrixKernel &operator=(CLGEMMReshapeRHSMatrixKernel &&) = default; /** Initialise the kernel's input and output. + * + * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will guarantee the OpenCL pitch alignment for the output tensor, + * required to create a OpenCL image object from buffer in @ref CLGEMMMatrixMultiplyReshapedKernel and in @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel + * Since the OpenCL image object is created importing the OpenCL buffer, the following conditions are required: + * -# rhs_info.n0 can only be 4, 8 and 16 + * -# rhs_info.k0 can only be 4, 8 and 16 + * -# Data type can only be F32 + * -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension + * -# output width should be less or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4) + * -# output (height * depth) should be less or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT + * -# The output tensor should be only consumed by @ref CLGEMMMatrixMultiplyReshapedKernel or @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel * * @param[in] input Input tensor. Data types supported: All * @param[out] output Output tensor. Data type supported: same as @p input * @param[in] rhs_info RHS matrix information to be used for reshaping. This object contains all the necessary * information to reshape the input tensor. Only the following values are supported: - * rhs_info.n0: 2,3,4,8,16 - * rhs_info.k0: 1,2,3,4,8,16 (k0 = 1 only if rhs_info.transpose = false) + * rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image == true) + * rhs_info.k0: 1,2,3,4,8,16 (k0 = 1 only if rhs_info.transpose = false), (only 4, 8 and 16 if rhs_info.export_to_cl_image == true) * rhs_info.h0: greater than 0 * rhs_info.transpose: true, false * rhs_info.interleave: true, false */ void configure(const ICLTensor *input, ICLTensor *output, const GEMMRHSMatrixInfo &rhs_info); /** Initialise the kernel's input and output. + * + * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will guarantee the OpenCL pitch alignment for the output tensor, + * required to create a OpenCL image object from buffer in @ref CLGEMMMatrixMultiplyReshapedKernel and in @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel + * Since the OpenCL image object is created importing the OpenCL buffer, the following conditions are required: + * -# rhs_info.n0 can only be 4, 8 and 16 + * -# rhs_info.k0 can only be 4, 8 and 16 + * -# Data type can only be F32 + * -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension + * -# output width should be less or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4) + * -# output (height * depth) should be less or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT + * -# The output tensor should be only consumed by @ref CLGEMMMatrixMultiplyReshapedKernel or @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel * * @param[in] compile_context The compile context to be used. * @param[in] input Input tensor. Data types supported: All * @param[out] output Output tensor. Data type supported: same as @p input * @param[in] rhs_info RHS matrix information to be used for reshaping. This object contains all the necessary * information to reshape the input tensor. Only the following values are supported: - * rhs_info.n0: 2,3,4,8,16 - * rhs_info.k0: 1,2,3,4,8,16 (k0 = 1 only if rhs_info.transpose = false) + * rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image == true) + * rhs_info.k0: 1,2,3,4,8,16 (k0 = 1 only if rhs_info.transpose = false), (only 4, 8 and 16 if rhs_info.export_to_cl_image == true) * rhs_info.h0: greater than 0 * rhs_info.transpose: true, false * rhs_info.interleave: true, false */ void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const GEMMRHSMatrixInfo &rhs_info); /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMReshapeRHSMatrixKernel + * + * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will guarantee the OpenCL pitch alignment for the output tensor, + * required to create a OpenCL image object from buffer in @ref CLGEMMMatrixMultiplyReshapedKernel and in @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel + * Since the OpenCL image object is created importing the OpenCL buffer, the following conditions are required: + * -# rhs_info.n0 can only be 4, 8 and 16 + * -# rhs_info.k0 can only be 4, 8 and 16 + * -# Data type can only be F32 + * -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension + * -# output width should be less or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4) + * -# output (height * depth) should be less or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT + * -# The output tensor should be only consumed by @ref CLGEMMMatrixMultiplyReshapedKernel or @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel * * @param[in] input Input tensor info. Data types supported: All * @param[in] output Output tensor info which stores the interleaved matrix. Data type supported: same as @p input. * @param[in] rhs_info RHS matrix information to be used for reshaping. This object contains all the necessary * information to reshape the input tensor. Only the following values are supported: - * rhs_info.n0: 2,3,4,8,16 - * rhs_info.k0: 1,2,3,4,8,16 (k0 = 1 only if rhs_info.transpose = false) + * rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image == true) + * rhs_info.k0: 1,2,3,4,8,16 (k0 = 1 only if rhs_info.transpose = false),(only 4, 8 and 16 if rhs_info.export_to_cl_image == true) * rhs_info.h0: greater than 0 * rhs_info.transpose: true, false * rhs_info.interleave: true, false diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h index 4e73edba4b..d151496537 100644 --- a/arm_compute/core/Types.h +++ b/arm_compute/core/Types.h @@ -1912,8 +1912,8 @@ struct GEMMLHSMatrixInfo struct GEMMRHSMatrixInfo { GEMMRHSMatrixInfo() = default; - GEMMRHSMatrixInfo(unsigned int n, unsigned int k, unsigned int h, bool trans, bool inter) - : n0(n), k0(k), h0(h), transpose(trans), interleave(inter) + GEMMRHSMatrixInfo(unsigned int n, unsigned int k, unsigned int h, bool trans, bool inter, bool export_to_cl_img) + : n0(n), k0(k), h0(h), transpose(trans), interleave(inter), export_to_cl_image(export_to_cl_img) { } unsigned int n0{ 1 }; /**< Number of columns processed by the matrix multiplication */ -- cgit v1.2.1