From e3a849af3d9e108704c6ce162f377398300d990d Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Wed, 10 Jun 2020 17:59:30 +0100 Subject: COMPMID-3320: Add cl_image support for GEMMReshaped T_NT COMPMID-3321: Add cl_image support for GEMMReshaped NT_T - Added support for cl_image in CLGEMMMatrixMultiplyReshapedKernel (both NT and T kernels) - Extended the tests for the validating rhs_info.export_to_cl_image = true - Added utility macros in OpenCL to load data from a OpenCL image object - Updated doxygen documentation in CLGEMMMatrixMultiplyReshapedKernel.h - Updated doxygen documentation in CLGEMMReshapeRHSMatrixKernel.h Change-Id: I953b10e4ef205d1b76dcbc366e5a91fd5a8e1d5c Signed-off-by: Gian Marco Iodice Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3329 Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins Reviewed-by: Georgios Pinitas --- .../kernels/CLGEMMMatrixMultiplyReshapedKernel.h | 59 +- .../core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h | 45 +- arm_compute/core/Types.h | 4 +- src/core/CL/CLKernelLibrary.cpp | 2 + src/core/CL/cl_kernels/gemm.cl | 641 ++++++++++++++++++++- src/core/CL/cl_kernels/gemm_helpers.h | 99 ++++ src/core/CL/cl_kernels/helpers.h | 43 ++ .../kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp | 100 +++- .../CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp | 10 +- tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp | 330 ++++++++++- tests/validation/CL/GEMMReshapeRHSMatrix.cpp | 2 +- tests/validation/fixtures/GEMMFixture.h | 29 +- 12 files changed, 1273 insertions(+), 91 deletions(-) diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h index ee8e57fa8c..aeedd50e0b 100644 --- a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h +++ b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h @@ -55,19 +55,30 @@ public: * Mixed precision combines different floating precisions during the computation, in particular, F32 for the accumulations and F16 for the * multiplications. i.e. float c = (half)a * (half)b * - * @param[in] input0 Input tensor containing the LHS reshaped matrix. Data type supported: F16/F32. The number of dimensions for the LHS matrix must be less or equal than 4 + * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will fetch the RHS data using the OpenCL read_image built-in function. + * Reading from the OpenCL image object can increase the performance. However, since the OpenCL image object is created importing the OpenCL buffer, + * the following conditions are required: + * -# rhs_info.n0 can only be 4, 8 and 16 + * -# rhs_info.k0 can only be 4, 8 and 16 + * -# Data type can only be F32 + * -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension + * -# The stride Y for the input1 should satisfy the OpenCL pitch alignment requirement + * -# input1 width should be less or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4) + * -# input1 (height * depth) should be less or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT + * + * @param[in] input0 Input tensor containing the LHS reshaped matrix. Data type supported: F16/F32 (only F32 if rhs_info.export_to_cl_image = true). The number of dimensions for the LHS matrix must be less or equal than 4 * @param[in] input1 Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less or equal than 3 * @param[in] input2 Input tensor containing the bias matrix. Data type supported: same as @p input0. * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0 * @param[in] alpha Weight of the matrix product * @param[in] beta Weight of the matrix bias - * @param[in] lhs_info LHS matrix information used for reshaping the input0 tensor. Only the following values are supported: + * @param[in] lhs_info LHS matrix information used for reshaping the input0 tensor. Only the following values are supported: * lhs_info.m0: 2,3,4,5,6,7,8 * lhs_info.k0: 2,3,4,8,16 * lhs_info.transpose: false * @param[in] rhs_info RHS matrix information used for reshaping the input1 tensor. Only the following values are supported: - * rhs_info.n0: 2,3,4,8,16 - * rhs_info.k0: 2,3,4,8,16 + * rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true) + * rhs_info.k0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true) * rhs_info.transpose: true * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices * @@ -82,8 +93,19 @@ public: * Mixed precision combines different floating precisions during the computation, in particular, F32 for the accumulations and F16 for the * multiplications. i.e. float c = (half)a * (half)b * + * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will fetch the RHS data using the OpenCL read_image built-in function. + * Reading from the OpenCL image object can increase the performance. However, since the OpenCL image object is created importing the OpenCL buffer, + * the following conditions are required: + * -# rhs_info.n0 can only be 4, 8 and 16 + * -# rhs_info.k0 can only be 4, 8 and 16 + * -# Data type can only be F32 + * -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension + * -# The stride Y for the input1 should satisfy the OpenCL pitch alignment requirement + * -# input1 width should be less or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4) + * -# input1 (height * depth) should be less or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT + * * @param[in] compile_context The compile context to be used. - * @param[in] input0 Input tensor containing the LHS reshaped matrix. Data type supported: F16/F32. The number of dimensions for the LHS matrix must be less or equal than 4 + * @param[in] input0 Input tensor containing the LHS reshaped matrix. Data type supported: F16/F32 (only F32 if rhs_info.export_to_cl_image = true). The number of dimensions for the LHS matrix must be less or equal than 4 * @param[in] input1 Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less or equal than 3 * @param[in] input2 Input tensor containing the bias matrix. Data type supported: same as @p input0. * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0 @@ -94,8 +116,8 @@ public: * lhs_info.k0: 2,3,4,8,16 * lhs_info.transpose: false * @param[in] rhs_info RHS matrix information used for reshaping the input1 tensor. Only the following values are supported: - * rhs_info.n0: 2,3,4,8,16 - * rhs_info.k0: 2,3,4,8,16 + * rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true) + * rhs_info.k0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true) * rhs_info.transpose: true * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices * @@ -107,7 +129,22 @@ public: const GEMMKernelInfo &gemm_info); /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMMatrixMultiplyReshapedKernel * - * @param[in] input0 Input tensor containing the LHS reshaped matrix. Data type supported: F16/F32. The number of dimensions for the LHS matrix must be less or equal than 4 + * @note The F16 computation also supports mixed precision through the gemm_info.fp_mixed_precision flag. + * Mixed precision combines different floating precisions during the computation, in particular, F32 for the accumulations and F16 for the + * multiplications. i.e. float c = (half)a * (half)b + * + * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will fetch the RHS data using the OpenCL read_image built-in function. + * Reading from the OpenCL image object can increase the performance. However, since the OpenCL image object is created importing the OpenCL buffer, + * the following conditions are required: + * -# rhs_info.n0 can only be 4, 8 and 16 + * -# rhs_info.k0 can only be 4, 8 and 16 + * -# Data type can only be F32 + * -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension + * -# The stride Y for the input1 should satisfy the OpenCL pitch alignment requirement + * -# input1 width should be less or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4) + * -# input1 (height * depth) should be less or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT + * + * @param[in] input0 Input tensor containing the LHS reshaped matrix. Data type supported: F16/F32 (only F32 if rhs_info.export_to_cl_image = true). The number of dimensions for the LHS matrix must be less or equal than 4 * @param[in] input1 Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less or equal than 3 * @param[in] input2 Input tensor info containing the bias matrix. Data type supported: same as @p input0. * @param[in] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0 @@ -118,8 +155,8 @@ public: * lhs_info.k0: 2,3,4,8,16 * lhs_info.transpose: false * @param[in] rhs_info RHS matrix information used for reshaping the input1 tensor. Only the following values are supported: - * rhs_info.n0: 2,3,4,8,16 - * rhs_info.k0: 2,3,4,8,16 + * rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true) + * rhs_info.k0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true) * rhs_info.transpose: true * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices * @@ -141,10 +178,10 @@ private: ICLTensor *_output; bool _slide_matrix_b; bool _reinterpret_output_as_3d; - unsigned int _k; bool _use_dummy_work_items; bool _add_bias; bool _broadcast_bias; + bool _export_to_cl_image; }; } // namespace arm_compute #endif /*ARM_COMPUTE_CLGEMMMATRIXMULTIPLYRESHAPEDKERNEL_H*/ \ No newline at end of file diff --git a/arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h b/arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h index 0e6352bdbb..59a4aaa912 100644 --- a/arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h +++ b/arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h @@ -47,40 +47,73 @@ public: /** Allow instances of this class to be moved */ CLGEMMReshapeRHSMatrixKernel &operator=(CLGEMMReshapeRHSMatrixKernel &&) = default; /** Initialise the kernel's input and output. + * + * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will guarantee the OpenCL pitch alignment for the output tensor, + * required to create a OpenCL image object from buffer in @ref CLGEMMMatrixMultiplyReshapedKernel and in @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel + * Since the OpenCL image object is created importing the OpenCL buffer, the following conditions are required: + * -# rhs_info.n0 can only be 4, 8 and 16 + * -# rhs_info.k0 can only be 4, 8 and 16 + * -# Data type can only be F32 + * -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension + * -# output width should be less or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4) + * -# output (height * depth) should be less or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT + * -# The output tensor should be only consumed by @ref CLGEMMMatrixMultiplyReshapedKernel or @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel * * @param[in] input Input tensor. Data types supported: All * @param[out] output Output tensor. Data type supported: same as @p input * @param[in] rhs_info RHS matrix information to be used for reshaping. This object contains all the necessary * information to reshape the input tensor. Only the following values are supported: - * rhs_info.n0: 2,3,4,8,16 - * rhs_info.k0: 1,2,3,4,8,16 (k0 = 1 only if rhs_info.transpose = false) + * rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image == true) + * rhs_info.k0: 1,2,3,4,8,16 (k0 = 1 only if rhs_info.transpose = false), (only 4, 8 and 16 if rhs_info.export_to_cl_image == true) * rhs_info.h0: greater than 0 * rhs_info.transpose: true, false * rhs_info.interleave: true, false */ void configure(const ICLTensor *input, ICLTensor *output, const GEMMRHSMatrixInfo &rhs_info); /** Initialise the kernel's input and output. + * + * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will guarantee the OpenCL pitch alignment for the output tensor, + * required to create a OpenCL image object from buffer in @ref CLGEMMMatrixMultiplyReshapedKernel and in @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel + * Since the OpenCL image object is created importing the OpenCL buffer, the following conditions are required: + * -# rhs_info.n0 can only be 4, 8 and 16 + * -# rhs_info.k0 can only be 4, 8 and 16 + * -# Data type can only be F32 + * -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension + * -# output width should be less or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4) + * -# output (height * depth) should be less or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT + * -# The output tensor should be only consumed by @ref CLGEMMMatrixMultiplyReshapedKernel or @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel * * @param[in] compile_context The compile context to be used. * @param[in] input Input tensor. Data types supported: All * @param[out] output Output tensor. Data type supported: same as @p input * @param[in] rhs_info RHS matrix information to be used for reshaping. This object contains all the necessary * information to reshape the input tensor. Only the following values are supported: - * rhs_info.n0: 2,3,4,8,16 - * rhs_info.k0: 1,2,3,4,8,16 (k0 = 1 only if rhs_info.transpose = false) + * rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image == true) + * rhs_info.k0: 1,2,3,4,8,16 (k0 = 1 only if rhs_info.transpose = false), (only 4, 8 and 16 if rhs_info.export_to_cl_image == true) * rhs_info.h0: greater than 0 * rhs_info.transpose: true, false * rhs_info.interleave: true, false */ void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const GEMMRHSMatrixInfo &rhs_info); /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMReshapeRHSMatrixKernel + * + * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will guarantee the OpenCL pitch alignment for the output tensor, + * required to create a OpenCL image object from buffer in @ref CLGEMMMatrixMultiplyReshapedKernel and in @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel + * Since the OpenCL image object is created importing the OpenCL buffer, the following conditions are required: + * -# rhs_info.n0 can only be 4, 8 and 16 + * -# rhs_info.k0 can only be 4, 8 and 16 + * -# Data type can only be F32 + * -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension + * -# output width should be less or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4) + * -# output (height * depth) should be less or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT + * -# The output tensor should be only consumed by @ref CLGEMMMatrixMultiplyReshapedKernel or @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel * * @param[in] input Input tensor info. Data types supported: All * @param[in] output Output tensor info which stores the interleaved matrix. Data type supported: same as @p input. * @param[in] rhs_info RHS matrix information to be used for reshaping. This object contains all the necessary * information to reshape the input tensor. Only the following values are supported: - * rhs_info.n0: 2,3,4,8,16 - * rhs_info.k0: 1,2,3,4,8,16 (k0 = 1 only if rhs_info.transpose = false) + * rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image == true) + * rhs_info.k0: 1,2,3,4,8,16 (k0 = 1 only if rhs_info.transpose = false),(only 4, 8 and 16 if rhs_info.export_to_cl_image == true) * rhs_info.h0: greater than 0 * rhs_info.transpose: true, false * rhs_info.interleave: true, false diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h index 4e73edba4b..d151496537 100644 --- a/arm_compute/core/Types.h +++ b/arm_compute/core/Types.h @@ -1912,8 +1912,8 @@ struct GEMMLHSMatrixInfo struct GEMMRHSMatrixInfo { GEMMRHSMatrixInfo() = default; - GEMMRHSMatrixInfo(unsigned int n, unsigned int k, unsigned int h, bool trans, bool inter) - : n0(n), k0(k), h0(h), transpose(trans), interleave(inter) + GEMMRHSMatrixInfo(unsigned int n, unsigned int k, unsigned int h, bool trans, bool inter, bool export_to_cl_img) + : n0(n), k0(k), h0(h), transpose(trans), interleave(inter), export_to_cl_image(export_to_cl_img) { } unsigned int n0{ 1 }; /**< Number of columns processed by the matrix multiplication */ diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp index d4073c6f30..5efc4683a2 100644 --- a/src/core/CL/CLKernelLibrary.cpp +++ b/src/core/CL/CLKernelLibrary.cpp @@ -220,7 +220,9 @@ const std::map CLKernelLibrary::_kernel_program_map = { "gemm_mm_floating_point_f32_bifrost_1000", "gemm.cl" }, { "gemm_mm_native", "gemm.cl" }, { "gemm_mm_reshaped_lhs_nt_rhs_t", "gemm.cl" }, + { "gemm_mm_reshaped_lhs_nt_rhs_t_texture", "gemm.cl" }, { "gemm_mm_reshaped_lhs_t_rhs_nt", "gemm.cl" }, + { "gemm_mm_reshaped_lhs_t_rhs_nt_texture", "gemm.cl" }, { "gemm_mm_reshaped_only_rhs_nt", "gemm.cl" }, { "gemm_mm_reshaped_only_rhs_t", "gemm.cl" }, { "gemm_lc_vm_f32", "gemm.cl" }, diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl index 8a956010e7..e575cf6deb 100644 --- a/src/core/CL/cl_kernels/gemm.cl +++ b/src/core/CL/cl_kernels/gemm.cl @@ -1859,7 +1859,7 @@ __kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs), * @note The data type used for the accumulators must be passed at compile time using -DDATA_TYPE_ACCUMULATOR (e.g. -DDATA_TYPE_ACCUMULATOR=float) * @note The F16 computation also supports mixed precision through the option -DMIXED_PRECISION passed at compile time. If enabled, DATA_TYPE_ACCUMULATOR should be set to float * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time. - * @note The GEMM's dimensions M and N must be passed at compile time using -DM and -DN (e.g. -DM=52 and -DN=90). + * @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24). * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4). * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2) * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2) @@ -1904,7 +1904,6 @@ __kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs), * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - * @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped. * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes) * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes) * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) @@ -1917,7 +1916,6 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(bias), #endif // defined(BETA) IMAGE_DECLARATION(dst), - uint k, uint lhs_stride_z, uint rhs_stride_z, #if defined(BETA) @@ -1984,7 +1982,7 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs), REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0; REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); - for(int i = 0; i < k; i += K0) + for(int i = 0; i < K; i += K0) { // Supported cases (M0, K0): // 1,2 - 1,3 - 1,4 - 1,8 - 1,16 @@ -2114,8 +2112,271 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs), #undef RHS_BLOCK_SIZE #undef RHS_OFFSET_X #undef RHS_STEP_X +#undef LHS_STEP_LOOP +#undef RHS_STEP_LOOP } +#if defined(OPENCL_IMAGE_SUPPORT) +/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image object. + * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed + * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed + * + * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel + * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float) + * @note The data type used for the accumulators must be passed at compile time using -DDATA_TYPE_ACCUMULATOR (e.g. -DDATA_TYPE_ACCUMULATOR=float) + * @note The F16 computation also supports mixed precision through the option -DMIXED_PRECISION passed at compile time. If enabled, DATA_TYPE_ACCUMULATOR should be set to float + * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time. + * @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24). + * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4). + * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2) + * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2) + * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must passed at compile time. + * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time. + * @note Only the following configurations of M0, N0 and K0 are currently supported: + * - M0 = 2, 3, 4, 5, 6, 7, 8 + * - N0 = 4, 8, 16 + * - K0 = 4, 8, 16 + * - V0 >= 1 + * - H0 >= 1 + * + * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively. + * The activation function is performed after the bias addition + * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time: + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped + * + * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F32 + * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes) + * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes) + * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix + * @param[in] rhs_img The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr + * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr + * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes) + * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes) + * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix + * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes) + * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes) + * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture(IMAGE_DECLARATION(lhs), + __read_only image2d_t rhs_img, +#if defined(BETA) + IMAGE_DECLARATION(bias), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), + uint lhs_stride_z, + uint rhs_stride_z, +#if defined(BETA) + uint bias_stride_z, +#endif //defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D + ) +{ + // Pixel unit +#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0) + + // Block size +#define LHS_BLOCK_SIZE ((K0) * (M0)) + +#if defined(LHS_INTERLEAVE) +#define LHS_OFFSET_X (K0) +#define LHS_STEP_X ((K0) * (V0)) +#define LHS_STEP_LOOP (1) +#else // defined(INTERLEAVE) +#define LHS_OFFSET_X (LHS_BLOCK_SIZE) +#define LHS_STEP_X (K0) +#define LHS_STEP_LOOP (V0) +#endif // defined(INTERLEAVE) + + // Block size +#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0)) + + // RHS offset and step X +#if defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (PIXEL_UNIT) +#define RHS_STEP_X (PIXEL_UNIT * (H0)) +#define RHS_STEP_LOOP (1) +#else // defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (RHS_BLOCK_SIZE) +#define RHS_STEP_X PIXEL_UNIT +#define RHS_STEP_LOOP (H0) +#endif // defined(RHS_INTERLEAVE) + +#if defined(DUMMY_WORK_ITEMS) + if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M)) + { + return; + } +#endif // defined(DUMMY_WORK_ITEMS) + + // Compute LHS matrix address + __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y + + (get_global_id(2) * lhs_stride_z); + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH); +#else // defined(MATRIX_B_DEPTH) + const uint z_rhs = get_global_id(2); +#endif // defined(MATRIX_B_DEPTH) + + // Compute RHS matrix coordinates + uint x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X; + const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT; + + // Initialize the accumulators + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0); + + REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0; + REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); + + for(int i = 0; i < K; i += K0) + { + // Load values from LHS matrix + LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs); + + // Load values from RHS matrix stored in a cl_image + REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0); + LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0); + + // Accumulate + ARM_DOT_K0XN0(a0, b, c0); +#if M0 > 1 + ARM_DOT_K0XN0(a1, b, c1); +#endif // M0 > 1 +#if M0 > 2 + ARM_DOT_K0XN0(a2, b, c2); +#endif // M0 > 2 +#if M0 > 3 + ARM_DOT_K0XN0(a3, b, c3); +#endif // M0 > 3 +#if M0 > 4 + ARM_DOT_K0XN0(a4, b, c4); +#endif // M0 > 4 +#if M0 > 5 + ARM_DOT_K0XN0(a5, b, c5); +#endif // M0 > 5 +#if M0 > 6 + ARM_DOT_K0XN0(a6, b, c6); +#endif // M0 > 6 +#if M0 > 7 + ARM_DOT_K0XN0(a7, b, c7); +#endif // M0 > 7 + + lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE); + + x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP; + } + + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y); + + REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); + +#if defined(REINTERPRET_OUTPUT_AS_3D) + + // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_OUTPUT_AS_3D) + + // Add offset for batched GEMM + dst_addr += get_global_id(2) * dst_stride_z; + +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) +#if defined(BROADCAST_BIAS) + __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); + + LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, DATA_TYPE, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias[broadcasted] +#if defined(MIXED_PRECISION) + CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp); + ADD_BLOCK_BROADCAST(M0, c, bias_hp0); +#else // defined(MIXED_PRECISION) + ADD_BLOCK_BROADCAST(M0, c, bias0); +#endif // defined(MIXED_PRECISION) + +#else // defined(BROADCAST_BIAS) + __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id( + 2) * bias_stride_z; + + LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias +#if defined(MIXED_PRECISION) + CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp); + ADD_BLOCK(M0, c, bias_hp); +#else // defined(MIXED_PRECISION) + ADD_BLOCK(M0, c, bias); +#endif // defined(MIXED_PRECISION) + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + +#if defined(ACTIVATION_TYPE) +#if defined(MIXED_PRECISION) + ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, c, A_VAL, B_VAL); +#else // defined(MIXED_PRECISION) + ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL); +#endif // defined(MIXED_PRECISION) +#endif // defined(ACTIVATION_TYPE) + + // Store output block +#if defined(MIXED_PRECISION) + CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout); +#else // defined(MIXED_PRECISION) + STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout); +#endif // defined(MIXED_PRECISION) + +#undef LHS_BLOCK_SIZE +#undef LHS_OFFSET_X +#undef LHS_STEP_X +#undef RHS_BLOCK_SIZE +#undef RHS_OFFSET_X +#undef RHS_STEP_X +#undef PIXEL_UNIT +#undef LHS_STEP_LOOP +#undef RHS_STEP_LOOP +} +#endif // defined(OPENCL_IMAGE_SUPPORT) + #if defined(LHS_TRANSPOSE) #define VTYPE(TYPE, SIZE) VEC_DATA_TYPE(TYPE, SIZE) @@ -2232,7 +2493,7 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs), * * @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE). * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time. - * @note The GEMM's dimensions M and N must be passed at compile time using -DM and -DN (e.g. -DM=52 and -DN=90). + * @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24). * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4). * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2) * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2) @@ -2277,7 +2538,6 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs), * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - * @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped. * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes) * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes) * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) @@ -2290,7 +2550,6 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(bias), #endif // defined(BETA) IMAGE_DECLARATION(dst), - uint k, uint lhs_stride_z, uint rhs_stride_z, #if defined(BETA) @@ -2360,11 +2619,14 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs), __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr); __global DATA_TYPE *rhs = (__global DATA_TYPE *)(rhs_addr); - for(int i = 0; i < k; i += K0) + for(int i = 0; i < K; i += K0) { VEC_DATA_TYPE(DATA_TYPE, M0) - a0 = VLOAD(M0)(0, lhs); + a0; VEC_DATA_TYPE(DATA_TYPE, N0) + b0; + + a0 = VLOAD(M0)(0, lhs); b0 = VLOAD(N0)(0, rhs); ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); @@ -2596,6 +2858,367 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs), #undef RHS_STEP_X } +#if defined(OPENCL_IMAGE_SUPPORT) +/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image object. + * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be transposed + * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be NOT transposed + * + * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel + * @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE). + * @note The height of the RHS matrix should be passed at compile time using -DRHS_HEIGHT= (e.g. -DRHS_HEIGHT=32) + * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time. + * @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24). + * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4). + * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2) + * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2) + * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must passed at compile time. + * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time. + * @note Only the following configurations of M0, N0 and K0 are currently supported: + * - M0 = 2, 3, 4, 8 + * - N0 = 4, 8, 16 + * - K0 = 4, 8, 16 + * - V0 >= 1 + * - H0 >= 1 + * + * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively. + * The activation function is performed after the bias addition + * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time: + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped + * + * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F32 + * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes) + * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes) + * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix + * @param[in] rhs_img The RHS reshaped matrix as cl_image 2d. Supported data type: same as @p lhs_ptr + * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr + * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes) + * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes) + * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix + * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes) + * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes) + * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture(IMAGE_DECLARATION(lhs), + __read_only image2d_t rhs_img, +#if defined(BETA) + IMAGE_DECLARATION(bias), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), + uint lhs_stride_z, + uint rhs_stride_z, +#if defined(BETA) + uint bias_stride_z, +#endif //defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D + ) +{ + // Pixel unit +#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0) + + // Block size +#define LHS_BLOCK_SIZE ((K0) * (M0)) + +#if defined(LHS_INTERLEAVE) +#define LHS_OFFSET_X (M0) +#define LHS_STEP_X ((M0) * (V0)) +#define LHS_STEP_LOOP (1) +#else // defined(INTERLEAVE) +#define LHS_OFFSET_X (LHS_BLOCK_SIZE) +#define LHS_STEP_X (M0) +#define LHS_STEP_LOOP (V0) +#endif // defined(INTERLEAVE) + + // Block size +#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT)) + + // RHS offset and step X +#if defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (PIXEL_UNIT) +#define RHS_STEP_X ((PIXEL_UNIT) * (H0)) +#else // defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (RHS_BLOCK_SIZE) +#define RHS_STEP_X (PIXEL_UNIT) +#endif // defined(RHS_INTERLEAVE) + + const uint x = get_global_id(0); + const uint y = get_global_id(1); + const uint z = get_global_id(2); + +#if defined(DUMMY_WORK_ITEMS) + if((x * N0 >= N) || (y * M0 >= M)) + { + return; + } +#endif // defined(DUMMY_WORK_ITEMS) + + // Compute LHS matrix address + __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z); + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + const uint z_rhs = (z % MATRIX_B_DEPTH); +#else // defined(MATRIX_B_DEPTH) + const uint z_rhs = z; +#endif // defined(MATRIX_B_DEPTH) + + // Compute RHS matrix coordinates + uint x_rhs = (x % H0) * (uint)RHS_OFFSET_X; + const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT; + + // Initialize the accumulators + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0); + + REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0); + + __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr); + + for(int i = 0; i < K; i += K0) + { + VEC_DATA_TYPE(DATA_TYPE, M0) + a0; + VEC_DATA_TYPE(DATA_TYPE, N0) + b0; + + a0 = VLOAD(M0)(0, lhs); + b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs)); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + +#if K0 > 1 + a0 = VLOAD(M0)(0, lhs); + b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs)); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; +#endif // K0 > 1 + +#if K0 > 2 + a0 = VLOAD(M0)(0, lhs); + b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs)); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; +#endif // K0 > 2 + +#if K0 > 3 + a0 = VLOAD(M0)(0, lhs); + b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs)); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; +#endif // K0 > 3 + +#if K0 > 4 + a0 = VLOAD(M0)(0, lhs); + b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs)); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs)); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs)); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs)); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; +#endif // K0 > 4 + +#if K0 > 8 + a0 = VLOAD(M0)(0, lhs); + b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs)); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs)); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs)); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs)); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs)); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs)); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs)); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs)); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; +#endif // K0 > 8 + +#ifndef LHS_INTERLEAVE + lhs += (M0 * K0 * (V0 - 1)); +#endif // LHS_INTERLEAVE + + x_rhs += K0 * RHS_STEP_X; +#ifndef RHS_INTERLEAVE + x_rhs += (PIXEL_UNIT * K0 * (H0 - 1)); +#endif // RHS_INTERLEAVE + } + + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y); + + REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); + +#if defined(REINTERPRET_OUTPUT_AS_3D) + + // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_OUTPUT_AS_3D) + + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; + +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) +#if defined(BROADCAST_BIAS) + __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)); + + LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, DATA_TYPE, bias, BETA); +#endif // UNIT_BIAS + + // c = c + bias[broadcasted] +#if defined(MIXED_PRECISION) + CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp); + ADD_BLOCK_BROADCAST(M0, c, bias_hp0); +#else // defined(MIXED_PRECISION) + ADD_BLOCK_BROADCAST(M0, c, bias0); +#endif // defined(MIXED_PRECISION) + +#else // defined(BROADCAST_BIAS) + __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * bias_stride_y) + z * bias_stride_z; + + LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); +#endif // UNIT_BIAS + +#if defined(MIXED_PRECISION) + CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp); + ADD_BLOCK(M0, c, bias_hp); +#else // defined(MIXED_PRECISION) + ADD_BLOCK(M0, c, bias); +#endif // defined(MIXED_PRECISION) + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + +#if defined(ACTIVATION_TYPE) +#if defined(MIXED_PRECISION) + ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, c, A_VAL, B_VAL); +#else // defined(MIXED_PRECISION) + ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL); +#endif // defined(MIXED_PRECISION) +#endif // defined(ACTIVATION_TYPE) + + // Store output block +#if defined(MIXED_PRECISION) + CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout); +#else // defined(MIXED_PRECISION) + STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout); +#endif // defined(MIXED_PRECISION) + +#undef LHS_BLOCK_SIZE +#undef LHS_OFFSET_X +#undef LHS_STEP_X +#undef RHS_BLOCK_SIZE +#undef RHS_OFFSET_X +#undef RHS_STEP_X +#undef PIXEL_UNIT +#undef LHS_STEP_LOOP +#undef RHS_STEP_LOOP +} +#endif // defined(OPENCL_IMAGE_SUPPORT) + #endif // defined(LHS_TRANSPOSE) #endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(K) && defined(DATA_TYPE) diff --git a/src/core/CL/cl_kernels/gemm_helpers.h b/src/core/CL/cl_kernels/gemm_helpers.h index af43477bd4..d5a7cfbb0a 100644 --- a/src/core/CL/cl_kernels/gemm_helpers.h +++ b/src/core/CL/cl_kernels/gemm_helpers.h @@ -140,6 +140,105 @@ #define LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) /** @} */ // end of group LOAD_BLOCK +/** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1). + * @name LOAD_TEXTURE2D_ROW_n + * + * @param[in] N0 The number of pixels to read + * @param[in] DATA_TYPE The data type of variables + * @param[in] BASENAME The basename of the destination variables for the loaded rows + * @param[in] IMG The 2D OpenCL image object + * @param[in] X_COORD The x coordinate for the top-left pixel + * @param[in] Y_COORD The y coordinate for the top-left pixel + * @param[in] X_STEP_ROW The incremental step row for the x coordinate (in pixels) + * @param[in] Y_STEP_ROW The incremental step row for the y coordinate (in pixels) + * @{ + */ +#define LOAD_TEXTURE2D_ROW_1(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ + BASENAME##0 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 0 * X_STEP_ROW), (Y_COORD + 0 * Y_STEP_ROW)) + +#define LOAD_TEXTURE2D_ROW_2(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ + LOAD_TEXTURE2D_ROW_1(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ + BASENAME##1 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 1 * X_STEP_ROW), (Y_COORD + 1 * Y_STEP_ROW)) + +#define LOAD_TEXTURE2D_ROW_3(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ + LOAD_TEXTURE2D_ROW_2(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ + BASENAME##2 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 2 * X_STEP_ROW), (Y_COORD + 2 * Y_STEP_ROW)) + +#define LOAD_TEXTURE2D_ROW_4(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ + LOAD_TEXTURE2D_ROW_3(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ + BASENAME##3 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 3 * X_STEP_ROW), (Y_COORD + 3 * Y_STEP_ROW)) + +#define LOAD_TEXTURE2D_ROW_5(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ + LOAD_TEXTURE2D_ROW_4(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ + BASENAME##4 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 4 * X_STEP_ROW), (Y_COORD + 4 * Y_STEP_ROW)) + +#define LOAD_TEXTURE2D_ROW_6(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ + LOAD_TEXTURE2D_ROW_5(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ + BASENAME##5 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 5 * X_STEP_ROW), (Y_COORD + 5 * Y_STEP_ROW)) + +#define LOAD_TEXTURE2D_ROW_7(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ + LOAD_TEXTURE2D_ROW_6(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ + BASENAME##6 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 6 * X_STEP_ROW), (Y_COORD + 6 * Y_STEP_ROW)) + +#define LOAD_TEXTURE2D_ROW_8(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ + LOAD_TEXTURE2D_ROW_7(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ + BASENAME##7 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 7 * X_STEP_ROW), (Y_COORD + 7 * Y_STEP_ROW)) + +#define LOAD_TEXTURE2D_ROW_9(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ + LOAD_TEXTURE2D_ROW_8(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ + BASENAME##8 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 8 * X_STEP_ROW), (Y_COORD + 8 * Y_STEP_ROW)) + +#define LOAD_TEXTURE2D_ROW_10(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ + LOAD_TEXTURE2D_ROW_9(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ + BASENAME##9 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 9 * X_STEP_ROW), (Y_COORD + 9 * Y_STEP_ROW)) + +#define LOAD_TEXTURE2D_ROW_11(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ + LOAD_TEXTURE2D_ROW_10(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ + BASENAME##A = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 10 * X_STEP_ROW), (Y_COORD + 10 * Y_STEP_ROW)) + +#define LOAD_TEXTURE2D_ROW_12(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ + LOAD_TEXTURE2D_ROW_11(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ + BASENAME##B = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 11 * X_STEP_ROW), (Y_COORD + 11 * Y_STEP_ROW)) + +#define LOAD_TEXTURE2D_ROW_13(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ + LOAD_TEXTURE2D_ROW_12(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ + BASENAME##C = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 12 * X_STEP_ROW), (Y_COORD + 12 * Y_STEP_ROW)) + +#define LOAD_TEXTURE2D_ROW_14(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ + LOAD_TEXTURE2D_ROW_13(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ + BASENAME##D = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 13 * X_STEP_ROW), (Y_COORD + 13 * Y_STEP_ROW)) + +#define LOAD_TEXTURE2D_ROW_15(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ + LOAD_TEXTURE2D_ROW_14(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ + BASENAME##E = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 14 * X_STEP_ROW), (Y_COORD + 14 * Y_STEP_ROW)) + +#define LOAD_TEXTURE2D_ROW_16(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ + LOAD_TEXTURE2D_ROW_15(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \ + BASENAME##F = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 15 * X_STEP_ROW), (Y_COORD + 15 * Y_STEP_ROW)) +/** @} */ // end of group LOAD_TEXTURE2D_ROW_n + +/** Load a 2D texture in unit of pixel. A pixel is made of 4 floating point values + * @name LOAD_TEXTURE2D + * + * Supported cases are M0=1,2,3,...,16 and N0=1 + * The data to load is expected to have consecutive names for each row. + * E.g., for M0=3, and BASENAME=c, the expected data is c0, c1 and c2. + * + * @param[in] M0 The number of consecutive rows + * @param[in] N0 The number of consecutive pixels. Only 1, 2 and 4 are supported + * @param[in] DATA_TYPE The data type of the target + * @param[in] BASENAME The basename of the result variables + * @param[in] IMG The 2D OpenCL image object + * @param[in] X_COORD The x coordinate for the top-left pixel + * @param[in] Y_COORD The y coordinate for the top-left pixel + * @param[in] X_STEP_ROW The incremental step row for the x coordinate (in pixels) + * @param[in] Y_STEP_ROW The incremental step row for the y coordinate (in pixels) + * @{ + */ +#define LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_ROW_##M0(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) +#define LOAD_TEXTURE2D(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) +/** @} */ // end of group LOAD_TEXTURE2D + /** Loads the elements from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1). * @name LOAD_ELEMENT_n * diff --git a/src/core/CL/cl_kernels/helpers.h b/src/core/CL/cl_kernels/helpers.h index c4cbf77e96..0cf726f7f2 100644 --- a/src/core/CL/cl_kernels/helpers.h +++ b/src/core/CL/cl_kernels/helpers.h @@ -194,6 +194,49 @@ #define VLOAD_STR(size) vload##size #define VLOAD(size) VLOAD_STR(size) +#define PIXEL_UNIT4 1 +#define PIXEL_UNIT8 2 +#define PIXEL_UNIT16 4 + +/** Utility macro to convert a vector size in pixel unit. + * + * @name CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT + * + * @param[in] vec_size Vector size. Only 4,8 and 16 is supported + * + * @return The pixel unit (number of pixels) + * @{ + */ +#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size +#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) +/** @} */ // end of group CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT + +#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord))); +#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord))); +#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord))); + +#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) +#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord))); +#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord))); +#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord))); +#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) + +/** Utility macro to read a 2D OpenCL image object. + * + * @note Coordinates are not normalized + * + * @param[in] data_type Data type + * @param[in] n0 Number of pixel to read. Only 1,2 and 4 is supported + * @param[in] img OpenCL image object + * @param[in] x_coord The x coordinate for the top-left pixel + * @param[in] y_coord The y coordinate for the top-left pixel + * + * @return Pixels from the 2D OpenCL image object + * @{ + */ +#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord) +#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) + #define VSTORE_STR(size) vstore##size #define VSTORE(size) VSTORE_STR(size) diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp index 09e4e98a87..ba1c8a9d14 100644 --- a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp +++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp @@ -79,6 +79,23 @@ Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, "Bias addition only supported with broadcast mode in case the input or output has to be reinterpreted as 3D"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision && (input0->data_type() == DataType::F32), "Mixed precision only supported for F16 data type"); + if(rhs_info.export_to_cl_image) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.n0 == 2) || (rhs_info.n0 == 3), "Export to cl_image only supported with n0 = 4, 8 or 16"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.k0 == 2) || (rhs_info.k0 == 3), "Export to cl_image only supported with k0 = 4, 8 or 16"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->data_type() != DataType::F32, "Export to cl_image only supported with F32 data type"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!image2d_from_buffer_supported(CLKernelLibrary::get().get_device()), "The extension cl_khr_image2d_from_buffer is not supported on the target platform"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0, "Impossible to retrieve the cl_image pitch alignment"); + + // Check the width and height of the output tensor. + // Since we cannot create a 3d image from a buffer, the third dimension is collapsed with the second dimension + size_t max_image_w = CLKernelLibrary::get().get_device().getInfo(); + size_t max_image_h = CLKernelLibrary::get().get_device().getInfo(); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->tensor_shape()[0] > max_image_w * 4, "Not supported width for cl_image"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->tensor_shape()[1] * input1->tensor_shape()[2] > max_image_h, "Not supported height for cl_image"); + } + const unsigned int m = gemm_info.m; const unsigned int n = gemm_info.n; const unsigned int k = gemm_info.k; @@ -207,8 +224,8 @@ std::pair validate_and_configure_window(ITensorInfo *input0, ITe } // namespace CLGEMMMatrixMultiplyReshapedKernel::CLGEMMMatrixMultiplyReshapedKernel() - : _input0(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_output_as_3d(false), _k(1), _use_dummy_work_items(false), _add_bias(false), - _broadcast_bias(false) + : _input0(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_output_as_3d(false), _use_dummy_work_items(false), _add_bias(false), + _broadcast_bias(false), _export_to_cl_image(false) { } @@ -233,10 +250,10 @@ void CLGEMMMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi _input2 = helpers::float_ops::is_zero(beta) ? nullptr : input2; _output = output; _reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0; - _k = gemm_info.k; _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device()); _add_bias = _input2 != nullptr; _broadcast_bias = gemm_info.broadcast_bias; + _export_to_cl_image = rhs_info.export_to_cl_image; // Check if we need to slide the matrix B const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions(); @@ -270,10 +287,13 @@ void CLGEMMMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); build_opts.add_option_if(enable_mixed_precision, "-DMIXED_PRECISION"); + build_opts.add_option_if(rhs_info.export_to_cl_image, "-DOPENCL_IMAGE_SUPPORT"); + build_opts.add_option("-DRHS_HEIGHT=" + support::cpp11::to_string(input1->info()->dimension(1))); build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); build_opts.add_option("-DDATA_TYPE_ACCUMULATOR=" + (enable_mixed_precision ? get_cl_type_from_data_type(DataType::F32) : get_cl_type_from_data_type(data_type))); build_opts.add_option("-DM=" + support::cpp11::to_string(gemm_info.m)); build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n)); + build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k)); build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0)); build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0)); build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0)); @@ -283,6 +303,7 @@ void CLGEMMMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi std::string kernel_name("gemm_mm_reshaped_"); kernel_name += lhs_info.transpose ? "lhs_t_" : "lhs_nt_"; kernel_name += rhs_info.transpose ? "rhs_t" : "rhs_nt"; + kernel_name += rhs_info.export_to_cl_image ? "_texture" : ""; // Create kernel _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); @@ -356,20 +377,31 @@ void CLGEMMMatrixMultiplyReshapedKernel::run(const Window &window, cl::CommandQu slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); - if(_reinterpret_output_as_3d) + const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom; + + cl_mem cl_image; + cl_int err = CL_SUCCESS; + cl::Image2D input1_image2d; + + if(_export_to_cl_image) { - // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor - unsigned int idx0; - if(_add_bias) - { - idx0 = 4 * num_arguments_per_2D_tensor() + 5; - } - else - { - idx0 = 3 * num_arguments_per_2D_tensor() + 4; - } - const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom; - _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); + // Create OpenCL image object from OpenCL buffer + const cl_image_format format = { CL_RGBA, CL_FLOAT }; + + cl_image_desc desc; + memset(&desc, 0, sizeof(desc)); + desc.image_type = CL_MEM_OBJECT_IMAGE2D; + desc.mem_object = _input1->cl_buffer()(); + desc.image_row_pitch = _input1->info()->strides_in_bytes()[1]; + desc.image_width = _input1->info()->dimension(0) / 4; + desc.image_height = _input1->info()->dimension(1) * _input1->info()->dimension(2); + + cl_image = clCreateImage(CLKernelLibrary::get().context()(), CL_MEM_READ_ONLY, &format, &desc, nullptr, &err); + + ARM_COMPUTE_UNUSED(err); + ARM_COMPUTE_ERROR_ON_MSG(err != CL_SUCCESS, "Error during the creation of CL image from buffer"); + + input1_image2d = cl::Image2D(cl_image); } do @@ -383,18 +415,48 @@ void CLGEMMMatrixMultiplyReshapedKernel::run(const Window &window, cl::CommandQu } unsigned int idx = 0; + + // LHS buffer add_2D_tensor_argument(idx, _input0, slice); - add_2D_tensor_argument(idx, _input1, slice_b); - add_2D_tensor_argument_if((_add_bias), idx, _input2, slice); + + // RHS buffer or RHS OpenCL image (_export_to_cl_image == true) + if(_export_to_cl_image) + { + _kernel.setArg(idx++, input1_image2d); + } + else + { + add_2D_tensor_argument(idx, _input1, slice_b); + } + + // Bias buffer (_add_bias == true) + add_2D_tensor_argument_if(_add_bias, idx, _input2, slice); + + // Output buffer add_2D_tensor_argument(idx, _output, slice); - _kernel.setArg(idx++, static_cast(_k)); + + // LHS stride_z _kernel.setArg(idx++, static_cast(_input0->info()->strides_in_bytes()[2])); + + // RHS stride_z (not used if _export_to_cl_image == true) _kernel.setArg(idx++, static_cast(_input1->info()->strides_in_bytes()[2])); + + // Bias stride_z (if _add_bias == true) if(_add_bias) { _kernel.setArg(idx++, static_cast(_input2->info()->strides_in_bytes()[2])); } + + // Output stride_z _kernel.setArg(idx++, static_cast(_output->info()->strides_in_bytes()[2])); + + // Cross-plan padding (if _reinterpret_output_as_3d = true) + if(_reinterpret_output_as_3d) + { + _kernel.setArg(idx++, static_cast(total_cross_plane_pad)); + } + + // Dispatch kernel enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); } while(window.slide_window_slice_3D(slice)); diff --git a/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp b/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp index 43e7b92c6a..00cb422199 100644 --- a/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp +++ b/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp @@ -54,13 +54,15 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 > 16); ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16); ARM_COMPUTE_RETURN_ERROR_ON((rhs_info.k0 == 1) && (rhs_info.transpose)); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image && ((rhs_info.n0 != 4) || input->data_type() != DataType::F32), "Export to cl_image only supported with n0 = 4 and F32 data type"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image - && !image2d_from_buffer_supported(CLKernelLibrary::get().get_device()), "The extension cl_khr_image2d_from_buffer is not supported on the target platform"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image && (get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0), "Impossible to retrieve the cl_image pitch alignment"); if(rhs_info.export_to_cl_image) { + ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.n0 == 2) || (rhs_info.n0 == 3), "Export to cl_image only supported with n0 = 4, 8 or 16"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.k0 == 2) || (rhs_info.k0 == 3), "Export to cl_image only supported with k0 = 4, 8 or 16"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() != DataType::F32, "Export to cl_image only supported with F32 data type"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!image2d_from_buffer_supported(CLKernelLibrary::get().get_device()), "The extension cl_khr_image2d_from_buffer is not supported on the target platform"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0, "Impossible to retrieve the cl_image pitch alignment"); + TensorShape output_shape = compute_rhs_reshaped_shape(*input, rhs_info); // Check the width and height of the output tensor. diff --git a/tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp b/tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp index 833a9240bf..b5b960781c 100644 --- a/tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp +++ b/tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp @@ -147,6 +147,12 @@ const auto n0_values_nightly = framework::dataset::make("N0", { 2, 3, 4, 8 }); /** K0 values to test - Nightly */ const auto k0_values_nightly = framework::dataset::make("K0", { 2, 3, 4, 8 }); +/** N0 values to test with export to OpenCL image object - Nightly */ +const auto n0_export_to_cl_image_values_nightly = framework::dataset::make("N0", { 4, 8, 16 }); + +/** K0 values to test with export to OpenCL image object - Nightly */ +const auto k0_export_to_cl_image_values_nightly = framework::dataset::make("K0", { 4, 8, 16 }); + /** V0 values to test - Nightly */ const auto v0_values_nightly = framework::dataset::make("V0", 1, 4); @@ -224,14 +230,14 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zi })), framework::dataset::make("RHSMInfo",{ - GEMMRHSMatrixInfo(4,4,1,true,true), - GEMMRHSMatrixInfo(4,4,1, true,true), - GEMMRHSMatrixInfo(4,4,1,true,true), - GEMMRHSMatrixInfo(2,2,1,true,false), - GEMMRHSMatrixInfo(2,2,1,true,false), - GEMMRHSMatrixInfo(4,4,1,true,true), - GEMMRHSMatrixInfo(4,4,1,true,true), - GEMMRHSMatrixInfo(4,4,2,true,false), + GEMMRHSMatrixInfo(4,4,1,true,true,false), + GEMMRHSMatrixInfo(4,4,1,true,true,false), + GEMMRHSMatrixInfo(4,4,1,true,true,false), + GEMMRHSMatrixInfo(2,2,1,true,false,false), + GEMMRHSMatrixInfo(2,2,1,true,false,false), + GEMMRHSMatrixInfo(4,4,1,true,true,false), + GEMMRHSMatrixInfo(4,4,1,true,true,false), + GEMMRHSMatrixInfo(4,4,2,true,false,false), })), @@ -248,7 +254,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zi 1 /**< Multiplication factor for the width of the 1xW transposed block */, 1 /**< Multiplication factor for the height of the 4x4 interleaved block */, GEMMLHSMatrixInfo(4,4,1,false,true), - GEMMRHSMatrixInfo(4,4,1,true,true), + GEMMRHSMatrixInfo(4,4,1,true,true,false), 0 /**< Offset to be added to each element of the matrix A */, 0 /**< Offset to be added to each element of the matrix B */), @@ -262,7 +268,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zi 1 /**< Multiplication factor for the width of the 1xW transposed block */, 1 /**< Multiplication factor for the height of the 4x4 interleaved block */, GEMMLHSMatrixInfo(4,4,1,false,true), - GEMMRHSMatrixInfo(4,4,1,true,true), + GEMMRHSMatrixInfo(4,4,1,true,true,false), 0 /**< Offset to be added to each element of the matrix A */, 0 /**< Offset to be added to each element of the matrix B */), GEMMKernelInfo(), @@ -279,7 +285,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zi 1 /**< Multiplication factor for the width of the 1xW transposed block */, 1 /**< Multiplication factor for the height of the 4x4 interleaved block */, GEMMLHSMatrixInfo(4,4,1,false,true), - GEMMRHSMatrixInfo(4,4,1,true,true), + GEMMRHSMatrixInfo(4,4,1,true,true,false), 0 /**< Offset to be added to each element of the matrix A */, 0 /**< Offset to be added to each element of the matrix B */), @@ -294,7 +300,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zi 1 /**< Multiplication factor for the width of the 1xW transposed block */, 1 /**< Multiplication factor for the height of the 4x4 interleaved block */, GEMMLHSMatrixInfo(4,4,1,false,true), - GEMMRHSMatrixInfo(4,4,1,true,true), + GEMMRHSMatrixInfo(4,4,1,true,true,false), 0 /**< Offset to be added to each element of the matrix A */, 0 /**< Offset to be added to each element of the matrix B */), @@ -308,7 +314,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zi 1 /**< Multiplication factor for the width of the 1xW transposed block */, 1 /**< Multiplication factor for the height of the 4x4 interleaved block */, GEMMLHSMatrixInfo(4,4,1,false,true), - GEMMRHSMatrixInfo(4,4,2,true,false), + GEMMRHSMatrixInfo(4,4,2,true,false,false), 0 /**< Offset to be added to each element of the matrix A */, 0 /**< Offset to be added to each element of the matrix B */), })), @@ -327,7 +333,7 @@ TEST_SUITE(Float) TEST_SUITE(FP32) FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedFixture, framework::DatasetMode::ALL, - combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( + combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( m_values, n_values), k_values), @@ -339,6 +345,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedFixture, fra h0_values_precommit), i_values_lhs), i_values_rhs), + framework::dataset::make("export_to_cl_image_rhs", false)), framework::dataset::make("DataType", DataType::F32)), a_values_precommit), beta_values_precommit), @@ -351,7 +358,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedFixture, fra } FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMMatrixMultiplyReshapedFixture, framework::DatasetMode::DISABLED, - combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( + combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( m_values, n_values), k_values), @@ -363,6 +370,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMMatrixMultiplyReshapedFixture, fra h0_values_nightly), i_values_lhs), i_values_rhs), + framework::dataset::make("export_to_cl_image_rhs", false)), framework::dataset::make("DataType", DataType::F32)), a_values_nightly), beta_values_nightly), @@ -375,7 +383,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMMatrixMultiplyReshapedFixture, fra } FIXTURE_DATA_TEST_CASE(RunSmall3D, CLGEMMMatrixMultiplyReshaped3DFixture, framework::DatasetMode::ALL, - combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( + combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( m_w_values, m_h_values), n_values), @@ -388,6 +396,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall3D, CLGEMMMatrixMultiplyReshaped3DFixture, h0_values_precommit), i_values_lhs), i_values_rhs), + framework::dataset::make("export_to_cl_image_rhs", false)), framework::dataset::make("DataType", DataType::F32)), a_values_precommit), beta_values_precommit), @@ -399,7 +408,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall3D, CLGEMMMatrixMultiplyReshaped3DFixture, } FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMMatrixMultiplyReshaped3DFixture, framework::DatasetMode::DISABLED, - combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( + combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( m_w_values, m_h_values), n_values), @@ -412,6 +421,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMMatrixMultiplyReshaped3DFixture, h0_values_nightly), i_values_lhs), i_values_rhs), + framework::dataset::make("export_to_cl_image_rhs", false)), framework::dataset::make("DataType", DataType::F32)), a_values_nightly), beta_values_nightly), @@ -421,12 +431,274 @@ FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMMatrixMultiplyReshaped3DFixture, // Validate output validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32); } +TEST_SUITE(ExportToCLImage) +DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip( + framework::dataset::make("Input0Info", { TensorInfo(TensorShape(256U, 16U, 2U), 1, DataType::F32), // OK or incorrect if cl_khr_image2d_from_buffer not supported + TensorInfo(TensorShape(256U, 16U, 2U), 1, DataType::F32), // OK or incorrect if cl_khr_image2d_from_buffer not supported + TensorInfo(TensorShape(256U, 16U, 2U), 1, DataType::F32), // OK or incorrect if cl_khr_image2d_from_buffer not supported + TensorInfo(TensorShape(256U, 16U, 2U), 1, DataType::F32), // Incorrect k0 + TensorInfo(TensorShape(256U, 16U, 2U), 1, DataType::F32), // Incorrect n0 + + }), + framework::dataset::make("Input1Info",{ TensorInfo(TensorShape(256U, 16U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(256U, 16U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(512U, 8U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(256U, 16U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(128U, 32U, 2U), 1, DataType::F32), + + })), + framework::dataset::make("Input2Info", { TensorInfo(TensorShape(64U), 1, DataType::F32), + TensorInfo(TensorShape(64U), 1, DataType::F32), + TensorInfo(TensorShape(64U), 1, DataType::F32), + TensorInfo(TensorShape(64U), 1, DataType::F32), + TensorInfo(TensorShape(64U), 1, DataType::F32), + + })), + framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(64U, 64U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(64U, 64U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(64U, 64U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(64U, 64U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(64U, 64U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(64U, 64U, 2U), 1, DataType::F32), + + })), + framework::dataset::make("LHSMInfo",{ + GEMMLHSMatrixInfo(4, 4, 1, false, true), + GEMMLHSMatrixInfo(4, 8, 1, false, true), + GEMMLHSMatrixInfo(4, 4, 1, false, true), + GEMMLHSMatrixInfo(4, 2, 1, false, false), + GEMMLHSMatrixInfo(4, 4, 1, false, false), + + })), + framework::dataset::make("RHSMInfo",{ + GEMMRHSMatrixInfo(4, 4, 1, true, true, true), + GEMMRHSMatrixInfo(4, 8, 1, true, true, true), + GEMMRHSMatrixInfo(8, 4, 1, true, true, true), + GEMMRHSMatrixInfo(4, 2, 1, true, false, true), + GEMMRHSMatrixInfo(2, 4, 1, true, false, true), + })), + framework::dataset::make("GEMMInfo",{GEMMKernelInfo( 64 /**set_is_resizable(true), + &input1_info.clone()->set_is_resizable(true), + &input2_info.clone()->set_is_resizable(true), + &output_info.clone()->set_is_resizable(true),1.f,1.f, + lhs_info, + rhs_info, + gemm_info)) == (expected && image2d_from_buffer_supported(CLKernelLibrary::get().get_device())), framework::LogLevel::ERRORS); +} + +FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedFixture, framework::DatasetMode::ALL, + combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( + m_values, + n_values), + k_values), + b_values), + m0_values_precommit), + n0_values_precommit), + k0_values_precommit), + v0_values_precommit), + h0_values_precommit), + i_values_lhs), + i_values_rhs), + framework::dataset::make("export_to_cl_image_rhs", true)), + framework::dataset::make("DataType", DataType::F32)), + a_values_precommit), + beta_values_precommit), + broadcast_bias_values), + lhs_transpose_values), + act_values)) +{ + // Validate output only if the target platform supports the OpenCL cl_khr_image2d_from_buffer extension + if(image2d_from_buffer_supported(CLKernelLibrary::get().get_device())) + { + validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32); + } + else + { + ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped"); + framework::ARM_COMPUTE_PRINT_INFO(); + } + +} + +FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMMatrixMultiplyReshapedFixture, framework::DatasetMode::NIGHTLY, + combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( + m_values, + n_values), + k_values), + b_values), + m0_values_nightly), + n0_export_to_cl_image_values_nightly), + k0_export_to_cl_image_values_nightly), + v0_values_nightly), + h0_values_nightly), + i_values_lhs), + i_values_rhs), + framework::dataset::make("export_to_cl_image_rhs", true)), + framework::dataset::make("DataType", DataType::F32)), + a_values_nightly), + beta_values_nightly), + broadcast_bias_values), + lhs_transpose_values), + act_values)) +{ + // Validate output only if the target platform supports the OpenCL cl_khr_image2d_from_buffer extension + if(image2d_from_buffer_supported(CLKernelLibrary::get().get_device())) + { + validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32); + } + else + { + ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped"); + framework::ARM_COMPUTE_PRINT_INFO(); + } +} + +FIXTURE_DATA_TEST_CASE(RunSmall3D, CLGEMMMatrixMultiplyReshaped3DFixture, framework::DatasetMode::ALL, + combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( + m_w_values, + m_h_values), + n_values), + k_values), + b_values), + m0_values_precommit), + n0_values_precommit), + k0_values_precommit), + v0_values_precommit), + h0_values_precommit), + i_values_lhs), + i_values_rhs), + framework::dataset::make("export_to_cl_image_rhs", true)), + framework::dataset::make("DataType", DataType::F32)), + a_values_precommit), + beta_values_precommit), + lhs_transpose_values), + act_values)) +{ + // Validate output only if the target platform supports the OpenCL cl_khr_image2d_from_buffer extension + if(image2d_from_buffer_supported(CLKernelLibrary::get().get_device())) + { + validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32); + } + else + { + ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped"); + framework::ARM_COMPUTE_PRINT_INFO(); + } +} + +FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMMatrixMultiplyReshaped3DFixture, framework::DatasetMode::NIGHTLY, + combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( + m_w_values, + m_h_values), + n_values), + k_values), + b_values), + m0_values_nightly), + n0_export_to_cl_image_values_nightly), + k0_export_to_cl_image_values_nightly), + v0_values_nightly), + h0_values_nightly), + i_values_lhs), + i_values_rhs), + framework::dataset::make("export_to_cl_image_rhs", true)), + framework::dataset::make("DataType", DataType::F32)), + a_values_nightly), + beta_values_nightly), + lhs_transpose_values), + act_values)) +{ + // Validate output only if the target platform supports the OpenCL cl_khr_image2d_from_buffer extension + if(image2d_from_buffer_supported(CLKernelLibrary::get().get_device())) + { + validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32); + } + else + { + ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped"); + framework::ARM_COMPUTE_PRINT_INFO(); + } +} +TEST_SUITE_END() // ExportToCLImage TEST_SUITE_END() // FP32 TEST_SUITE(FP16) FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedFixture, framework::DatasetMode::ALL, - combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( + combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( m_values, n_values), k_values), @@ -438,6 +710,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedFixture, fram h0_values_precommit), i_values_lhs), i_values_rhs), + framework::dataset::make("export_to_cl_image_rhs", false)), framework::dataset::make("DataType", DataType::F16)), a_values_precommit), beta_values_precommit), @@ -450,7 +723,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedFixture, fram } FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMMatrixMultiplyReshapedFixture, framework::DatasetMode::DISABLED, - combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( + combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( m_values, n_values), k_values), @@ -462,6 +735,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMMatrixMultiplyReshapedFixture, fram h0_values_nightly), i_values_lhs), i_values_rhs), + framework::dataset::make("export_to_cl_image_rhs", false)), framework::dataset::make("DataType", DataType::F16)), a_values_nightly), beta_values_nightly), @@ -474,7 +748,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMMatrixMultiplyReshapedFixture, fram } FIXTURE_DATA_TEST_CASE(RunSmall3D, CLGEMMMatrixMultiplyReshaped3DFixture, framework::DatasetMode::ALL, - combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( + combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( m_w_values, m_h_values), n_values), @@ -487,6 +761,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall3D, CLGEMMMatrixMultiplyReshaped3DFixture, h0_values_precommit), i_values_lhs), i_values_rhs), + framework::dataset::make("export_to_cl_image_rhs", false)), framework::dataset::make("DataType", DataType::F16)), a_values_precommit), beta_values_precommit), @@ -498,7 +773,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall3D, CLGEMMMatrixMultiplyReshaped3DFixture, } FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMMatrixMultiplyReshaped3DFixture, framework::DatasetMode::DISABLED, - combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( + combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( m_w_values, m_h_values), n_values), @@ -511,6 +786,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMMatrixMultiplyReshaped3DFixture, h0_values_nightly), i_values_lhs), i_values_rhs), + framework::dataset::make("export_to_cl_image_rhs", false)), framework::dataset::make("DataType", DataType::F16)), a_values_nightly), beta_values_nightly), @@ -525,7 +801,7 @@ TEST_SUITE_END() // FP16 TEST_SUITE(MixedPrecision) FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedMixedPrecisionFixture, framework::DatasetMode::ALL, - combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( + combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( m_values, n_values), k_values), @@ -537,6 +813,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedMixedPrecisionFixtu h0_values_precommit), i_values_lhs), i_values_rhs), + framework::dataset::make("export_to_cl_image_rhs", false)), framework::dataset::make("DataType", DataType::F16)), a_values_precommit), beta_values_precommit), @@ -549,7 +826,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedMixedPrecisionFixtu } FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMMatrixMultiplyReshapedMixedPrecisionFixture, framework::DatasetMode::DISABLED, - combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( + combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( m_values, n_values), k_values), @@ -561,6 +838,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMMatrixMultiplyReshapedMixedPrecisionFixtu h0_values_nightly), i_values_lhs), i_values_rhs), + framework::dataset::make("export_to_cl_image_rhs", false)), framework::dataset::make("DataType", DataType::F16)), a_values_nightly), beta_values_nightly), @@ -573,7 +851,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMMatrixMultiplyReshapedMixedPrecisionFixtu } FIXTURE_DATA_TEST_CASE(RunSmall3D, CLGEMMMatrixMultiplyReshaped3DMixedPrecisionFixture, framework::DatasetMode::ALL, - combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( + combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( m_w_values, m_h_values), n_values), @@ -586,6 +864,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall3D, CLGEMMMatrixMultiplyReshaped3DMixedPrecisionF h0_values_precommit), i_values_lhs), i_values_rhs), + framework::dataset::make("export_to_cl_image_rhs", false)), framework::dataset::make("DataType", DataType::F16)), a_values_precommit), beta_values_precommit), @@ -597,7 +876,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall3D, CLGEMMMatrixMultiplyReshaped3DMixedPrecisionF } FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMMatrixMultiplyReshaped3DMixedPrecisionFixture, framework::DatasetMode::DISABLED, - combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( + combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( m_w_values, m_h_values), n_values), @@ -610,6 +889,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMMatrixMultiplyReshaped3DMixedPrecisionF h0_values_nightly), i_values_lhs), i_values_rhs), + framework::dataset::make("export_to_cl_image_rhs", false)), framework::dataset::make("DataType", DataType::F16)), a_values_nightly), beta_values_nightly), diff --git a/tests/validation/CL/GEMMReshapeRHSMatrix.cpp b/tests/validation/CL/GEMMReshapeRHSMatrix.cpp index aa6667666c..9b956b9cc5 100644 --- a/tests/validation/CL/GEMMReshapeRHSMatrix.cpp +++ b/tests/validation/CL/GEMMReshapeRHSMatrix.cpp @@ -129,7 +129,7 @@ DATA_TEST_CASE(ValidatePadding, framework::DatasetMode::ALL, combine(combine(com TensorShape(32U, 16U, 2U) }), framework::dataset::make("N0",{ 4 })), - framework::dataset::make("K0",{ 2, 4, 8 })), + framework::dataset::make("K0",{ 4, 8, 16 })), framework::dataset::make("H0",{ 1, 2, 4 })), input_shape, n0, k0, h0) { diff --git a/tests/validation/fixtures/GEMMFixture.h b/tests/validation/fixtures/GEMMFixture.h index efe7567075..b2adf2dfc0 100644 --- a/tests/validation/fixtures/GEMMFixture.h +++ b/tests/validation/fixtures/GEMMFixture.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -678,7 +678,7 @@ class GEMMMatrixMultiplyReshapedValidationFixture : public framework::Fixture public: template void setup(unsigned int m, unsigned int n, unsigned int k, unsigned int batch_size, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0, bool interleave_lhs, - bool interleave_rhs, DataType data_type, float alpha, float beta, bool broadcast_bias, bool lhs_transpose, const ActivationLayerInfo &act_info) + bool interleave_rhs, bool export_to_cl_image, DataType data_type, float alpha, float beta, bool broadcast_bias, bool lhs_transpose, const ActivationLayerInfo &act_info) { GEMMLHSMatrixInfo lhs_info; lhs_info.m0 = m0; @@ -688,11 +688,12 @@ public: lhs_info.transpose = lhs_transpose; GEMMRHSMatrixInfo rhs_info; - rhs_info.n0 = n0; - rhs_info.k0 = k0; - rhs_info.h0 = h0; - rhs_info.interleave = interleave_rhs; - rhs_info.transpose = !lhs_transpose; + rhs_info.n0 = n0; + rhs_info.k0 = k0; + rhs_info.h0 = h0; + rhs_info.interleave = interleave_rhs; + rhs_info.transpose = !lhs_transpose; + rhs_info.export_to_cl_image = export_to_cl_image; // Set the tensor shapes for LHS and RHS matrices const TensorShape lhs_shape(k, m, batch_size); @@ -833,8 +834,7 @@ class GEMMMatrixMultiplyReshaped3DValidationFixture : public framework::Fixture public: template void setup(unsigned int m_w, unsigned int m_h, unsigned int n, unsigned int k, unsigned int batch_size, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0, - bool interleave_lhs, - bool interleave_rhs, DataType data_type, float alpha, float beta, bool lhs_transpose, const ActivationLayerInfo &act_info) + bool interleave_lhs, bool interleave_rhs, bool export_to_cl_image, DataType data_type, float alpha, float beta, bool lhs_transpose, const ActivationLayerInfo &act_info) { GEMMLHSMatrixInfo lhs_info; lhs_info.m0 = m0; @@ -844,11 +844,12 @@ public: lhs_info.transpose = lhs_transpose; GEMMRHSMatrixInfo rhs_info; - rhs_info.n0 = n0; - rhs_info.k0 = k0; - rhs_info.h0 = h0; - rhs_info.interleave = interleave_rhs; - rhs_info.transpose = !lhs_transpose; + rhs_info.n0 = n0; + rhs_info.k0 = k0; + rhs_info.h0 = h0; + rhs_info.interleave = interleave_rhs; + rhs_info.transpose = !lhs_transpose; + rhs_info.export_to_cl_image = export_to_cl_image; // In case of GEMM3D, m is the product between m_w and m_h const unsigned int m = m_w * m_h; -- cgit v1.2.1