aboutsummaryrefslogtreecommitdiff
path: root/arm_compute
diff options
context:
space:
mode:
authorGian Marco Iodice <gianmarco.iodice@arm.com>2020-06-10 17:59:30 +0100
committerGian Marco Iodice <gianmarco.iodice@arm.com>2020-06-18 10:45:59 +0000
commite3a849af3d9e108704c6ce162f377398300d990d (patch)
treead9cb7004c64cc6747b2b04bf982b3aabef33676 /arm_compute
parent19023835fa5a73dea2823edf667c711b03bc5060 (diff)
downloadComputeLibrary-e3a849af3d9e108704c6ce162f377398300d990d.tar.gz
COMPMID-3320: Add cl_image support for GEMMReshaped T_NT
COMPMID-3321: Add cl_image support for GEMMReshaped NT_T - Added support for cl_image in CLGEMMMatrixMultiplyReshapedKernel (both NT and T kernels) - Extended the tests for the validating rhs_info.export_to_cl_image = true - Added utility macros in OpenCL to load data from a OpenCL image object - Updated doxygen documentation in CLGEMMMatrixMultiplyReshapedKernel.h - Updated doxygen documentation in CLGEMMReshapeRHSMatrixKernel.h Change-Id: I953b10e4ef205d1b76dcbc366e5a91fd5a8e1d5c Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com> Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3329 Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Diffstat (limited to 'arm_compute')
-rw-r--r--arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h59
-rw-r--r--arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h45
-rw-r--r--arm_compute/core/Types.h4
3 files changed, 89 insertions, 19 deletions
diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h
index ee8e57fa8c..aeedd50e0b 100644
--- a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h
@@ -55,19 +55,30 @@ public:
* Mixed precision combines different floating precisions during the computation, in particular, F32 for the accumulations and F16 for the
* multiplications. i.e. float c = (half)a * (half)b
*
- * @param[in] input0 Input tensor containing the LHS reshaped matrix. Data type supported: F16/F32. The number of dimensions for the LHS matrix must be less or equal than 4
+ * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will fetch the RHS data using the OpenCL read_image built-in function.
+ * Reading from the OpenCL image object can increase the performance. However, since the OpenCL image object is created importing the OpenCL buffer,
+ * the following conditions are required:
+ * -# rhs_info.n0 can only be 4, 8 and 16
+ * -# rhs_info.k0 can only be 4, 8 and 16
+ * -# Data type can only be F32
+ * -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension
+ * -# The stride Y for the input1 should satisfy the OpenCL pitch alignment requirement
+ * -# input1 width should be less or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
+ * -# input1 (height * depth) should be less or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
+ *
+ * @param[in] input0 Input tensor containing the LHS reshaped matrix. Data type supported: F16/F32 (only F32 if rhs_info.export_to_cl_image = true). The number of dimensions for the LHS matrix must be less or equal than 4
* @param[in] input1 Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less or equal than 3
* @param[in] input2 Input tensor containing the bias matrix. Data type supported: same as @p input0.
* @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
* @param[in] alpha Weight of the matrix product
* @param[in] beta Weight of the matrix bias
- * @param[in] lhs_info LHS matrix information used for reshaping the input0 tensor. Only the following values are supported:
+ * @param[in] lhs_info LHS matrix information used for reshaping the input0 tensor. Only the following values are supported:
* lhs_info.m0: 2,3,4,5,6,7,8
* lhs_info.k0: 2,3,4,8,16
* lhs_info.transpose: false
* @param[in] rhs_info RHS matrix information used for reshaping the input1 tensor. Only the following values are supported:
- * rhs_info.n0: 2,3,4,8,16
- * rhs_info.k0: 2,3,4,8,16
+ * rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true)
+ * rhs_info.k0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true)
* rhs_info.transpose: true
* @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
*
@@ -82,8 +93,19 @@ public:
* Mixed precision combines different floating precisions during the computation, in particular, F32 for the accumulations and F16 for the
* multiplications. i.e. float c = (half)a * (half)b
*
+ * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will fetch the RHS data using the OpenCL read_image built-in function.
+ * Reading from the OpenCL image object can increase the performance. However, since the OpenCL image object is created importing the OpenCL buffer,
+ * the following conditions are required:
+ * -# rhs_info.n0 can only be 4, 8 and 16
+ * -# rhs_info.k0 can only be 4, 8 and 16
+ * -# Data type can only be F32
+ * -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension
+ * -# The stride Y for the input1 should satisfy the OpenCL pitch alignment requirement
+ * -# input1 width should be less or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
+ * -# input1 (height * depth) should be less or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
+ *
* @param[in] compile_context The compile context to be used.
- * @param[in] input0 Input tensor containing the LHS reshaped matrix. Data type supported: F16/F32. The number of dimensions for the LHS matrix must be less or equal than 4
+ * @param[in] input0 Input tensor containing the LHS reshaped matrix. Data type supported: F16/F32 (only F32 if rhs_info.export_to_cl_image = true). The number of dimensions for the LHS matrix must be less or equal than 4
* @param[in] input1 Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less or equal than 3
* @param[in] input2 Input tensor containing the bias matrix. Data type supported: same as @p input0.
* @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
@@ -94,8 +116,8 @@ public:
* lhs_info.k0: 2,3,4,8,16
* lhs_info.transpose: false
* @param[in] rhs_info RHS matrix information used for reshaping the input1 tensor. Only the following values are supported:
- * rhs_info.n0: 2,3,4,8,16
- * rhs_info.k0: 2,3,4,8,16
+ * rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true)
+ * rhs_info.k0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true)
* rhs_info.transpose: true
* @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
*
@@ -107,7 +129,22 @@ public:
const GEMMKernelInfo &gemm_info);
/** Static function to check if given info will lead to a valid configuration of @ref CLGEMMMatrixMultiplyReshapedKernel
*
- * @param[in] input0 Input tensor containing the LHS reshaped matrix. Data type supported: F16/F32. The number of dimensions for the LHS matrix must be less or equal than 4
+ * @note The F16 computation also supports mixed precision through the gemm_info.fp_mixed_precision flag.
+ * Mixed precision combines different floating precisions during the computation, in particular, F32 for the accumulations and F16 for the
+ * multiplications. i.e. float c = (half)a * (half)b
+ *
+ * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will fetch the RHS data using the OpenCL read_image built-in function.
+ * Reading from the OpenCL image object can increase the performance. However, since the OpenCL image object is created importing the OpenCL buffer,
+ * the following conditions are required:
+ * -# rhs_info.n0 can only be 4, 8 and 16
+ * -# rhs_info.k0 can only be 4, 8 and 16
+ * -# Data type can only be F32
+ * -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension
+ * -# The stride Y for the input1 should satisfy the OpenCL pitch alignment requirement
+ * -# input1 width should be less or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
+ * -# input1 (height * depth) should be less or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
+ *
+ * @param[in] input0 Input tensor containing the LHS reshaped matrix. Data type supported: F16/F32 (only F32 if rhs_info.export_to_cl_image = true). The number of dimensions for the LHS matrix must be less or equal than 4
* @param[in] input1 Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less or equal than 3
* @param[in] input2 Input tensor info containing the bias matrix. Data type supported: same as @p input0.
* @param[in] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
@@ -118,8 +155,8 @@ public:
* lhs_info.k0: 2,3,4,8,16
* lhs_info.transpose: false
* @param[in] rhs_info RHS matrix information used for reshaping the input1 tensor. Only the following values are supported:
- * rhs_info.n0: 2,3,4,8,16
- * rhs_info.k0: 2,3,4,8,16
+ * rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true)
+ * rhs_info.k0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true)
* rhs_info.transpose: true
* @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
*
@@ -141,10 +178,10 @@ private:
ICLTensor *_output;
bool _slide_matrix_b;
bool _reinterpret_output_as_3d;
- unsigned int _k;
bool _use_dummy_work_items;
bool _add_bias;
bool _broadcast_bias;
+ bool _export_to_cl_image;
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_CLGEMMMATRIXMULTIPLYRESHAPEDKERNEL_H*/ \ No newline at end of file
diff --git a/arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h b/arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h
index 0e6352bdbb..59a4aaa912 100644
--- a/arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h
@@ -48,12 +48,23 @@ public:
CLGEMMReshapeRHSMatrixKernel &operator=(CLGEMMReshapeRHSMatrixKernel &&) = default;
/** Initialise the kernel's input and output.
*
+ * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will guarantee the OpenCL pitch alignment for the output tensor,
+ * required to create a OpenCL image object from buffer in @ref CLGEMMMatrixMultiplyReshapedKernel and in @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel
+ * Since the OpenCL image object is created importing the OpenCL buffer, the following conditions are required:
+ * -# rhs_info.n0 can only be 4, 8 and 16
+ * -# rhs_info.k0 can only be 4, 8 and 16
+ * -# Data type can only be F32
+ * -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension
+ * -# output width should be less or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
+ * -# output (height * depth) should be less or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
+ * -# The output tensor should be only consumed by @ref CLGEMMMatrixMultiplyReshapedKernel or @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel
+ *
* @param[in] input Input tensor. Data types supported: All
* @param[out] output Output tensor. Data type supported: same as @p input
* @param[in] rhs_info RHS matrix information to be used for reshaping. This object contains all the necessary
* information to reshape the input tensor. Only the following values are supported:
- * rhs_info.n0: 2,3,4,8,16
- * rhs_info.k0: 1,2,3,4,8,16 (k0 = 1 only if rhs_info.transpose = false)
+ * rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image == true)
+ * rhs_info.k0: 1,2,3,4,8,16 (k0 = 1 only if rhs_info.transpose = false), (only 4, 8 and 16 if rhs_info.export_to_cl_image == true)
* rhs_info.h0: greater than 0
* rhs_info.transpose: true, false
* rhs_info.interleave: true, false
@@ -61,13 +72,24 @@ public:
void configure(const ICLTensor *input, ICLTensor *output, const GEMMRHSMatrixInfo &rhs_info);
/** Initialise the kernel's input and output.
*
+ * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will guarantee the OpenCL pitch alignment for the output tensor,
+ * required to create a OpenCL image object from buffer in @ref CLGEMMMatrixMultiplyReshapedKernel and in @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel
+ * Since the OpenCL image object is created importing the OpenCL buffer, the following conditions are required:
+ * -# rhs_info.n0 can only be 4, 8 and 16
+ * -# rhs_info.k0 can only be 4, 8 and 16
+ * -# Data type can only be F32
+ * -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension
+ * -# output width should be less or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
+ * -# output (height * depth) should be less or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
+ * -# The output tensor should be only consumed by @ref CLGEMMMatrixMultiplyReshapedKernel or @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel
+ *
* @param[in] compile_context The compile context to be used.
* @param[in] input Input tensor. Data types supported: All
* @param[out] output Output tensor. Data type supported: same as @p input
* @param[in] rhs_info RHS matrix information to be used for reshaping. This object contains all the necessary
* information to reshape the input tensor. Only the following values are supported:
- * rhs_info.n0: 2,3,4,8,16
- * rhs_info.k0: 1,2,3,4,8,16 (k0 = 1 only if rhs_info.transpose = false)
+ * rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image == true)
+ * rhs_info.k0: 1,2,3,4,8,16 (k0 = 1 only if rhs_info.transpose = false), (only 4, 8 and 16 if rhs_info.export_to_cl_image == true)
* rhs_info.h0: greater than 0
* rhs_info.transpose: true, false
* rhs_info.interleave: true, false
@@ -75,12 +97,23 @@ public:
void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const GEMMRHSMatrixInfo &rhs_info);
/** Static function to check if given info will lead to a valid configuration of @ref CLGEMMReshapeRHSMatrixKernel
*
+ * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will guarantee the OpenCL pitch alignment for the output tensor,
+ * required to create a OpenCL image object from buffer in @ref CLGEMMMatrixMultiplyReshapedKernel and in @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel
+ * Since the OpenCL image object is created importing the OpenCL buffer, the following conditions are required:
+ * -# rhs_info.n0 can only be 4, 8 and 16
+ * -# rhs_info.k0 can only be 4, 8 and 16
+ * -# Data type can only be F32
+ * -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension
+ * -# output width should be less or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
+ * -# output (height * depth) should be less or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
+ * -# The output tensor should be only consumed by @ref CLGEMMMatrixMultiplyReshapedKernel or @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel
+ *
* @param[in] input Input tensor info. Data types supported: All
* @param[in] output Output tensor info which stores the interleaved matrix. Data type supported: same as @p input.
* @param[in] rhs_info RHS matrix information to be used for reshaping. This object contains all the necessary
* information to reshape the input tensor. Only the following values are supported:
- * rhs_info.n0: 2,3,4,8,16
- * rhs_info.k0: 1,2,3,4,8,16 (k0 = 1 only if rhs_info.transpose = false)
+ * rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image == true)
+ * rhs_info.k0: 1,2,3,4,8,16 (k0 = 1 only if rhs_info.transpose = false),(only 4, 8 and 16 if rhs_info.export_to_cl_image == true)
* rhs_info.h0: greater than 0
* rhs_info.transpose: true, false
* rhs_info.interleave: true, false
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index 4e73edba4b..d151496537 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -1912,8 +1912,8 @@ struct GEMMLHSMatrixInfo
struct GEMMRHSMatrixInfo
{
GEMMRHSMatrixInfo() = default;
- GEMMRHSMatrixInfo(unsigned int n, unsigned int k, unsigned int h, bool trans, bool inter)
- : n0(n), k0(k), h0(h), transpose(trans), interleave(inter)
+ GEMMRHSMatrixInfo(unsigned int n, unsigned int k, unsigned int h, bool trans, bool inter, bool export_to_cl_img)
+ : n0(n), k0(k), h0(h), transpose(trans), interleave(inter), export_to_cl_image(export_to_cl_img)
{
}
unsigned int n0{ 1 }; /**< Number of columns processed by the matrix multiplication */