From 781cba7f33e056b1ca470ab34eb478177768eaf4 Mon Sep 17 00:00:00 2001
From: Gian Marco Iodice
Date: Fri, 19 Jun 2020 16:56:57 +0100
Subject: COMPMID-3322: Add cl_image support for GEMMReshapedOnlyRHS NT
 COMPMID-3323: Add cl_image support for GEMMReshapedOnlyRHS T

- Added support for cl_image in CLGEMMMatrixMultiplyReshapedOnlyRHSKernel (both NT and T kernels)
- Extended the tests for validating rhs_info.export_to_cl_image = true
- Updated doxygen documentation in CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h

Change-Id: If253794323aac072d84a4d8680b9a2339ab7ad92
Signed-off-by: Gian Marco Iodice
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3437
Tested-by: Arm Jenkins
Reviewed-by: Michele Di Giorgio
Reviewed-by: Georgios Pinitas
Comments-Addressed: Arm Jenkins
---
 .../CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp  | 101 ++++++++++++++-------
 1 file changed, 70 insertions(+), 31 deletions(-)

(limited to 'src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp')
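Note: the sketch below is illustrative and is not part of the applied change. It shows how a caller might opt into the cl_image RHS path enabled by this patch, using the GEMMRHSMatrixInfo fields referenced in the diff; the helper name and the chosen block sizes are assumptions, and the surrounding tensor/kernel setup is omitted.

    #include "arm_compute/core/Types.h"

    // Hypothetical helper: fills an RHS descriptor that satisfies the checks added
    // in validate_arguments() below (F32 data, n0/k0 different from 2 and 3, and the
    // cl_khr_image2d_from_buffer extension available on the target device).
    arm_compute::GEMMRHSMatrixInfo make_rhs_info_with_cl_image()
    {
        arm_compute::GEMMRHSMatrixInfo rhs_info{};
        rhs_info.n0                 = 4;    // n0 = 2 or 3 is rejected when exporting to cl_image
        rhs_info.k0                 = 4;    // k0 = 2 or 3 is rejected when exporting to cl_image
        rhs_info.h0                 = 4;    // illustrative value
        rhs_info.transpose          = true; // selects the gemm_mm_reshaped_only_rhs_t_texture kernel
        rhs_info.interleave         = true;
        rhs_info.export_to_cl_image = true; // bind the reshaped RHS as an OpenCL image in run()
        return rhs_info;
    }

When export_to_cl_image is set, the diff below defines OPENCL_IMAGE_SUPPORT and RHS_HEIGHT at build time and, at run time, binds a cl::Image2D created from the RHS buffer instead of the plain buffer argument.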
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp
index 8e194d5139..deeb491fd7 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp
@@ -32,6 +32,7 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/helpers/float_ops.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/CL/CLUtils.h"
 #include "support/StringSupport.h"
 
 #include
@@ -65,6 +66,23 @@ Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1,
                                     "Bias addition only supported with broadcast mode in case the input or output has to be reinterpreted as 3D");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported");
 
+    if(rhs_info.export_to_cl_image)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.n0 == 2) || (rhs_info.n0 == 3), "Export to cl_image only supported with n0 = 4, 8 or 16");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.k0 == 2) || (rhs_info.k0 == 3), "Export to cl_image only supported with k0 = 4, 8 or 16");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->data_type() != DataType::F32, "Export to cl_image only supported with F32 data type");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(!image2d_from_buffer_supported(CLKernelLibrary::get().get_device()), "The extension cl_khr_image2d_from_buffer is not supported on the target platform");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0, "Impossible to retrieve the cl_image pitch alignment");
+
+        // Check the width and height of the output tensor.
+        // Since we cannot create a 3d image from a buffer, the third dimension is collapsed with the second dimension
+        size_t max_image_w = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>();
+        size_t max_image_h = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>();
+
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->tensor_shape()[0] > max_image_w * 4, "Not supported width for cl_image");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->tensor_shape()[1] * input1->tensor_shape()[2] > max_image_h, "Not supported height for cl_image");
+    }
+
     const unsigned int m = gemm_info.m;
     const unsigned int n = gemm_info.n;
     const unsigned int k = gemm_info.k;
@@ -204,7 +222,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITe
 
 CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::CLGEMMMatrixMultiplyReshapedOnlyRHSKernel()
     : _input0(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false), _use_dummy_work_items(false),
-      _add_bias(false), _broadcast_bias(false)
+      _add_bias(false), _broadcast_bias(false), _export_to_cl_image(false)
 {
 }
 
@@ -234,6 +252,7 @@ void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::configure(const CLCompileContext
     _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
     _add_bias             = _input2 != nullptr;
     _broadcast_bias       = gemm_info.broadcast_bias;
+    _export_to_cl_image   = rhs_info.export_to_cl_image;
 
     // In case both input and output have to be reinterpreted as 3D tensors,
     // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
@@ -276,6 +295,8 @@ void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::configure(const CLCompileContext
     build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
     build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
     build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
+    build_opts.add_option_if(rhs_info.export_to_cl_image, "-DOPENCL_IMAGE_SUPPORT");
+    build_opts.add_option("-DRHS_HEIGHT=" + support::cpp11::to_string(input1->info()->dimension(1)));
     build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m));
     build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n));
     build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k));
@@ -289,6 +310,7 @@ void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::configure(const CLCompileContext
 
     std::string kernel_name("gemm_mm_reshaped_only_rhs_");
     kernel_name += rhs_info.transpose ? "t" : "nt";
+    kernel_name += rhs_info.export_to_cl_image ? "_texture" : "";
"_texture" : ""; // Create kernel _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); @@ -358,36 +380,17 @@ void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::run(const Window &window, cl::Co slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); - if(_reinterpret_input_as_3d) - { - // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor - unsigned int idx0; - if(_add_bias) - { - idx0 = 4 * num_arguments_per_2D_tensor() + 4; - } - else - { - idx0 = 3 * num_arguments_per_2D_tensor() + 3; - } - const unsigned int total_cross_plane_pad = _input0->info()->padding().top + _input0->info()->padding().bottom; - _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); - } + const unsigned int total_cross_plane_pad_lhs = _input0->info()->padding().top + _input0->info()->padding().bottom; + const unsigned int total_cross_plane_pad_out = _output->info()->padding().top + _output->info()->padding().bottom; + + cl::Image2D input1_image2d; - if(_reinterpret_output_as_3d) + if(_export_to_cl_image) { - // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor - unsigned int idx0; - if(_add_bias) - { - idx0 = 4 * num_arguments_per_2D_tensor() + 4 + (_reinterpret_input_as_3d ? 1 : 0); - } - else - { - idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0); - } - const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom; - _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); + const TensorShape shape2d(_input1->info()->dimension(0) / 4, _input1->info()->dimension(1) * _input1->info()->dimension(2)); + const size_t image_row_pitch = _input1->info()->strides_in_bytes()[1]; + + input1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), _input1->cl_buffer(), shape2d, CL_FLOAT, image_row_pitch); } do @@ -401,17 +404,53 @@ void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::run(const Window &window, cl::Co } unsigned int idx = 0; + + // LHS buffer add_2D_tensor_argument(idx, _input0, slice); - add_2D_tensor_argument(idx, _input1, slice_b); - add_2D_tensor_argument_if((_add_bias), idx, _input2, slice); + + // RHS buffer or RHS OpenCL image (_export_to_cl_image == true) + if(_export_to_cl_image) + { + _kernel.setArg(idx++, input1_image2d); + } + else + { + add_2D_tensor_argument(idx, _input1, slice_b); + } + + // Bias buffer (_add_bias == true) + add_2D_tensor_argument_if(_add_bias, idx, _input2, slice); + + // Output buffer add_2D_tensor_argument(idx, _output, slice); + + // LHS stride_z _kernel.setArg(idx++, static_cast(_input0->info()->strides_in_bytes()[2])); + + // RHS stride_z (not used if _export_to_cl_image == true) _kernel.setArg(idx++, static_cast(_input1->info()->strides_in_bytes()[2])); + + // Bias stride_z (if _add_bias == true) if(_add_bias) { _kernel.setArg(idx++, static_cast(_input2->info()->strides_in_bytes()[2])); } + + // Output stride_z _kernel.setArg(idx++, static_cast(_output->info()->strides_in_bytes()[2])); + + // Cross-plan padding (if _reinterpret_input_as_3d = true) + if(_reinterpret_input_as_3d) + { + _kernel.setArg(idx++, static_cast(total_cross_plane_pad_lhs)); + } + + // Cross-plan padding (if _reinterpret_output_as_3d = true) + if(_reinterpret_output_as_3d) + { + _kernel.setArg(idx++, static_cast(total_cross_plane_pad_out)); + } + enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); } 
     while(window.slide_window_slice_3D(slice));

--
cgit v1.2.1
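For reference, a minimal standalone sketch (not from the library; function and parameter names are illustrative) restating the cl_image size check added in validate_arguments() above: the RHS is bound as an F32 RGBA image, so each texel packs four floats, and since a 3D image cannot be created from a buffer, the second and third tensor dimensions are collapsed into the image height.

    #include <cstddef>

    // Returns true if a reshaped RHS tensor of shape (w, h, d), expressed in F32
    // elements, fits the 2D image limits of the device, where max_image_w and
    // max_image_h are the values reported for CL_DEVICE_IMAGE2D_MAX_WIDTH and
    // CL_DEVICE_IMAGE2D_MAX_HEIGHT.
    bool rhs_fits_in_cl_image(std::size_t w, std::size_t h, std::size_t d,
                              std::size_t max_image_w, std::size_t max_image_h)
    {
        const bool width_ok  = w <= max_image_w * 4; // four F32 values per RGBA texel
        const bool height_ok = h * d <= max_image_h; // dims 1 and 2 collapsed into the height
        return width_ok && height_ok;
    }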