From a98dee2da0aef1c53a31045b0c681fb0abc8f8ba Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Tue, 2 Jun 2020 12:12:35 +0100 Subject: COMPMID-3319: Force padding requirement in CLGEMMReshapeRHSMatrixKernel Added padding requirement in CLGEMMReshapeRHSMatrixKernel in order to create 2d image from a cl_buffer. Test extended in order to validate the padding requirement Change-Id: I36bcaf3e9299ee186602b4e3456851cc8cda6ce6 Signed-off-by: Gian Marco Iodice Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3292 Reviewed-by: Georgios Pinitas Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins --- arm_compute/core/CL/CLHelpers.h | 16 ++++++++ arm_compute/core/CL/OpenCL.h | 1 + src/core/CL/CLHelpers.cpp | 21 +++++++++++ src/core/CL/OpenCL.cpp | 25 +++++++++++++ .../CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp | 30 +++++++++++++++ tests/validation/CL/GEMMReshapeRHSMatrix.cpp | 43 ++++++++++++++++++++++ 6 files changed, 136 insertions(+) diff --git a/arm_compute/core/CL/CLHelpers.h b/arm_compute/core/CL/CLHelpers.h index c5db66c664..fc3f4d5db0 100644 --- a/arm_compute/core/CL/CLHelpers.h +++ b/arm_compute/core/CL/CLHelpers.h @@ -121,6 +121,14 @@ GPUTarget get_target_from_device(const cl::Device &device); */ CLVersion get_cl_version(const cl::Device &device); +/** Helper function to get the cl_image pitch alignment in pixels + * + * @param[in] device A CL device + * + * @return the cl_image pitch alignment in pixels. 
If an error occurs, the function will return 0 + */ +size_t get_cl_image_pitch_alignment(const cl::Device &device); + /** Helper function to check whether a given extension is supported * * @param[in] device A CL device @@ -188,6 +196,14 @@ size_t preferred_vector_width(const cl::Device &device, DataType dt); */ bool preferred_dummy_work_items_support(const cl::Device &device); +/** Helper function to check whether the cl_khr_image2d_from_buffer extension is supported + * + * @param[in] device A CL device + * + * @return True if the extension is supported + */ +bool image2d_from_buffer_supported(const cl::Device &device); + /** Creates an opencl kernel * * @param[in] ctx A context to be used to create the opencl kernel. diff --git a/arm_compute/core/CL/OpenCL.h b/arm_compute/core/CL/OpenCL.h index b87cc67b72..72cbb3d2b2 100644 --- a/arm_compute/core/CL/OpenCL.h +++ b/arm_compute/core/CL/OpenCL.h @@ -134,6 +134,7 @@ public: DECLARE_FUNCTION_PTR(clEnqueueSVMUnmap); DECLARE_FUNCTION_PTR(clEnqueueMarker); DECLARE_FUNCTION_PTR(clWaitForEvents); + DECLARE_FUNCTION_PTR(clCreateImage); // Third-party extensions DECLARE_FUNCTION_PTR(clImportMemoryARM); diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp index 15f45d52d9..44695d00fe 100644 --- a/src/core/CL/CLHelpers.cpp +++ b/src/core/CL/CLHelpers.cpp @@ -370,6 +370,27 @@ bool preferred_dummy_work_items_support(const cl::Device &device) return true; } +bool image2d_from_buffer_supported(const cl::Device &device) +{ + return device_supports_extension(device, "cl_khr_image2d_from_buffer"); +} + +size_t get_cl_image_pitch_alignment(const cl::Device &device) +{ + cl_uint pixel_aligment = 0; + + cl_int err = clGetDeviceInfo(device(), CL_DEVICE_IMAGE_PITCH_ALIGNMENT, sizeof(cl_uint), &pixel_aligment, nullptr); + + if(err == CL_SUCCESS) + { + return pixel_aligment; + } + else + { + return 0; + } +} + cl::Kernel create_opencl_kernel(CLCoreRuntimeContext *ctx, const std::string &kernel_name, const CLBuildOptions 
&build_opts) { if(ctx && ctx->kernel_library()) diff --git a/src/core/CL/OpenCL.cpp b/src/core/CL/OpenCL.cpp index 9a3e344f1f..809f21b89e 100644 --- a/src/core/CL/OpenCL.cpp +++ b/src/core/CL/OpenCL.cpp @@ -133,6 +133,7 @@ bool CLSymbols::load(const std::string &library) LOAD_FUNCTION_PTR(clEnqueueSVMUnmap, handle); LOAD_FUNCTION_PTR(clEnqueueMarker, handle); LOAD_FUNCTION_PTR(clWaitForEvents, handle); + LOAD_FUNCTION_PTR(clCreateImage, handle); // Third-party extensions LOAD_FUNCTION_PTR(clImportMemoryARM, handle); @@ -937,6 +938,30 @@ clGetEventProfilingInfo(cl_event event, } } +cl_mem +clCreateImage(cl_context context, + cl_mem_flags flags, + const cl_image_format *image_format, + const cl_image_desc *image_desc, + void *host_ptr, + cl_int *errcode_ret) +{ + arm_compute::CLSymbols::get().load_default(); + auto func = arm_compute::CLSymbols::get().clCreateImage_ptr; + if(func != nullptr) + { + return func(context, flags, image_format, image_desc, host_ptr, errcode_ret); + } + else + { + if(errcode_ret != nullptr) + { + *errcode_ret = CL_OUT_OF_RESOURCES; + } + return nullptr; + } +} + cl_mem clImportMemoryARM(cl_context context, cl_mem_flags flags, diff --git a/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp b/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp index 4217932097..43e7b92c6a 100644 --- a/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp +++ b/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp @@ -54,6 +54,23 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 > 16); ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16); ARM_COMPUTE_RETURN_ERROR_ON((rhs_info.k0 == 1) && (rhs_info.transpose)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image && ((rhs_info.n0 != 4) || input->data_type() != DataType::F32), "Export to cl_image only supported with n0 = 4 and F32 data type"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image + && 
!image2d_from_buffer_supported(CLKernelLibrary::get().get_device()), "The extension cl_khr_image2d_from_buffer is not supported on the target platform"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image && (get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0), "Impossible to retrieve the cl_image pitch alignment"); + + if(rhs_info.export_to_cl_image) + { + TensorShape output_shape = compute_rhs_reshaped_shape(*input, rhs_info); + + // Check the width and height of the output tensor. + // Since we cannot create a 3d image from a buffer, the third dimension is collapsed with the second dimension + size_t max_image_w = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>(); + size_t max_image_h = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>(); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape[0] > max_image_w * 4, "Not supported width for cl_image"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape[1] * output_shape[2] > max_image_h, "Not supported height for cl_image"); + } ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); @@ -86,6 +103,19 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen window_changed = update_window_and_padding(win, input_access); output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape())); + if(rhs_info.export_to_cl_image) + { + constexpr unsigned int num_floats_per_pixel = 4; + + const unsigned int stride_y_in_elements = output->strides_in_bytes()[1] / output->element_size(); + const unsigned int pixel_aligment = get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()); + const unsigned int row_pitch_alignment = pixel_aligment * num_floats_per_pixel; + const unsigned int round_up_width = ((stride_y_in_elements + row_pitch_alignment - 1) / row_pitch_alignment) * row_pitch_alignment; + const unsigned int padding = round_up_width - stride_y_in_elements; + + output->extend_padding(PaddingSize(0,
padding, 0, 0)); + } + // Collapse along the Z direction // This collapse needs to be here in order to tune the Z dimension of LWS Window collapsed = win.collapse(win, Window::DimZ); diff --git a/tests/validation/CL/GEMMReshapeRHSMatrix.cpp b/tests/validation/CL/GEMMReshapeRHSMatrix.cpp index 55688cf160..aa6667666c 100644 --- a/tests/validation/CL/GEMMReshapeRHSMatrix.cpp +++ b/tests/validation/CL/GEMMReshapeRHSMatrix.cpp @@ -123,6 +123,49 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip( bool has_error = bool(CLGEMMReshapeRHSMatrixKernel::validate(&input_info.clone()->set_is_resizable(false), (output_info.total_size() == 0) ? nullptr : &output_info.clone()->set_is_resizable(false), rhs_info)); ARM_COMPUTE_EXPECT(has_error == expected, framework::LogLevel::ERRORS); } + +DATA_TEST_CASE(ValidatePadding, framework::DatasetMode::ALL, combine(combine(combine( + framework::dataset::make("InputShape", { TensorShape(32U, 16U, 1U), + TensorShape(32U, 16U, 2U) + }), + framework::dataset::make("N0",{ 4 })), + framework::dataset::make("K0",{ 2, 4, 8 })), + framework::dataset::make("H0",{ 1, 2, 4 })), + input_shape, n0, k0, h0) +{ + CLTensor input; + CLTensor output; + + input.info()->init(input_shape, 1, DataType::F32); + + unsigned int padding = 0; + + GEMMRHSMatrixInfo rhs_info; + rhs_info.n0 = n0; + rhs_info.k0 = k0; + rhs_info.h0 = h0; + rhs_info.transpose = true; + rhs_info.interleave = true; + rhs_info.export_to_cl_image = image2d_from_buffer_supported(CLKernelLibrary::get().get_device()) && (get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) != 0); + + if(rhs_info.export_to_cl_image) + { + TensorShape output_shape = compute_rhs_reshaped_shape(*input.info(), rhs_info); + constexpr unsigned int num_floats_per_pixel = 4; + + const unsigned int pixel_aligment = get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()); + const unsigned int row_pitch_alignment = pixel_aligment * num_floats_per_pixel; + const unsigned int 
round_up_width = ((output_shape[0] + row_pitch_alignment - 1) / row_pitch_alignment) * row_pitch_alignment; + + padding = round_up_width - output_shape[0]; + } + + CLGEMMReshapeRHSMatrixKernel kernel; + + kernel.configure(&input, &output, rhs_info); + + ARM_COMPUTE_EXPECT((output.info()->padding().right == padding), framework::LogLevel::ERRORS); +} // clang-format on // *INDENT-ON* -- cgit v1.2.1