From 856f66e6c61b77d03f754cd0fa8439891f0e4aca Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Thu, 22 Apr 2021 21:13:21 +0100 Subject: Port CLGEMM to memory injecting interface Moves the following kernels: - CLGEMMMatrixMultiplyKernel - CLGEMMMatrixMultiplyNativeKernel - CLGEMMMatrixMultiplyReshapedKernel - CLGEMMMatrixMultiplyReshapedOnlyRHSKernel Moves the following functions: - CLGEMM Introduces facilities to ease the handling of auxiliary temporary buffers under the new run interface. These are: - CLAuxTensorHandler: allows wrapping workspace buffer memory in CLBuffer objects - Ability to inject a TensorInfo into an allocator without transferring ownership; this reduces the copy overhead when needed. Resolves: COMPMID-4188 Signed-off-by: Georgios Pinitas Change-Id: I7055435d831b05b749b26302082e4ac45f26dfb0 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5498 Tested-by: Arm Jenkins Reviewed-by: Michalis Spyrou Comments-Addressed: Arm Jenkins --- .../gpu/cl/kernels/ClDirectConvolutionKernel.cpp | 6 +- .../gpu/cl/kernels/ClGemmMatrixMultiplyKernel.cpp | 533 +++++++++++++++++++ .../gpu/cl/kernels/ClGemmMatrixMultiplyKernel.h | 88 ++++ .../kernels/ClGemmMatrixMultiplyNativeKernel.cpp | 411 +++++++++++++++ .../cl/kernels/ClGemmMatrixMultiplyNativeKernel.h | 88 ++++ .../kernels/ClGemmMatrixMultiplyReshapedKernel.cpp | 416 +++++++++++++++ .../kernels/ClGemmMatrixMultiplyReshapedKernel.h | 113 ++++ .../ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp | 438 ++++++++++++++++ .../ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h | 104 ++++ .../cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp | 219 ++++++++ .../gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h | 78 +++ .../cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp | 170 ++++++ .../gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h | 84 +++ src/core/gpu/cl/kernels/gemm/ClGemmHelpers.cpp | 116 +++++ src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h | 95 ++++ src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h | 123 +++++ .../native/ClGemmDefaultConfigNativeBifrost.cpp | 246 +++++++++ .../gemm/native/ClGemmDefaultConfigNativeBifrost.h | 62 +++ .../native/ClGemmDefaultConfigNativeMidgard.cpp | 73 +++ .../gemm/native/ClGemmDefaultConfigNativeMidgard.h | 57 +++ .../native/ClGemmDefaultConfigNativeValhall.cpp | 168 ++++++ .../gemm/native/ClGemmDefaultConfigNativeValhall.h | 59 +++ .../kernels/gemm/native/ClGemmNativeKernelConfig.h | 71 +++ .../ClGemmDefaultConfigReshapedBifrost.cpp | 356 +++++++++++++ .../reshaped/ClGemmDefaultConfigReshapedBifrost.h | 64 +++ .../ClGemmDefaultConfigReshapedValhall.cpp | 538 +++++++++++++++++++ .../reshaped/ClGemmDefaultConfigReshapedValhall.h | 61 +++ .../gemm/reshaped/ClGemmReshapedKernelConfig.h | 69 +++ .../ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp | 518 +++++++++++++++++++ .../ClGemmDefaultConfigReshapedRhsOnlyBifrost.h | 67 +++ .../ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp | 570 +++++++++++++++++++++ .../ClGemmDefaultConfigReshapedRhsOnlyValhall.h | 61 +++ .../ClGemmDefaultReshapedRhsOnlyBifrost.cpp | 518 +++++++++++++++++++ .../ClGemmDefaultReshapedRhsOnlyValhall.cpp | 570 +++++++++++++++++++++ .../ClGemmReshapedOnlyRhsKernelConfig.h | 69 +++ 35 files changed, 7276 insertions(+), 3 deletions(-) create mode 100644 src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.cpp create mode 100644 src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.h create mode 100644 src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp create mode 100644 src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h create mode 100644
src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp create mode 100644 src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h create mode 100644 src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp create mode 100644 src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h create mode 100644 src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp create mode 100644 src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h create mode 100644 src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp create mode 100644 src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h create mode 100644 src/core/gpu/cl/kernels/gemm/ClGemmHelpers.cpp create mode 100644 src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h create mode 100644 src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h create mode 100644 src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp create mode 100644 src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h create mode 100644 src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp create mode 100644 src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h create mode 100644 src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp create mode 100644 src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h create mode 100644 src/core/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h create mode 100644 src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp create mode 100644 src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h create mode 100644 src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp create mode 100644 src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h create mode 100644 src/core/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h create mode 100644 src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp create mode 100644 src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h create mode 100644 src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp create mode 100644 src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h create mode 100644 src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultReshapedRhsOnlyBifrost.cpp create mode 100644 src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultReshapedRhsOnlyValhall.cpp create mode 100644 src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h (limited to 'src/core/gpu') diff --git a/src/core/gpu/cl/kernels/ClDirectConvolutionKernel.cpp b/src/core/gpu/cl/kernels/ClDirectConvolutionKernel.cpp index 18d648d2f2..0a5101f564 100644 --- a/src/core/gpu/cl/kernels/ClDirectConvolutionKernel.cpp +++ b/src/core/gpu/cl/kernels/ClDirectConvolutionKernel.cpp @@ -35,7 +35,7 @@ #include "src/core/AccessWindowStatic.h" #include "src/core/CL/CLUtils.h" #include "src/core/CL/CLValidate.h" -#include "src/core/CL/gemm/CLGEMMHelpers.h" +#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "support/Cast.h" @@ -416,7 +416,7 @@ void ClDirectConvolutionKernel::configure(const CLCompileContext &compile_contex const unsigned int n0 = win_config.second.x().step(); const unsigned int m0 = win_config.second.y().step(); - const unsigned 
int k0 = adjust_vec_size(is_data_type_quantized(data_type)? 16u : 8u, src->dimension(channel_idx)); + const unsigned int k0 = adjust_vec_size(is_data_type_quantized(data_type) ? 16u : 8u, src->dimension(channel_idx)); const unsigned int partial_store_n0 = dst->dimension(channel_idx) % n0; const unsigned int pad_left = conv_info.pad_left(); const unsigned int pad_top = conv_info.pad_top(); @@ -425,7 +425,7 @@ void ClDirectConvolutionKernel::configure(const CLCompileContext &compile_contex // Update the padding for the weights tensor if we can export to cl_image if(export_to_cl_image) { - arm_compute::cl_gemm::update_padding_for_cl_image(weights); + gemm::update_padding_for_cl_image(weights); } if(biases != nullptr) diff --git a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.cpp b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.cpp new file mode 100644 index 0000000000..817a105b14 --- /dev/null +++ b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.cpp @@ -0,0 +1,533 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/utils/helpers/float_ops.h" +#include "support/Cast.h" +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +using ElementsProcessed = Steps; + +inline Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float beta, + bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, bool fp_mixed_precision) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src0); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((fp_mixed_precision && (src0->data_type() != DataType::F16)), "Mixed precision floating point is supported only for F16 data"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the matrix A must be <= 4"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the matrix B must be <= 3"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_interleaved_transposed && reshape_info.reinterpret_input_as_3d(), "The input tensor cannot be reinterpreted as 3D if is_interleaved_transposed is true"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 2 && reshape_info.reinterpret_input_as_3d(), "The src1 tensor cannot have more than 2 dimensions if src0 has to be reinterpreted as 3D"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((reshape_info.reinterpret_input_as_3d() || reshape_info.depth_output_gemm3d() != 0) && (src2 != nullptr) + && (!reshape_info.broadcast_bias()), + "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D"); + + if(!is_interleaved_transposed) + { + ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != src1->dimension(1)); + + if(src2 != nullptr && !(helpers::float_ops::is_zero(beta))) + { + const unsigned int m = reshape_info.reinterpret_input_as_3d() ? 
src0->dimension(1) * src0->dimension(2) : src0->dimension(1); + const unsigned int n = src1->dimension(0); + const unsigned int src2_dim0 = src2->dimension(0); + const unsigned int src2_dim1 = src2->dimension(1); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src1); + if(reshape_info.broadcast_bias()) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted"); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim0 != n || src2_dim1 != m), "Incorrect dimension of bias matrix"); + } + } + } + else + { + GEMMRHSMatrixInfo rhs_info; + GEMMLHSMatrixInfo lhs_info; + const auto m = static_cast<unsigned int>(reshape_info.m()); + const auto n = static_cast<unsigned int>(reshape_info.n()); + const int k = reshape_info.k(); + const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width(); + const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height(); + rhs_info.n0 = max_cl_vector_width / src1->element_size(); + rhs_info.k0 = 1; + rhs_info.h0 = mult_transpose1xW_width; + rhs_info.interleave = false; + rhs_info.transpose = false; + lhs_info.m0 = 4; + lhs_info.k0 = 4; + lhs_info.v0 = mult_interleave4x4_height; + lhs_info.interleave = true; + lhs_info.transpose = true; + + TensorShape tensor_shape0{ src0->tensor_shape() }; + tensor_shape0.set(0, k); + tensor_shape0.set(1, m); + + TensorShape tensor_shape1{ src1->tensor_shape() }; + tensor_shape1.set(0, n); + tensor_shape1.set(1, k); + + const TensorInfo tensor_info0 = src0->clone()->set_tensor_shape(tensor_shape0); + const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); + + const TensorInfo tensor_info_reshaped0 = src0->clone()->set_tensor_shape(misc::shape_calculator::compute_lhs_reshaped_shape(tensor_info0, lhs_info)); + const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info)); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src0, &tensor_info_reshaped0); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1); + + if(src2 != nullptr && !(helpers::float_ops::is_zero(beta))) + { + const unsigned int src2_dim0 = src2->dimension(0); + const unsigned int src2_dim1 = src2->dimension(1); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src1); + if(reshape_info.broadcast_bias()) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted"); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim0 != n || src2_dim1 != m), "Incorrect dimension of bias matrix"); + } + } + } + + if(dst->total_size() != 0) + { + const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, is_interleaved_transposed, reshape_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst); + } + + return Status{}; +} + +inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, + float beta, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target, + ElementsProcessed &num_elements_processed) +{ + ARM_COMPUTE_UNUSED(beta); + bool window_changed = false; + Window win{}; + Window win_out{}; + + const DataType data_type = src0->data_type(); + unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; +
unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; + bool reinterpret_input_as_3d = reshape_info.reinterpret_input_as_3d(); + bool reinterpret_output_as_3d = (reshape_info.depth_output_gemm3d() != 0); + + // In case both input and dst have to be reinterpreted as 3D tensors, + // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. + if(reinterpret_input_as_3d == reinterpret_output_as_3d) + { + reinterpret_input_as_3d = false; + reinterpret_output_as_3d = false; + } + + // dst tensor auto initialization if not yet initialized + auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, is_interleaved_transposed, reshape_info))); + + TensorInfo tmp_info(*dst); + + if(reinterpret_output_as_3d) + { + // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, + // the window needs to be constructed on the 2D collapsed version of the tensor + TensorShape tmp_shape(dst->tensor_shape()); + tmp_shape.collapse(2U, 1U); + tmp_info.set_tensor_shape(tmp_shape); + } + + if(is_interleaved_transposed) + { + // reinterpret_input_as_3d is not supported if is_interleaved_transposed is set + ARM_COMPUTE_ERROR_ON(reshape_info.reinterpret_input_as_3d()); + + // Configure kernel window + num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type); + num_elems_processed_per_iteration_y = 4; + + win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + if(src2 != nullptr) + { + const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x; + + const int bias_processed_per_iteration_y = reshape_info.broadcast_bias() ? 1 : num_elems_processed_per_iteration_y; + + AccessWindowStatic src2_access(src2, 0, 0, + ceil_to_multiple(src2->dimension(0), bias_processed_per_iteration_x), + ceil_to_multiple(src2->dimension(1), bias_processed_per_iteration_y)); + + window_changed = update_window_and_padding(win, src2_access); // window used by the execute_window_loop + } + } + else // The input tensors have not been reshaped + { + // Special case for 1xN, 2xN, 3xN and 4xN src0 tensor. num_elems_processed_per_iteration_x is set up for the default case. + num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type); + num_elems_processed_per_iteration_y = std::min(static_cast<int>(dst->dimension(1)), 4); + + // Create kernels according to the architecture, data type and input size. + GPUTarget arch_target = get_arch_from_target(gpu_target); + if(arch_target == GPUTarget::BIFROST && data_type == DataType::F32) + { + num_elems_processed_per_iteration_x = (src1->dimension(0) <= 1000 && src0->num_dimensions() == 1) ?
2 : 4; + } + + // Configure window + win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + win_out = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + AccessWindowStatic src0_access(src0, 0, 0, src0->dimension(0), src0->dimension(1)); + AccessWindowStatic src1_access(src1, 0, 0, ceil_to_multiple(src1->dimension(0), num_elems_processed_per_iteration_x), src1->dimension(1)); + AccessWindowStatic dst_access(dst, 0, 0, + dst->dimension(0), + dst->dimension(1)); + + if(src2 != nullptr) + { + const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x; + + AccessWindowStatic src2_access(src2, 0, 0, + ceil_to_multiple(src2->dimension(0), bias_processed_per_iteration_x), + src2->dimension(1)); + + window_changed = update_window_and_padding(win, src0_access, src1_access, src2_access) || // window used by the execute_window_loop + update_window_and_padding(win_out, dst_access); // window used to update the padding requirements of dst tensor + } + else + { + window_changed = update_window_and_padding(win, src0_access, src1_access) || // window used by the execute_window_loop + update_window_and_padding(win_out, dst_access); // window used to update the padding requirements of dst tensor + } + } + + // Collapse along the Z direction + // This collapse needs to be here in order to tune the Z dimension of LWS + Window collapsed = win; + const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u); + collapsed = win.collapse(win, dimension_to_collapse); + + Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, collapsed); +} +} // namespace + +void ClGemmMatrixMultiplyKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, + float beta, + bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, bool fp_mixed_precision, const ActivationLayerInfo &activation_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); + + // Perform validate step + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, beta, + is_interleaved_transposed, reshape_info, fp_mixed_precision)); + + auto padding_info = is_interleaved_transposed ? get_padding_info({ src0, src1, dst }) : get_padding_info({ src0, dst }); + + _reinterpret_input_as_3d = reshape_info.reinterpret_input_as_3d(); + _reinterpret_output_as_3d = (reshape_info.depth_output_gemm3d() != 0); + _add_bias = src2 != nullptr; + + // In case both input and dst have to be reinterpreted as 3D tensors, + // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. + if(_reinterpret_input_as_3d == _reinterpret_output_as_3d) + { + _reinterpret_input_as_3d = false; + _reinterpret_output_as_3d = false; + } + + // Check if we need to slide the matrix B + const unsigned int num_dimensions_src0 = _reinterpret_input_as_3d ?
src0->num_dimensions() - 1 : src0->num_dimensions(); + + _slide_matrix_b = (src1->num_dimensions() >= num_dimensions_src0); + + const DataType data_type = src0->data_type(); + + // Get target architecture + GPUTarget gpu_target = get_target(); + + ElementsProcessed num_elements_processed{}; + + // Configure kernel window + auto win_config = validate_and_configure_window(src0, src1, src2, dst, beta, is_interleaved_transposed, reshape_info, + gpu_target, num_elements_processed); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure_internal(win_config.second); + + // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true, both will be turned off (false) + // in which case we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel. + // This means that the actual m used by the kernel is given by dst->dimension(1) + const unsigned int internal_m = _reinterpret_output_as_3d ? dst->dimension(1) * dst->dimension(2) : dst->dimension(1); + const unsigned int n = dst->dimension(0); + + const unsigned int h_gemm_3d = _reinterpret_output_as_3d ? dst->dimension(1) : src0->dimension(1); + const unsigned int d_gemm_3d = _reinterpret_output_as_3d ? dst->dimension(2) : src0->dimension(2); + + const unsigned int m0 = num_elements_processed.y(); + const unsigned int n0 = num_elements_processed.x(); + + // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding. + const unsigned int partial_store_m0 = internal_m % m0; + const unsigned int partial_store_n0 = n % n0; + + // Create build options + CLBuildOptions build_opts; + + build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha)); + build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta)); + build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA"); + build_opts.add_option_if(reshape_info.broadcast_bias(), "-DBROADCAST_BIAS"); + build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); + build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d)); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d)); + build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2))); + build_opts.add_option_if(activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(activation_info.activation()))); + build_opts.add_option_if(activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(activation_info.a())); + build_opts.add_option_if(activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(activation_info.b())); + build_opts.add_option("-DIN1_DIM_X=" + support::cpp11::to_string(src1->dimension(0))); + + const bool is_bifrost = get_arch_from_target(gpu_target) == GPUTarget::BIFROST; + + std::string kernel_name; + if(is_interleaved_transposed) + { + const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width(); + const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height(); + + build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m)); + build_opts.add_option("-DN=" + 
support::cpp11::to_string(n)); + build_opts.add_option("-DK=" + support::cpp11::to_string(src1->dimension(0) / (n0 * mult_transpose1xW_width))); + build_opts.add_option("-DH0=" + support::cpp11::to_string(mult_transpose1xW_width)); + build_opts.add_option("-DV0=" + support::cpp11::to_string(mult_interleave4x4_height)); + build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); + build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); + + if(is_data_type_float(data_type) && is_bifrost) + { + kernel_name = "gemm_mm_interleaved_transposed_" + lower_string(string_from_data_type(data_type)) + "_bifrost"; + } + else + { + kernel_name = "gemm_mm_interleaved_transposed_" + lower_string(string_from_data_type(data_type)); + if(fp_mixed_precision && data_type == DataType::F16) + { + // currently wider accumulator is only supported for fp16 kernels. + kernel_name += "_acc32"; + } + } + } + else // The input tensors have not been reshaped + { + build_opts.add_option("-DN=" + support::cpp11::to_string(n)); + build_opts.add_option("-DK=" + support::cpp11::to_string(src0->dimension(0))); + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); + build_opts.add_option("-DM0=" + support::cpp11::to_string(m0)); + build_opts.add_option("-DN0=" + support::cpp11::to_string(n0)); + build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); + build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); + + // Create kernels according to the architecture, data type and input size. + if(is_data_type_float(data_type) && is_bifrost) + { + kernel_name = "gemm_mm_floating_point"; + + if(src0->num_dimensions() != 1) + { + kernel_name += "_" + lower_string(string_from_data_type(data_type)) + "_bifrost"; + if(fp_mixed_precision && data_type == DataType::F16) + { + // currently wider accumulator is only supported for fp16 kernels. + kernel_name += "_acc32"; + } + } + else if(src1->dimension(0) <= 1000 && data_type == DataType::F32) + { + // The first kernel is optimized for the case of 1000 or less dst elements (e.g. FC8 of AlexNet and VGG-16, and + // FC1 of Inception v3). The second kernel is optimized for the case of greater than 1000 dst elements (e.g. + // FC6 and FC7 of AlexNet and VGG-16). + kernel_name += "_" + lower_string(string_from_data_type(data_type)) + "_bifrost_1000"; + } + + // The work-group size equal to the Bifrost quad size has been proved to be optimal for these kernels + // via exhaustive autotuning over a range of representative layer configurations. + set_lws_hint(cl::NDRange(4)); + } + else // (MIDGARD and F32) or (F16) + { + kernel_name = "gemm_mm_floating_point"; + } + } + // Create kernel + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + + // Set config_id for enabling LWS tuning + _config_id = "gemm_"; + _config_id += (is_interleaved_transposed ? "reshaped_" : ""); + _config_id += (_add_bias ? "add_bias_" : ""); + _config_id += (reshape_info.broadcast_bias() ? "broadcast_bias_" : ""); + _config_id += (fp_mixed_precision ? "fp_mixed_" : ""); + _config_id += (_reinterpret_input_as_3d ? "3di_" : ""); + _config_id += (_reinterpret_output_as_3d ? 
"3do_" : ""); + _config_id += lower_string(string_from_data_type(src0->data_type())); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(1)); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(2)); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(3)); + _config_id += "_"; + _config_id += (is_interleaved_transposed ? support::cpp11::to_string(src1->dimension(0)) : support::cpp11::to_string(src1->dimension(1))); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); +} + +Status ClGemmMatrixMultiplyKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, + bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target, bool fp_mixed_precision, const ActivationLayerInfo &activation_info) +{ + // Note: num_elements_processed will be set in validate_and_configure_window() + ElementsProcessed num_elements_processed{}; + ARM_COMPUTE_UNUSED(alpha); + ARM_COMPUTE_UNUSED(activation_info); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, beta, is_interleaved_transposed, reshape_info, fp_mixed_precision)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), + src1->clone().get(), + (src2 != nullptr) ? src2->clone().get() : nullptr, + dst->clone().get(), + beta, + is_interleaved_transposed, + reshape_info, + gpu_target, + num_elements_processed) + .first); + + return Status{}; +} + +void ClGemmMatrixMultiplyKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const auto src0 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src1 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto src2 = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC_2)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + + ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); + ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr); + + if(src1->info()->num_dimensions() < 3) + { + // The stride_z for matrix B must be zero if we do not slice + ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); + } + + Window slice = window.first_slice_window_3D(); + Window slice_matrix_b = slice; + + slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); + slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); + + const unsigned int num_arguments_bias = _add_bias ? num_arguments_per_2D_tensor() + 1 : 0; + + if(_reinterpret_input_as_3d) + { + // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor + const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + num_arguments_bias; + const unsigned int total_cross_plane_pad = src0->info()->padding().top + src0->info()->padding().bottom; + _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); + } + + if(_reinterpret_output_as_3d) + { + // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor + const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 
1 : 0) + num_arguments_bias; + const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom; + _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad)); + } + + do + { + Window slice_b = slice; + // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 + // This scenario can happen when the matrix multiplication is used to perform a convolution operation + if(!_slide_matrix_b) + { + slice_b = slice_matrix_b; + } + + unsigned int idx = 0; + add_2D_tensor_argument(idx, src0, slice); + add_2D_tensor_argument(idx, src1, slice_b); + if(_add_bias) + { + add_2D_tensor_argument(idx, src2, slice); + } + add_2D_tensor_argument(idx, dst, slice); + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[2])); + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2])); + if(_add_bias) + { + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src2->info()->strides_in_bytes()[2])); + } + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2])); + enqueue(queue, *this, slice, lws_hint()); + } + while(window.slide_window_slice_3D(slice)); +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.h b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.h new file mode 100644 index 0000000000..c1601335ee --- /dev/null +++ b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_KERNEL_H +#define ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/core/gpu/cl/ClCompileContext.h" +#include "src/core/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** OpenCL kernel to multiply two input matrices "A" and "B" and add a matrix "C" if provided. All elements of the output matrix will be multiplied by alpha. In case matrix C is passed, it will be added to the previous result.
+ * For the matrix C, the broadcast addition is supported if the flag "broadcast_bias" is set in the GEMMReshapeInfo object + * + * @note If the input tensors @p src0 and @p src1 have been reshaped respectively with @ref ClGemmReshapeLhsMatrixKernel and @ref ClGemmReshapeRhsMatrixKernel, + * the flag @p is_interleaved_transposed must be set to true + * + * @attention @p src1 tensor must have at least 2 dimensions (matrix) + */ +class ClGemmMatrixMultiplyKernel : public IClKernel +{ +public: + ClGemmMatrixMultiplyKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmMatrixMultiplyKernel); + /** Initialise the kernel's input, output and alpha + * + * @param[in] compile_context The compile context to be used. + * @param[in] src0 Input tensor containing the Matrix A. Data types supported: F16/F32 + * @param[in] src1 Input tensor containing the Matrix B. Data type supported: same as @p src0 + * @param[in] src2 Input tensor containing the Matrix C (bias). Can be nullptr. Data type supported: same as @p src0 + * @param[out] dst Output tensor to store the result of matrix multiplication. Data type supported: same as @p src0 + * @param[in] alpha Weight of the matrix product + * @param[in] beta (Optional) Weight of vector C. Default value is 0. Only beta = 1 is currently supported. + * @param[in] is_interleaved_transposed (Optional) True if input0 and input1 have been reshaped respectively using @ref ClGemmReshapeLhsMatrixKernel and @ref ClGemmReshapeRhsMatrixKernel + * @param[in] reshape_info (Optional) GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how the matrix A and matrix B have been reshaped + * @param[in] fp_mixed_precision (Optional) Use wider accumulators (32 bit instead of 16 for FP16) to improve accuracy + * @param[in] activation_info (Optional) Activation to apply after the matrix multiplication + * + */ + void configure(const ClCompileContext &compile_context, ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta = 0.f, + bool is_interleaved_transposed = true, const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo(), bool fp_mixed_precision = false, const ActivationLayerInfo &activation_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClGemmMatrixMultiplyKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, + bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target, bool fp_mixed_precision = false, const ActivationLayerInfo &activation_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; + +public: + bool _slide_matrix_b{ true }; + bool _reinterpret_input_as_3d{ false }; + bool _reinterpret_output_as_3d{ false }; + bool _add_bias{ false }; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp new file mode 100644 index 0000000000..97d64c433c --- /dev/null +++ b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp @@ -0,0 +1,411 @@ +/* + * Copyright (c) 2019-2021 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/utils/helpers/float_ops.h" +#include "support/Cast.h" +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +using ElementsProcessed = Steps; + +Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info) +{ + ARM_COMPUTE_UNUSED(alpha); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0"); + ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16); + ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr) + && (!gemm_info.broadcast_bias), + "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported 
for GEMM native"); + + const unsigned int m = gemm_info.m; + const unsigned int n = gemm_info.n; + const unsigned int k = gemm_info.k; + + ARM_COMPUTE_UNUSED(m); + ARM_COMPUTE_UNUSED(n); + ARM_COMPUTE_UNUSED(k); + + ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != k); + ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) != n); + ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(1) != k); + if(gemm_info.reinterpret_input_as_3d) + { + ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != m); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) != m); + } + + if(src2 != nullptr && !(helpers::float_ops::is_zero(beta))) + { + const unsigned int src2_dim0 = src2->dimension(0); + const unsigned int src2_dim1 = src2->dimension(1); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src1); + if(gemm_info.broadcast_bias) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted"); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim0 != n || src2_dim1 != m), "Incorrect dimension of bias matrix"); + } + } + + if(dst->total_size() != 0) + { + const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst); + } + + return Status{}; +} + +std::pair validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info, ElementsProcessed &num_elements_processed) +{ + unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; + unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; + bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d; + bool reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0; + + Window win{}; + Window win_out{}; + bool window_changed = false; + + // In case both input and dst have to be reinterpreted as 3D tensors, + // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. 
+ if(reinterpret_input_as_3d == reinterpret_output_as_3d) + { + reinterpret_output_as_3d = false; + } + + // dst tensor auto initialization if not yet initialized + auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); + + TensorInfo tmp_info(*dst); + + if(reinterpret_output_as_3d) + { + // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, + // the window needs to be constructed on the 2D collapsed version of the tensor + TensorShape tmp_shape(dst->tensor_shape()); + tmp_shape.collapse(2U, 1U); + tmp_info.set_tensor_shape(tmp_shape); + } + + // Configure kernel window + num_elems_processed_per_iteration_x = rhs_info.n0; + num_elems_processed_per_iteration_y = lhs_info.m0; + + win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + win_out = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + + AccessWindowStatic src0_access(src0, 0, 0, + src0->dimension(0), + src0->dimension(1)); + AccessWindowStatic src1_access(src1, 0, 0, + ceil_to_multiple(src1->dimension(0), num_elems_processed_per_iteration_x), + src1->dimension(1)); + AccessWindowStatic dst_access(dst, 0, 0, + dst->dimension(0), + dst->dimension(1)); + + if(src2 != nullptr) + { + const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x; + + AccessWindowStatic src2_access(src2, 0, 0, + ceil_to_multiple(src2->dimension(0), bias_processed_per_iteration_x), + src2->dimension(1)); + + window_changed = update_window_and_padding(win, src0_access, src1_access, src2_access) || // window used by the execute_window_loop + update_window_and_padding(win_out, dst_access); // window used to update the padding requirements of dst tensor + } + else + { + window_changed = update_window_and_padding(win, src0_access, src1_access) || // window used by the execute_window_loop + update_window_and_padding(win_out, dst_access); // window used to update the padding requirements of dst tensor + } + + // Collapse along the Z direction + // This collapse needs to be here in order to tune the Z dimension of LWS + Window collapsed = win; + const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u); + collapsed = win.collapse(win, dimension_to_collapse); + + Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, collapsed); +} +} // namespace + +void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, + float beta, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); + + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); + + auto padding_info = get_padding_info({ src0, dst }); + _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d; + _reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0; + _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device()); + _add_bias = src2 != nullptr; + + // In case both input and dst have to be reinterpreted as 3D tensors, + // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
+ if(_reinterpret_input_as_3d == _reinterpret_output_as_3d) + { + _reinterpret_input_as_3d = false; + _reinterpret_output_as_3d = false; + } + + // Check if we need to slide the matrix B + const unsigned int num_dimensions_src0 = src0->num_dimensions(); + _slide_matrix_b = (src1->num_dimensions() >= num_dimensions_src0); + + ElementsProcessed num_elements_processed{}; + + // Configure kernel window + auto win_config = validate_and_configure_window(src0, src1, src2 != nullptr ? src2 : nullptr, dst, lhs_info, rhs_info, gemm_info, num_elements_processed); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + IClKernel::configure_internal(win_config.second); + + // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true, + // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel. + // This means that the actual m used by the kernel is given by dst->dimension(1) and not by gemm_info.m + const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m : dst->dimension(1); + + const unsigned int h_gemm_3d = _reinterpret_output_as_3d ? dst->dimension(1) : src0->dimension(1); + const unsigned int d_gemm_3d = _reinterpret_output_as_3d ? dst->dimension(2) : src0->dimension(2); + + // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding. + const unsigned int partial_store_m0 = internal_m % lhs_info.m0; + const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0; + + // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads. + // NOTE: This might have implications on heuristics and performance + const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0); + + // Create build options + CLBuildOptions build_opts; + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type())); + build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha)); + build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta)); + build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA"); + build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS"); + build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); + build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d)); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d)); + build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2))); + build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); + build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m)); + build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n)); + build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k)); + build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0)); + build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0)); + build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0)); + build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); + build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); + 
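// Note: PARTIAL_STORE_M0/N0 describe the leftover block at the end of each row/column of dst: the kernel stores only the remaining elements of the last block instead of relying on padded dst memory. + // e.g. with internal_m = 33 and lhs_info.m0 = 4, partial_store_m0 = 33 % 4 = 1, so the last block along M stores a single row; a value of 0 means the dimension is an exact multiple of the block size and every store is full-width. +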
build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); + build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); + build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); + + std::string kernel_name("gemm_mm_native"); + + // Create kernel + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + + // Set config_id for enabling LWS tuning + _config_id = kernel_name; + _config_id += "_"; + _config_id += (_add_bias ? "add_bias_" : ""); + _config_id += (gemm_info.broadcast_bias ? "broadcast_bias_" : ""); + _config_id += (_reinterpret_input_as_3d ? "3di_" : ""); + _config_id += (_reinterpret_output_as_3d ? "3do_" : ""); + _config_id += (gemm_info.activation_info.enabled() ? "fused_activation_" : ""); + _config_id += lower_string(string_from_data_type(src0->data_type())); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(1)); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(gemm_info.k); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(2)); + _config_id += "_"; + _config_id += support::cpp11::to_string(lhs_info.m0); + _config_id += "_"; + _config_id += support::cpp11::to_string(rhs_info.n0); + _config_id += "_"; + _config_id += support::cpp11::to_string(rhs_info.k0); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); +} + +Status ClGemmMatrixMultiplyNativeKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) +{ + ElementsProcessed num_elements_processed{}; + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), + src1->clone().get(), + src2 != nullptr ? 
src2->clone().get() : nullptr, + dst->clone().get(), + lhs_info, + rhs_info, + gemm_info, + num_elements_processed) + .first); + + return Status{}; +} + +void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto src2 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + + ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); + ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr); + + if(src1->info()->num_dimensions() < 3) + { + // The stride_z for matrix B must be zero if we do not slice + ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); + } + + Window slice = window.first_slice_window_3D(); + Window slice_matrix_b = slice; + + slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); + slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); + + if(_reinterpret_input_as_3d) + { + // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor + unsigned int idx0; + if(_add_bias) + { + idx0 = 4 * num_arguments_per_2D_tensor() + 4; + } + else + { + idx0 = 3 * num_arguments_per_2D_tensor() + 3; + } + const unsigned int total_cross_plane_pad = src0->info()->padding().top + src0->info()->padding().bottom; + _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad)); + } + + if(_reinterpret_output_as_3d) + { + // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor + unsigned int idx0; + if(_add_bias) + { + idx0 = 4 * num_arguments_per_2D_tensor() + 4 + (_reinterpret_input_as_3d ? 1 : 0); + } + else + { + idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ?
1 : 0); + } + const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom; + _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad)); + } + + do + { + Window slice_b = slice; + // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 + // This scenario can happen when the matrix multiplication is used to perform a convolution operation + if(!_slide_matrix_b) + { + slice_b = slice_matrix_b; + } + + unsigned int idx = 0; + add_2D_tensor_argument(idx, src0, slice); + add_2D_tensor_argument(idx, src1, slice_b); + if(_add_bias) + { + add_2D_tensor_argument(idx, src2, slice); + } + add_2D_tensor_argument(idx, dst, slice); + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[2])); + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2])); + if(_add_bias) + { + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src2->info()->strides_in_bytes()[2])); + } + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2])); + enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); + } + while(window.slide_window_slice_3D(slice)); +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h new file mode 100644 index 0000000000..4770b18b8e --- /dev/null +++ b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_NATIVE_KERNEL_H +#define ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_NATIVE_KERNEL_H + +#include "arm_compute/core/KernelDescriptors.h" +#include "src/core/common/Macros.h" +#include "src/core/gpu/cl/ClCompileContext.h" +#include "src/core/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** OpenCL kernel to multiply matrices when neither of the input matrices has been reshaped */ +class ClGemmMatrixMultiplyNativeKernel : public IClKernel +{ +public: + ClGemmMatrixMultiplyNativeKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmMatrixMultiplyNativeKernel); + /** Initialise the kernel's input and dst. + * + * @param[in] compile_context The compile context to be used. + * @param[in] src0 Input tensor for the LHS matrix. Data type supported: F32.
The number of dimensions for the LHS matrix must be less or equal than 4. + * @param[in] src1 Input tensor for the RHS matrix. Data type supported: same as @p src0. The number of dimensions for the RHS matrix must be less or equal than 3. + * @param[in] src2 Input tensor containing the bias matrix. Data type supported: same as @p src0. + * @param[out] dst dst tensor info. Data type supported: same as @p src0 + * @param[in] alpha Weight of the matrix product + * @param[in] beta Weight of the matrix bias + * @param[in] lhs_info LHS matrix information used to retrieve the number of rows and accumulations to be processed by each thread. Only the following values are supported: + * lhs_info.m0: 1,2,3,4,5,6,7,8 + * lhs_info.k0: 2,3,4,8,16 + * @param[in] rhs_info RHS matrix information used to retrieve the number of columns and accumulations to be processed by each thread. Only the following values are supported: + * rhs_info.n0: 2,3,4,8,16 + * rhs_info.k0: same of lhs_info.k0 + * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices + */ + void configure(const ClCompileContext &compile_context, ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClGemmMatrixMultiplyNativeKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; + +private: + bool _slide_matrix_b{ true }; + bool _reinterpret_input_as_3d{ false }; + bool _reinterpret_output_as_3d{ false }; + bool _use_dummy_work_items{ false }; + bool _add_bias{ false }; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /*ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_NATIVE_KERNEL_H*/ diff --git a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp new file mode 100644 index 0000000000..27409b66ac --- /dev/null +++ b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp @@ -0,0 +1,416 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
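// ---------------------------------------------------------------------------
// Editor's illustration (hedged sketch, not part of this patch): how a caller
// is expected to drive the stateless native kernel declared above. The tensor
// names (lhs/rhs/bias/dst) and the scheduler call are assumptions for the
// example; in the library this wiring is done by the CLGEMM function.
//
//   ClGemmMatrixMultiplyNativeKernel mm;
//   ARM_COMPUTE_ERROR_THROW_ON(ClGemmMatrixMultiplyNativeKernel::validate(
//       lhs.info(), rhs.info(), bias.info(), dst.info(), alpha, beta, lhs_info, rhs_info, gemm_info));
//   mm.configure(CLKernelLibrary::get().get_compile_context(),
//                lhs.info(), rhs.info(), bias.info(), dst.info(), alpha, beta, lhs_info, rhs_info, gemm_info);
//
//   // Tensors are injected at run time rather than captured at configure time:
//   ITensorPack pack;
//   pack.add_const_tensor(TensorType::ACL_SRC_0, &lhs);
//   pack.add_const_tensor(TensorType::ACL_SRC_1, &rhs);
//   pack.add_const_tensor(TensorType::ACL_SRC_2, &bias);
//   pack.add_tensor(TensorType::ACL_DST, &dst);
//   mm.run_op(pack, mm.window(), CLScheduler::get().queue());
// ---------------------------------------------------------------------------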
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CL/CLUtils.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/utils/helpers/float_ops.h" +#include "support/Cast.h" +#include "support/StringSupport.h" + +#include +#include +#include + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +using ElementsProcessed = Steps; + +Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info) +{ + ARM_COMPUTE_UNUSED(alpha); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src0); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); + ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0); + ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.transpose == rhs_info.transpose); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0"); + ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16); + ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((lhs_info.transpose) && ((lhs_info.m0 & (lhs_info.m0 - 1)) && lhs_info.m0 != 3), "Only 2,3,4,8,16 are supported for m0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.transpose) && ((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr) + && (!gemm_info.broadcast_bias), + "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision && (src0->data_type() == DataType::F32), "Mixed precision only supported for F16 data type"); + ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(*src1, rhs_info)); + + const unsigned int m = gemm_info.m; + const unsigned int n = gemm_info.n; + const unsigned int k = gemm_info.k; + + TensorShape tensor_shape0{ src0->tensor_shape() }; + tensor_shape0.set(0, k); + tensor_shape0.set(1, m); + + TensorShape tensor_shape1{ 
src1->tensor_shape() }; + tensor_shape1.set(0, n); + tensor_shape1.set(1, k); + + if(src2 != nullptr && !(helpers::float_ops::is_zero(beta))) + { + const unsigned int src2_dim0 = src2->dimension(0); + const unsigned int src2_dim1 = src2->dimension(1); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src1); + if(gemm_info.broadcast_bias) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted"); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim0 != n || src2_dim1 != m), "Incorrect dimension of bias matrix"); + } + } + + const TensorInfo tensor_info0 = src0->clone()->set_tensor_shape(tensor_shape0); + const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); + + const TensorInfo tensor_info_reshaped0 = src0->clone()->set_tensor_shape(misc::shape_calculator::compute_lhs_reshaped_shape(tensor_info0, lhs_info)); + const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info)); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src0, &tensor_info_reshaped0); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1); + + if(dst->total_size() != 0) + { + const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst); + } + + return Status{}; +} + +std::pair validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info, ElementsProcessed &num_elements_processed) +{ + unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; + unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; + bool reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0; + + Window win{}; + Window win_out{}; + bool window_changed = false; + + // dst tensor auto initialization if not yet initialized + auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); + + TensorInfo tmp_info(*dst); + + if(reinterpret_output_as_3d) + { + // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, + // the window needs to be constructed on the 2D collapsed version of the tensor + TensorShape tmp_shape(dst->tensor_shape()); + tmp_shape.collapse(2U, 1U); + tmp_info.set_tensor_shape(tmp_shape); + } + + // Configure kernel window + num_elems_processed_per_iteration_x = rhs_info.n0; + num_elems_processed_per_iteration_y = lhs_info.m0; + + win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + win_out = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + + if(src2 != nullptr) + { + const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x; + + const int bias_processed_per_iteration_y = gemm_info.broadcast_bias ? 
1 : num_elems_processed_per_iteration_y; + + AccessWindowStatic src2_access(src2, 0, 0, + ceil_to_multiple(src2->dimension(0), bias_processed_per_iteration_x), + ceil_to_multiple(src2->dimension(1), bias_processed_per_iteration_y)); + + window_changed = update_window_and_padding(win, src2_access); // window used by the execute_window_loop + } + + // Collapse along the Z direction + // This collapse needs to be here in order to tune the Z dimension of LWS + Window collapsed = win; + const unsigned int dimension_to_collapse = std::min(static_cast(dst->num_dimensions()), 2u); + collapsed = win.collapse(win, dimension_to_collapse); + + Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, collapsed); +} +} // namespace + +void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); + + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); + + auto padding_info = get_padding_info({ src0, dst }); + _reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0; + _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device()); + _add_bias = src2 != nullptr; + _export_to_cl_image = rhs_info.export_to_cl_image; + _k = gemm_info.k; + + // Check if we need to slide the matrix B + const unsigned int num_dimensions_src0 = src0->num_dimensions(); + _slide_matrix_b = (src1->num_dimensions() >= num_dimensions_src0); + + ElementsProcessed num_elements_processed{}; + + // Configure kernel window + auto win_config = validate_and_configure_window(src0, src1, src2, dst, lhs_info, rhs_info, gemm_info, num_elements_processed); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure_internal(win_config.second); + + const bool enable_mixed_precision = gemm_info.fp_mixed_precision; + const DataType data_type = src0->data_type(); + + // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding. + const unsigned int internal_m = _reinterpret_output_as_3d ? 
gemm_info.m : dst->dimension(1); + + const unsigned int partial_store_m0 = internal_m % lhs_info.m0; + const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0; + + // Create build options + CLBuildOptions build_opts; + build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha)); + build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta)); + build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA"); + build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); + build_opts.add_option_if(_reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1))); + build_opts.add_option_if(_reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2))); + build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS"); + build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2))); + build_opts.add_option_if(lhs_info.interleave, "-DLHS_INTERLEAVE"); + build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE"); + build_opts.add_option_if(lhs_info.transpose, "-DLHS_TRANSPOSE"); + build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); + build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); + build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); + build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); + build_opts.add_option_if(enable_mixed_precision, "-DMIXED_PRECISION"); + build_opts.add_option_if(rhs_info.export_to_cl_image, "-DOPENCL_IMAGE_SUPPORT"); + build_opts.add_option("-DRHS_HEIGHT=" + support::cpp11::to_string(src1->dimension(1))); + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); + build_opts.add_option("-DDATA_TYPE_ACCUMULATOR=" + (enable_mixed_precision ? get_cl_type_from_data_type(DataType::F32) : get_cl_type_from_data_type(data_type))); + build_opts.add_option("-DM=" + support::cpp11::to_string(gemm_info.m)); + build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n)); + build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k)); + build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0)); + build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0)); + build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0)); + build_opts.add_option("-DV0=" + support::cpp11::to_string(lhs_info.v0)); + build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0)); + build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); + build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); + + std::string kernel_name("gemm_mm_reshaped_"); + kernel_name += lhs_info.transpose ? "lhs_t_" : "lhs_nt_"; + kernel_name += rhs_info.transpose ? "rhs_t" : "rhs_nt"; + kernel_name += rhs_info.export_to_cl_image ? "_texture" : ""; + + // Create kernel + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + + // Set config_id for enabling LWS tuning + _config_id = kernel_name; + _config_id += "_"; + _config_id += (_add_bias ? 
"add_bias_" : ""); + _config_id += (gemm_info.broadcast_bias ? "broadcast_bias_" : ""); + _config_id += (_reinterpret_output_as_3d ? "3do_" : ""); + _config_id += (gemm_info.activation_info.enabled() ? "fused_activation_" : ""); + _config_id += lower_string(string_from_data_type(src0->data_type())); + _config_id += "_"; + _config_id += (enable_mixed_precision ? "mixed_precision_" : ""); + _config_id += support::cpp11::to_string(dst->dimension(1)); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(gemm_info.k); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(2)); + _config_id += "_"; + _config_id += support::cpp11::to_string(lhs_info.m0); + _config_id += "_"; + _config_id += support::cpp11::to_string(rhs_info.n0); + _config_id += "_"; + _config_id += support::cpp11::to_string(lhs_info.k0); + _config_id += "_"; + _config_id += support::cpp11::to_string(lhs_info.v0); + _config_id += "_"; + _config_id += support::cpp11::to_string(rhs_info.h0); + _config_id += "_"; + _config_id += support::cpp11::to_string(lhs_info.interleave); + _config_id += "_"; + _config_id += support::cpp11::to_string(rhs_info.interleave); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); +} + +Status ClGemmMatrixMultiplyReshapedKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) +{ + ElementsProcessed num_elements_processed{}; + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), + src1->clone().get(), + src2 != nullptr ? 
src2->clone().get() : nullptr, + dst->clone().get(), + lhs_info, + rhs_info, + gemm_info, + num_elements_processed) + .first); + + return Status{}; +} + +void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto src2 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + + ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); + ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr); + + if(src1->info()->num_dimensions() < 3) + { + // The stride_z for matrix B must be zero if we do not slice + ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); + } + + Window slice = window.first_slice_window_3D(); + Window slice_matrix_b = slice; + + slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); + slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); + + const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom; + + cl::Image2D src1_image2d; + + if(_export_to_cl_image) + { + const TensorShape shape2d(src1->info()->dimension(0) / 4, src1->info()->dimension(1) * src1->info()->dimension(2)); + const size_t image_row_pitch = src1->info()->strides_in_bytes()[1]; + + src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d, src1->info()->data_type(), image_row_pitch); + } + + do + { + Window slice_b = slice; + // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 + // This scenario can happen when the matrix multiplication is used to perform a convolution operation + if(!_slide_matrix_b) + { + slice_b = slice_matrix_b; + } + + unsigned int idx = 0; + + // LHS buffer + add_2D_tensor_argument(idx, src0, slice); + + // RHS buffer or RHS OpenCL image (_export_to_cl_image == true) + if(_export_to_cl_image) + { + _kernel.setArg(idx++, src1_image2d); + } + else + { + add_2D_tensor_argument(idx, src1, slice_b); + } + + // Bias buffer (_add_bias == true) + add_2D_tensor_argument_if(_add_bias, idx, src2, slice); + + // dst buffer + add_2D_tensor_argument(idx, dst, slice); + + // K dimension (not used if _export_to_cl_image == true) + _kernel.setArg<cl_int>(idx++, static_cast<cl_int>(_k)); + + // LHS stride_z + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[2])); + + // RHS stride_z (not used if _export_to_cl_image == true) + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2])); + + // Bias stride_z (if _add_bias == true) + if(_add_bias) + { + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src2->info()->strides_in_bytes()[2])); + } + + // dst stride_z + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2])); + + // Cross-plane padding (if _reinterpret_output_as_3d = true) + if(_reinterpret_output_as_3d) + { + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad)); + } + + // Dispatch kernel + enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); + } + while(window.slide_window_slice_3D(slice)); +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute \ No newline at end of file diff 
--git a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h new file mode 100644 index 0000000000..ab648f15ae --- /dev/null +++ b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_KERNEL_H +#define ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/core/gpu/cl/ClCompileContext.h" +#include "src/core/gpu/cl/IClKernel.h" + +#include "arm_compute/core/KernelDescriptors.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** OpenCL kernel to multiply matrices when both the input matrices LHS (src0) and RHS (src1) have been reshaped + * + * @note The input matrices @p src0 and @p src1 must be reshaped through: + * - @ref ClGemmReshapeLhsMatrixKernel + * - @ref ClGemmReshapeRhsMatrixKernel + */ +class ClGemmMatrixMultiplyReshapedKernel : public IClKernel +{ +public: + ClGemmMatrixMultiplyReshapedKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmMatrixMultiplyReshapedKernel); + /** Initialise the kernel's input and output. + * + * @note The F16 computation also supports mixed precision through the gemm_info.fp_mixed_precision flag. + * Mixed precision combines different floating precisions during the computation, in particular, F32 for the accumulations and F16 for the + * multiplications. i.e. float c = (half)a * (half)b + * + * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will fetch the RHS data using the OpenCL read_image built-in function. + * Reading from the OpenCL image object can increase the performance. However, since the OpenCL image object is created importing the OpenCL buffer, + * the following conditions are required: + * -# rhs_info.n0 can only be 4, 8 and 16 + * -# rhs_info.k0 can only be 4, 8 and 16 + * -# Data type can only be F32 + * -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension + * -# The stride Y for the src1 should satisfy the OpenCL pitch alignment requirement + * -# src1 width should be less or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4) + * -# src1 (height * depth) should be less or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT + * + * @param[in] compile_context The compile context to be used. 
+ * @param[in] src0 Input tensor containing the LHS reshaped matrix. Data type supported: F16/F32 (only F32 if rhs_info.export_to_cl_image = true). The number of dimensions for the LHS matrix must be less or equal than 4 + * @param[in] src1 Input tensor containing the RHS reshaped matrix. Data type supported: same as @p src0. The number of dimensions for the RHS matrix must be less or equal than 3 + * @param[in] src2 Input tensor containing the bias matrix. Data type supported: same as @p src0. + * @param[out] dst dst tensor to store the result of matrix multiplication. Data type supported: same as @p src0 + * @param[in] alpha Weight of the matrix product + * @param[in] beta Weight of the matrix bias + * @param[in] lhs_info LHS matrix information used for reshaping the src0 tensor. Only the following values are supported: + * lhs_info.m0: 2,3,4,5,6,7,8 + * lhs_info.k0: 2,3,4,8,16 + * lhs_info.transpose: false + * @param[in] rhs_info RHS matrix information used for reshaping the src1 tensor. Only the following values are supported: + * rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true) + * rhs_info.k0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true) + * rhs_info.transpose: true + * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices + * + * @note lhs_info.k0 must be equal to rhs_info.k0 + */ + void configure(const ClCompileContext &compile_context, + ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClGemmMatrixMultiplyReshapedKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; + +private: + bool _slide_matrix_b{ true }; + bool _reinterpret_output_as_3d{ false }; + bool _use_dummy_work_items{ false }; + bool _add_bias{ false }; + bool _export_to_cl_image{ false }; + unsigned int _k{ 1 }; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_KERNEL_H */ \ No newline at end of file diff --git a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp new file mode 100644 index 0000000000..4eea2c6f76 --- /dev/null +++ b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp @@ -0,0 +1,438 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. 
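// ---------------------------------------------------------------------------
// Editor's illustration (hedged sketch, not part of this patch): the kernel
// above consumes an LHS/RHS pair that has already been laid out by the two
// reshape kernels. A rough sketch of the expected sequence, with the tensor
// names and the compile context `ctx` assumed for the example:
//
//   ClGemmReshapeLhsMatrixKernel       reshape_lhs;
//   ClGemmReshapeRhsMatrixKernel       reshape_rhs;
//   ClGemmMatrixMultiplyReshapedKernel mm;
//   reshape_lhs.configure(ctx, lhs.info(), lhs_reshaped.info(), lhs_info, gemm_info.reinterpret_input_as_3d);
//   reshape_rhs.configure(ctx, rhs.info(), rhs_reshaped.info(), rhs_info);
//   mm.configure(ctx, lhs_reshaped.info(), rhs_reshaped.info(), bias.info(), dst.info(),
//                alpha, beta, lhs_info, rhs_info, gemm_info);
//
//   // At run time each kernel receives its tensors through an ITensorPack;
//   // lhs_reshaped/rhs_reshaped are typically auxiliary workspace buffers
//   // owned by the calling function rather than user-visible tensors.
// ---------------------------------------------------------------------------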
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h" + +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CL/CLUtils.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/utils/helpers/float_ops.h" +#include "support/Cast.h" +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +using ElementsProcessed = Steps; + +Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) +{ + ARM_COMPUTE_UNUSED(alpha); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src0); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_info.m0 < 1 || lhs_info.m0 > 8, "Only 1,2,3,4,5,6,7,8 are supported for m0"); + ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16 || rhs_info.k0 < 2); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0"); + ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 > 16 || rhs_info.n0 < 2); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr) + && (!gemm_info.broadcast_bias), + "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D"); + 
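// Editor's note (illustrative): "(x & (x - 1))" is non-zero unless x is a power of
// two, so together with the range guards the checks above accept exactly
// {2,3,4,8,16}. For example, k0 = 6 gives 6 & 5 = 4 (non-zero) and 6 != 3, hence
// rejected, while k0 = 8 gives 8 & 7 = 0, hence accepted. As a standalone sketch:
//
//   bool is_supported_k0(unsigned int x) // power of two in [2,16], or 3
//   {
//       return (x == 3) || ((x >= 2) && (x <= 16) && ((x & (x - 1)) == 0));
//   }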
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported"); + ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(*src1, rhs_info)); + + const unsigned int m = gemm_info.m; + const unsigned int n = gemm_info.n; + const unsigned int k = gemm_info.k; + + TensorShape tensor_shape1{ src1->tensor_shape() }; + tensor_shape1.set(0, n); + tensor_shape1.set(1, k); + + if(src2 != nullptr && !(helpers::float_ops::is_zero(beta))) + { + const unsigned int src2_dim0 = src2->dimension(0); + const unsigned int src2_dim1 = src2->dimension(1); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src0); + if(gemm_info.broadcast_bias) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted"); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim0 != n || src2_dim1 != m), "Incorrect dimension of bias matrix"); + } + } + + const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); + + const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info)); + + ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != k); + if(gemm_info.reinterpret_input_as_3d) + { + ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != m); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) != m); + } + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1); + + if(dst->total_size() != 0) + { + const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst); + } + + return Status{}; +} + +std::pair validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info, ElementsProcessed &num_elements_processed) +{ + unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; + unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; + bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d; + bool reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0; + + Window win{}; + Window win_out{}; + bool window_changed = false; + + // In case both input and dst have to be reinterpreted as 3D tensors, + // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. 
+ // This approach should only be used when the input/dst tensors have pad on the y direction + if((reinterpret_input_as_3d == reinterpret_output_as_3d) && gemm_info.has_pad_y) + { + reinterpret_output_as_3d = false; + } + + // dst tensor auto initialization if not yet initialized + auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); + + TensorInfo tmp_info(*dst); + + if(reinterpret_output_as_3d) + { + // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, + // the window needs to be constructed on the 2D collapsed version of the tensor + TensorShape tmp_shape(dst->tensor_shape()); + tmp_shape.collapse(2U, 1U); + tmp_info.set_tensor_shape(tmp_shape); + } + + // Configure kernel window + num_elems_processed_per_iteration_x = rhs_info.n0; + num_elems_processed_per_iteration_y = lhs_info.m0; + + win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + win_out = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + + if(src2 != nullptr) + { + const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x; + + AccessWindowStatic src2_access(src2, 0, 0, + ceil_to_multiple(src2->dimension(0), bias_processed_per_iteration_x), + src2->dimension(1)); + + window_changed = update_window_and_padding(win, src2_access); + } + + // Collapse along the Z direction + // This collapse needs to be here in order to tune the Z dimension of LWS + Window collapsed = win; + const unsigned int dimension_to_collapse = std::min(static_cast(dst->num_dimensions()), 2u); + collapsed = win.collapse(win, dimension_to_collapse); + + Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, collapsed); +} +} // namespace + +void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); + + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); + + _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d; + _reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0; + _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device()); + _add_bias = src2 != nullptr; + _export_to_cl_image = rhs_info.export_to_cl_image; + _has_pad_y = gemm_info.has_pad_y; + + auto padding_info = get_padding_info({ src0, src1, dst }); + + // In case both input and dst have to be reinterpreted as 3D tensors, + // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. 
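// (Worked example, editor's illustration: the guard below only changes behaviour
//  when _reinterpret_input_as_3d == _reinterpret_output_as_3d == true and
//  _has_pad_y == true, in which case the GEMM is dispatched as a plain batched 2D
//  kernel; when both flags are already false the two assignments are no-ops.)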
+ if((_reinterpret_input_as_3d == _reinterpret_output_as_3d) && _has_pad_y) + { + _reinterpret_input_as_3d = false; + _reinterpret_output_as_3d = false; + } + + // Check if we need to slide the matrix B + const unsigned int num_dimensions_src0 = src0->num_dimensions(); + _slide_matrix_b = (src1->num_dimensions() >= num_dimensions_src0); + + ElementsProcessed num_elements_processed{}; + + // Configure kernel window + auto win_config = validate_and_configure_window(src0, src1, src2, dst, lhs_info, rhs_info, gemm_info, num_elements_processed); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure_internal(win_config.second); + + // If _reinterpret_input_as_3d = reinterpret_output_as_3d = true, + // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel. + // This means that the actual m used by the kernel is given by dst->dimension(1) and not by gemm_info.m + const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m : dst->dimension(1); + + // These variables are used only if gemm_info.has_pad_y == true + const unsigned int h_gemm_3d = _reinterpret_output_as_3d ? dst->dimension(1) : src0->dimension(1); + const unsigned int d_gemm_3d = _reinterpret_output_as_3d ? dst->dimension(2) : src0->dimension(2); + + // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads. + // NOTE: This might have implications on heuristics and performance + const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0); + + // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding. + const unsigned int partial_store_m0 = internal_m % internal_m0; + const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0; + + // Create build options + CLBuildOptions build_opts; + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type())); + build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha)); + build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta)); + build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA"); + build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS"); + build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2))); + build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE"); + build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); + build_opts.add_option_if(rhs_info.export_to_cl_image, "-DOPENCL_IMAGE_SUPPORT"); + build_opts.add_option("-DRHS_HEIGHT=" + support::cpp11::to_string(src1->dimension(1))); + build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m)); + build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n)); + build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k)); + build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0)); + build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0)); + build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0)); + build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0)); + build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); + build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); + build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + 
lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); + build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); + build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); + if(_has_pad_y) + { + build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); + build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d)); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d)); + } + + std::string kernel_name("gemm_mm_reshaped_only_rhs_"); + kernel_name += rhs_info.transpose ? "t" : "nt"; + kernel_name += rhs_info.export_to_cl_image ? "_texture" : ""; + + // Create kernel + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + + // Set config_id for enabling LWS tuning + _config_id = kernel_name; + _config_id += "_"; + _config_id += (_has_pad_y ? "" : "no_pad_y_"); + _config_id += (_add_bias ? "add_bias_" : ""); + _config_id += (gemm_info.broadcast_bias ? "broadcast_bias_" : ""); + _config_id += (_reinterpret_input_as_3d ? "3di_" : ""); + _config_id += (_reinterpret_output_as_3d ? "3do_" : ""); + _config_id += (gemm_info.activation_info.enabled() ? "fused_activation_" : ""); + _config_id += lower_string(string_from_data_type(src0->data_type())); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(1)); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(gemm_info.k); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(2)); + _config_id += "_"; + _config_id += support::cpp11::to_string(lhs_info.m0); + _config_id += "_"; + _config_id += support::cpp11::to_string(rhs_info.n0); + _config_id += "_"; + _config_id += support::cpp11::to_string(rhs_info.k0); + _config_id += "_"; + _config_id += support::cpp11::to_string(rhs_info.h0); + _config_id += "_"; + _config_id += support::cpp11::to_string(rhs_info.interleave); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); +} + +Status ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) +{ + ElementsProcessed num_elements_processed{}; + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), + src1->clone().get(), + src2 != nullptr ? 
src2->clone().get() : nullptr, + dst->clone().get(), + lhs_info, + rhs_info, + gemm_info, + num_elements_processed) + .first); + + return Status{}; +} + +void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto src2 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + + ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); + ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr); + + if(src1->info()->num_dimensions() < 3) + { + // The stride_z for matrix B must be zero if we do not slice + ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); + } + + const size_t lhs_idx_batch_size = _reinterpret_input_as_3d && !_has_pad_y ? 3u : 2u; + const size_t rhs_idx_batch_size = 2u; + const size_t bia_idx_batch_size = 2u; + const size_t out_idx_batch_size = _reinterpret_output_as_3d && !_has_pad_y ? 3u : 2u; + + Window slice = window.first_slice_window_3D(); + Window slice_matrix_b = slice; + + slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); + slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); + + // Get cross-plane pads + const unsigned int total_cross_plane_pad_lhs = src0->info()->padding().top + src0->info()->padding().bottom; + const unsigned int total_cross_plane_pad_out = dst->info()->padding().top + dst->info()->padding().bottom; + + // The execution should fail if we try to run with has_pad_y = false but we have padding in either the LHS or DST tensor + ARM_COMPUTE_ERROR_ON(!_has_pad_y && ((total_cross_plane_pad_lhs != 0) || (total_cross_plane_pad_out != 0))); + + cl::Image2D src1_image2d; + + if(_export_to_cl_image) + { + const TensorShape shape2d(src1->info()->dimension(0) / 4, src1->info()->dimension(1) * src1->info()->dimension(2)); + const size_t image_row_pitch = src1->info()->strides_in_bytes()[1]; + + src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d, src1->info()->data_type(), image_row_pitch); + } + + do + { + Window slice_b = slice; + // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 + // This scenario can happen when the matrix multiplication is used to perform a convolution operation + if(!_slide_matrix_b) + { + slice_b = slice_matrix_b; + } + + unsigned int idx = 0; + + // LHS buffer + add_2D_tensor_argument(idx, src0, slice); + + // RHS buffer or RHS OpenCL image (_export_to_cl_image == true) + if(_export_to_cl_image) + { + _kernel.setArg(idx++, src1_image2d); + } + else + { + add_2D_tensor_argument(idx, src1, slice_b); + } + + // Bias buffer (_add_bias == true) + add_2D_tensor_argument_if(_add_bias, idx, src2, slice); + + // dst buffer + add_2D_tensor_argument(idx, dst, slice); + + // LHS stride_z + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[lhs_idx_batch_size])); + + // RHS stride_z (not used if _export_to_cl_image == true) + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[rhs_idx_batch_size])); + + // Bias stride_z (if _add_bias == true) + if(_add_bias) + { + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src2->info()->strides_in_bytes()[bia_idx_batch_size])); + } + + // dst stride_z + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[out_idx_batch_size])); + + // Cross-plane padding (if _reinterpret_input_as_3d = true) + if(_reinterpret_input_as_3d && _has_pad_y) + { + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad_lhs)); + } + + // Cross-plane padding (if _reinterpret_output_as_3d = true) + if(_reinterpret_output_as_3d && _has_pad_y) + { + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad_out)); + } + + enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); + } + while(window.slide_window_slice_3D(slice)); +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h new file mode 100644 index 0000000000..ff6c391e15 --- /dev/null +++ b/src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H +#define ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/core/gpu/cl/ClCompileContext.h" +#include "src/core/gpu/cl/IClKernel.h" + +#include "arm_compute/core/KernelDescriptors.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** OpenCL kernel to multiply matrices when only the input matrix RHS (src1) has been reshaped + * + * @note The input matrix src1 must be reshaped through @ref ClGemmReshapeRhsMatrixKernel + */ +class ClGemmMatrixMultiplyReshapedOnlyRhsKernel : public IClKernel +{ +public: + ClGemmMatrixMultiplyReshapedOnlyRhsKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmMatrixMultiplyReshapedOnlyRhsKernel); + /** Initialise the kernel's input and output. + * + * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will fetch the RHS data using the OpenCL read_image built-in function. + * Reading from the OpenCL image object can increase the performance. 
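 *       (Editor's illustration: with F32 data each RGBA texel packs four consecutive
 *       floats, so a W x H x D reshaped RHS is bound as a (W / 4) x (H * D) image;
 *       this matches the shape2d computation in run_op above.)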
However, since the OpenCL image object is created importing the OpenCL buffer, + * the following conditions are required: + * -# rhs_info.n0 can only be 4, 8 and 16 + * -# rhs_info.k0 can only be 4, 8 and 16 + * -# Data type can only be F32 + * -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension + * -# The stride Y for the src1 should satisfy the OpenCL pitch alignment requirement + * -# src1 width should be less or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4) + * -# src1 (height * depth) should be less or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT + * + * @param[in] compile_context The compile context to be used. + * @param[in] src0 Input tensor containing the LHS matrix. Data type supported: F16/F32 (only F32 if rhs_info.export_to_cl_image = true). + * The number of dimensions for the LHS matrix must be less or equal than 4. + * @param[in] src1 Input tensor containing the RHS reshaped matrix. Data type supported: same as @p src0. The number of dimensions for the RHS matrix must be less or equal than 3. + * @param[in] src2 Input tensor containing the bias matrix. Data type supported: same as @p src0. + * @param[out] dst Output tensor to store the result of matrix multiplication. Data type supported: same as @p src0 + * @param[in] alpha Weight of the matrix product + * @param[in] beta Weight of the matrix bias + * @param[in] lhs_info LHS matrix information used to retrieve the number of rows to be processed by each thread. Only the following values are supported: + * lhs_info.m0: 1,2,3,4,5,6,7,8 + * @param[in] rhs_info RHS matrix information used for reshaping the src1 tensor. Only the following values are supported: + * rhs_info.k0: 2,3,4,8,16 + * rhs_info.n0: 2,3,4,8,16 + * rhs_info.transpose: true,false + * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices + */ + void configure(const ClCompileContext &compile_context, + ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; + +private: + bool _slide_matrix_b{ true }; + bool _reinterpret_input_as_3d{ false }; + bool _reinterpret_output_as_3d{ false }; + bool _use_dummy_work_items{ false }; + bool _add_bias{ false }; + bool _export_to_cl_image{ false }; + bool _has_pad_y{ false }; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H */ diff --git a/src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp b/src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp new file mode 100644 index 0000000000..98161edfff --- /dev/null +++ b/src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp @@ -0,0 +1,219 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. 
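// ---------------------------------------------------------------------------
// Editor's illustration (hedged sketch, not part of this patch): the
// export_to_cl_image path used by the GEMM kernels above. Assumes an F32 RHS
// tensor `rhs` of shape W x H x D that already satisfies the pitch-alignment
// and size constraints listed in the header:
//
//   const TensorShape shape2d(rhs->info()->dimension(0) / 4, // four F32 values per RGBA texel
//                             rhs->info()->dimension(1) * rhs->info()->dimension(2));
//   const size_t row_pitch = rhs->info()->strides_in_bytes()[1];
//   cl::Image2D img = create_image2d_from_buffer(CLKernelLibrary::get().context(),
//                                                rhs->cl_buffer(), shape2d,
//                                                rhs->info()->data_type(), row_pitch);
//   // The image then replaces the plain buffer kernel argument.
// ---------------------------------------------------------------------------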
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/Cast.h" +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 == 0); + ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 == 0); + ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.v0 == 0); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0"); + ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16); + ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8); + + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); + + if(dst->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), + misc::shape_calculator::compute_lhs_reshaped_shape(*src, lhs_info, reinterpret_input_as_3d)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); + } + + return Status{}; +} + +std::pair validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d) +{ + const unsigned int num_elems_processed_per_iteration_x = lhs_info.k0; + const unsigned int num_elems_processed_per_iteration_y = lhs_info.m0; + bool window_changed = false; + + TensorInfo tmp_info(*src); + + if(reinterpret_input_as_3d) + { + // Since the src tensor has to be reinterpreted as 3D and the execute window is based on a 2D interleave, + // the window needs to be constructed on the 2D collapsed version of the tensor + TensorShape 
tmp_shape(src->tensor_shape()); + tmp_shape.collapse(2U, 1U); + tmp_info.set_tensor_shape(tmp_shape); + } + + // dst auto inizialitation if not yet initialized + auto_init_if_empty(*dst, src->clone()->set_tensor_shape(misc::shape_calculator::compute_lhs_reshaped_shape(*src, lhs_info, reinterpret_input_as_3d))); + + // Configure window + Window win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + Window win_in = calculate_max_window(*src, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + + AccessWindowStatic src_access(src, 0, 0, + src->dimension(0), + src->dimension(1)); + AccessWindowStatic dst_access(dst, 0, 0, dst->dimension(0), dst->dimension(1)); + + window_changed = update_window_and_padding(win_in, src_access) || // window used by the execute_window_loop + update_window_and_padding(win, dst_access); // window used to update the padding requirements of dst tensor + + // Collapse along the Z direction + // This collapse needs to be here in order to tune the Z dimension of LWS + Window collapsed = win.collapse(win, Window::DimZ); + + Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, collapsed); +} +} // namespace + +void ClGemmReshapeLhsMatrixKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + + // Perform validate step + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, lhs_info, reinterpret_input_as_3d)); + + auto padding_info = get_padding_info({ src }); + + _reinterpret_input_as_3d = reinterpret_input_as_3d; + + const unsigned int src_w = src->dimension(0); + const unsigned int src_h = _reinterpret_input_as_3d ? src->dimension(1) * src->dimension(2) : src->dimension(1); + const unsigned int partial_load_m0 = src_h % lhs_info.m0; + const unsigned int partial_load_k0 = src_w % lhs_info.k0; + + // Create build options + CLBuildOptions build_opts; + build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0)); + build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0)); + build_opts.add_option("-DV0=" + support::cpp11::to_string(lhs_info.v0)); + build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src_w)); + build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src_h)); + build_opts.add_option_if(lhs_info.interleave, "-DINTERLEAVE"); + build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); + build_opts.add_option_if(_reinterpret_input_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(src->dimension(1))); + build_opts.add_option_if(_reinterpret_input_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(src->dimension(2))); + build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size())); + build_opts.add_option("-DPARTIAL_LOAD_M0=" + support::cpp11::to_string(partial_load_m0)); + build_opts.add_option("-DPARTIAL_LOAD_K0=" + support::cpp11::to_string(partial_load_k0)); + + std::string kernel_name("gemm_reshape_lhs_matrix_"); + kernel_name += lhs_info.transpose ? 
"t" : "nt"; + + // Create kernel + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + + // Configure kernel window + auto win_config = validate_and_configure_window(src, dst, lhs_info, reinterpret_input_as_3d); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure_internal(win_config.second); + + // Set config_id for enabling LWS tuning + _config_id = "gemm_reshape_lhs_matrix_"; + _config_id += (_reinterpret_input_as_3d ? "3d_" : ""); + _config_id += lower_string(string_from_data_type(src->data_type())); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(1)); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(2)); + _config_id += "_"; + _config_id += support::cpp11::to_string(lhs_info.m0); + _config_id += "_"; + _config_id += support::cpp11::to_string(lhs_info.k0); + _config_id += "_"; + _config_id += support::cpp11::to_string(lhs_info.v0); + _config_id += "_"; + _config_id += support::cpp11::to_string(lhs_info.interleave); + _config_id += "_"; + _config_id += support::cpp11::to_string(lhs_info.transpose); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); +} + +Status ClGemmReshapeLhsMatrixKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, lhs_info, reinterpret_input_as_3d)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), lhs_info, reinterpret_input_as_3d).first); + + return Status{}; +} + +void ClGemmReshapeLhsMatrixKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + + Window slice = window.first_slice_window_3D(); + + if(_reinterpret_input_as_3d) + { + // Pass bottom paddings to the kernel if the src has to be reinterpreted as 3D tensor + const unsigned int idx0 = 2 * num_arguments_per_3D_tensor(); + const unsigned int total_cross_plane_pad = src->info()->padding().top + src->info()->padding().bottom; + _kernel.setArg(idx0, static_cast(total_cross_plane_pad)); + } + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, src, slice); + add_3D_tensor_argument(idx, dst, slice); + enqueue(queue, *this, slice, lws_hint()); + } + while(window.slide_window_slice_3D(slice)); +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h b/src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h new file mode 100644 index 0000000000..b830ba02b4 --- /dev/null +++ b/src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_GEMM_RESHAPE_LHS_MATRIX_KERNEL_H +#define ARM_COMPUTE_CL_GEMM_RESHAPE_LHS_MATRIX_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/core/gpu/cl/ClCompileContext.h" +#include "src/core/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** OpenCL kernel to reshape the LHS matrix when performing the matrix multiplication. + * In particular, this function splits the src matrix in blocks of size M0xK0 (defined through GEMMLHSInfo) and + * stores each one in the dst matrix unrolling the values + */ +class ClGemmReshapeLhsMatrixKernel : public ICLKernel +{ +public: + ClGemmReshapeLhsMatrixKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmReshapeLhsMatrixKernel); + /** Initialise the kernel's input and output. + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Input tensor. Data types supported: All + * @param[out] dst Output tensor. Data type supported: same as @p src + * @param[in] lhs_info LHS matrix information to be used for reshaping. This object contains all the necessary + * information to reshape the src tensor. 
Only the following values are supported: + * lhs_info.m0: 2,3,4,5,6,7,8 + * lhs_info.k0: 2,3,4,8,16 + * lhs_info.v0: greater than 0 + * lhs_info.transpose: true, false + * lhs_info.interleave: true, false + * @param[in] reinterpret_src_as_3d (Optional) True if the src has to be reinterpreted as 3D tensor + */ + void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_src_as_3d = false); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClGemmReshapeLhsMatrixKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_src_as_3d); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; + +private: + bool _reinterpret_input_as_3d{ false }; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_GEMM_RESHAPE_LHS_MATRIX_KERNEL_H */ \ No newline at end of file diff --git a/src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp b/src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp new file mode 100644 index 0000000000..e1ef7c61aa --- /dev/null +++ b/src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/Cast.h" +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 == 0); + ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 == 0); + ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.h0 == 0); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && (rhs_info.k0 != 1) && (rhs_info.k0 != 3)), "Only 1,2,3,4,8,16 are supported for k0"); + ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 > 16); + ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16); + ARM_COMPUTE_RETURN_ERROR_ON((rhs_info.k0 == 1) && (rhs_info.transpose)); + + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); + + if(rhs_info.export_to_cl_image) + { + const TensorInfo tensor_reshaped_info(misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info), 1, src->data_type()); + ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info)); + } + + if(dst->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); + } + + return Status{}; +} + +std::pair validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info) +{ + const unsigned int num_elems_processed_per_iteration_x = rhs_info.n0; + const unsigned int num_elems_processed_per_iteration_y = rhs_info.k0; + bool window_changed = false; + + // dst auto initialization if not yet initialized + auto_init_if_empty(*dst, src->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info))); + + // Configure window + Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + + AccessWindowRectangle src_access(src, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); + + window_changed = update_window_and_padding(win, src_access); + + if(rhs_info.export_to_cl_image) + { + gemm::update_padding_for_cl_image(dst); + } + + // Collapse along the Z direction + // This collapse needs to be here in order to tune the Z dimension of LWS + Window collapsed = win.collapse(win, Window::DimZ); + + Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, collapsed); +} +} // namespace + +void ClGemmReshapeRhsMatrixKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + + // Perform validate step + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, rhs_info)); + + // Create build options + CLBuildOptions build_opts; + build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0)); + build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0)); + build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0)); + build_opts.add_option_if(rhs_info.transpose, "-DTRANSPOSE"); + build_opts.add_option_if(rhs_info.interleave, "-DINTERLEAVE"); + build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(1))); + build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size())); + + std::string kernel_name("gemm_reshape_rhs_matrix_"); + kernel_name += rhs_info.transpose ? "t" : "nt"; + + // Create kernel + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + + // Configure kernel window + auto win_config = validate_and_configure_window(src, dst, rhs_info); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure_internal(win_config.second); +} + +Status ClGemmReshapeRhsMatrixKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, rhs_info)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), rhs_info).first); + + return Status{}; +} + +void ClGemmReshapeRhsMatrixKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const auto src = utils::cast::polymorphic_downcast(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast(tensors.get_tensor(TensorType::ACL_DST)); + + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + + Window slice = window.first_slice_window_3D(); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, src, slice); + add_3D_tensor_argument(idx, dst, slice); + enqueue(queue, *this, slice, lws_hint()); + } + while(window.slide_window_slice_3D(slice)); +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute \ No newline at end of file diff --git a/src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h b/src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h new file mode 100644 index 0000000000..e877d87408 --- /dev/null +++ b/src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_GEMM_RESHAPE_RHS_MATRIX_KERNEL_H +#define ARM_COMPUTE_CL_GEMM_RESHAPE_RHS_MATRIX_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/core/gpu/cl/ClCompileContext.h" +#include "src/core/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** OpenCL kernel to reshape the RHS matrix when performing the matrix multiplication + * In particular, this kernel splits the src matrix in blocks of size K0xN0 and stores each one in + * the dst matrix unrolling the values */ +class ClGemmReshapeRhsMatrixKernel : public ICLKernel +{ +public: + ClGemmReshapeRhsMatrixKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmReshapeRhsMatrixKernel); + /** Initialise the kernel's input and output. + * + * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will guarantee the OpenCL pitch alignment for the output tensor, + * required to create a OpenCL image object from buffer in @ref ClGemmMatrixMultiplyReshapedKernel and in @ref ClGemmMatrixMultiplyReshapedOnlyRhsKernel + * Since the OpenCL image object is created importing the OpenCL buffer, the following conditions are required: + * -# rhs_info.n0 can only be 4, 8 and 16 + * -# rhs_info.k0 can only be 4, 8 and 16 + * -# Data type can only be F32, F16 + * -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension + * -# output width should be less or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4) + * -# output (height * depth) should be less or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT + * -# The output tensor should be only consumed by @ref ClGemmMatrixMultiplyReshapedKernel or @ref ClGemmMatrixMultiplyReshapedOnlyRhsKernel + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Input tensor. Data types supported: All + * @param[out] dst Output tensor. Data type supported: same as @p src + * @param[in] rhs_info RHS matrix information to be used for reshaping. This object contains all the necessary + * information to reshape the src tensor. 
Only the following values are supported: + * rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image == true) + * rhs_info.k0: 1,2,3,4,8,16 (k0 = 1 only if rhs_info.transpose = false), (only 4, 8 and 16 if rhs_info.export_to_cl_image == true) + * rhs_info.h0: greater than 0 + * rhs_info.transpose: true, false + * rhs_info.interleave: true, false + */ + void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClGemmReshapeRhsMatrixKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_GEMM_RESHAPE_RHS_MATRIX_KERNEL_H */ \ No newline at end of file diff --git a/src/core/gpu/cl/kernels/gemm/ClGemmHelpers.cpp b/src/core/gpu/cl/kernels/gemm/ClGemmHelpers.cpp new file mode 100644 index 0000000000..0a8ba971ed --- /dev/null +++ b/src/core/gpu/cl/kernels/gemm/ClGemmHelpers.cpp @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" + +#include <utility> + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace gemm +{ +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_lhs_rhs_info(unsigned int m, unsigned int n, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0, + bool lhs_interleave, bool rhs_interleave, bool lhs_transpose, bool rhs_transpose, bool export_to_cl_image) +{ + ARM_COMPUTE_ERROR_ON(m0 == 0 || n0 == 0); + v0 = std::max(std::min(static_cast<int>(m / m0), static_cast<int>(v0)), static_cast<int>(1)); + h0 = std::max(std::min(static_cast<int>(n / n0), static_cast<int>(h0)), static_cast<int>(1)); + + const GEMMLHSMatrixInfo lhs_info(m0, k0, v0, lhs_transpose, lhs_interleave); + const GEMMRHSMatrixInfo rhs_info(n0, k0, h0, rhs_transpose, rhs_interleave, export_to_cl_image); + + return std::make_pair(lhs_info, rhs_info); +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> select_lhs_rhs_info(std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_img, + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_buf, + unsigned int n, unsigned int k, unsigned int b, DataType data_type) +{ + const TensorInfo tensor_rhs_info(TensorShape(n, k, b), 1, data_type); + const TensorShape shape = misc::shape_calculator::compute_rhs_reshaped_shape(tensor_rhs_info, info_img.second); + const TensorInfo tensor_reshaped_info(shape, 1, data_type); + + if(bool(validate_image2d_support_on_rhs(tensor_reshaped_info, info_img.second))) + { + return info_img; + } + else + { + return info_buf; + } +} + +void update_padding_for_cl_image(ITensorInfo *tensor) +{ + constexpr unsigned int num_floats_per_pixel = 4; + + const unsigned int stride_y_in_elements = tensor->strides_in_bytes()[1] / tensor->element_size(); + const unsigned int pixel_alignment = get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()); + + ARM_COMPUTE_ERROR_ON_MSG(pixel_alignment == 0, "Cannot retrieve cl_image pitch alignment"); + if(pixel_alignment == 0) + { + return; + } + + const unsigned int row_pitch_alignment = pixel_alignment * num_floats_per_pixel; + const unsigned int round_up_width = ((stride_y_in_elements + row_pitch_alignment - 1) / row_pitch_alignment) * row_pitch_alignment; + const unsigned int padding = round_up_width - stride_y_in_elements; + + tensor->extend_padding(PaddingSize(0, padding, 0, 0)); +} + +Status validate_image2d_support_on_rhs(const ITensorInfo &tensor_reshaped_info, const GEMMRHSMatrixInfo &rhs_info) +{ + if(rhs_info.export_to_cl_image) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.n0 == 2) || (rhs_info.n0 == 3), "Export to cl_image only supported with n0 = 4, 8 or 16"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.k0 == 2) || (rhs_info.k0 == 3), "Export to cl_image only supported with k0 = 4, 8 or 16"); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(&tensor_reshaped_info, DataType::F32, DataType::F16); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!image2d_from_buffer_supported(CLKernelLibrary::get().get_device()), "The extension cl_khr_image2d_from_buffer is not supported on the target platform"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0, "Impossible to retrieve the cl_image pitch alignment"); + + // Check the width and height of the output tensor. + // Since we cannot create a 3d image from a buffer, the third dimension is collapsed on the second dimension + const size_t max_image_w = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>(); + const size_t max_image_h = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>(); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(tensor_reshaped_info.tensor_shape()[0] > max_image_w * 4, "Not supported width for cl_image"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(tensor_reshaped_info.tensor_shape()[1] * tensor_reshaped_info.tensor_shape()[2] > max_image_h, "Not supported height for cl_image"); + } + + return Status{}; +} +} // namespace gemm +} // namespace kernels +} // namespace opencl +} // namespace arm_compute
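The row-pitch rounding in update_padding_for_cl_image above is worth seeing in isolation: the Y stride, expressed in elements, is rounded up to the next multiple of the device pitch alignment times four floats per pixel. A minimal standalone sketch of that arithmetic follows; the alignment and stride values are hypothetical examples, whereas the library queries the alignment from the OpenCL device at run time.

#include <cstdio>

// Standalone sketch of the cl_image row-pitch padding computation.
int main()
{
    constexpr unsigned int num_floats_per_pixel = 4;    // a cl_image pixel holds a float4
    const unsigned int     pixel_alignment      = 16;   // hypothetical device pitch alignment, in pixels
    const unsigned int     stride_y_in_elements = 1000; // example unpadded row stride, in floats

    // Round the row stride up to the next multiple of the row pitch alignment
    const unsigned int row_pitch_alignment = pixel_alignment * num_floats_per_pixel; // 64 floats
    const unsigned int round_up_width      = ((stride_y_in_elements + row_pitch_alignment - 1) / row_pitch_alignment) * row_pitch_alignment;
    const unsigned int padding             = round_up_width - stride_y_in_elements;

    std::printf("padded stride = %u floats, right padding = %u floats\n", round_up_width, padding); // 1024, 24
    return 0;
}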
diff --git a/src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h b/src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h new file mode 100644 index 0000000000..3fce8c9173 --- /dev/null +++ b/src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_GEMM_HELPERS_H +#define ARM_COMPUTE_CL_GEMM_HELPERS_H + +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace gemm +{ +/** Configure @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo + * + * @param[in] m Number of rows (M) in the LHS matrix not reshaped + * @param[in] n Number of columns (N) in the RHS matrix not reshaped + * @param[in] m0 Number of rows processed by each thread/work-item + * @param[in] n0 Number of columns processed by each thread/work-item + * @param[in] k0 Number of inner accumulations performed by each thread/work-item + * @param[in] v0 Number of vertical blocks of size (m0xk0) stored on the same output row + * @param[in] h0 Number of horizontal blocks of size (k0xn0) stored on the same output row + * @param[in] lhs_interleave True if the v0 (m0xk0) blocks have to be interleaved in the output row + * @param[in] rhs_interleave True if the h0 (k0xn0) blocks have to be interleaved in the output row + * @param[in] lhs_transpose True if the (m0xk0) block has to be transposed before being stored + * @param[in] rhs_transpose True if the (k0xn0) block has to be transposed before being stored + * @param[in] export_to_cl_image (Optional) True if the RHS reshaped matrix has to be exported to cl_image + * + * @return @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo + */ +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_lhs_rhs_info(unsigned int m, unsigned int n, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0, + bool lhs_interleave, bool rhs_interleave, bool lhs_transpose, bool rhs_transpose, bool export_to_cl_image = false); + +/** Select @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo + * + * This function accepts two pairs of GEMMLHSMatrixInfo/GEMMRHSMatrixInfo where only the first is with cl_image2d support, + * and selects the valid one validating the GEMMRHSMatrixInfo. If the validation passes, the function will return + * the first GEMMLHSMatrixInfo/GEMMRHSMatrixInfo pair with cl_image2d support. + * + * @param[in] info_img GEMMLHSMatrixInfo/GEMMRHSMatrixInfo with cl_image2d support + * @param[in] info_buf GEMMLHSMatrixInfo/GEMMRHSMatrixInfo to fall back to if cl_image2d cannot be used + * @param[in] n Number of columns (N) in the RHS matrix not reshaped + * @param[in] k Number of rows (K) in the RHS matrix not reshaped + * @param[in] b Batch size + * @param[in] data_type Data type + * + * @return @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo + */ +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> select_lhs_rhs_info(std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_img, + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_buf, + unsigned int n, unsigned int k, unsigned int b, DataType data_type); + +/** Update padding required to export the OpenCL buffer to OpenCL image2d + * + * @param[in,out] tensor ITensorInfo of the tensor required to be exported to OpenCL image2d + */ +void update_padding_for_cl_image(ITensorInfo *tensor); + +/** Utility function to validate the image2d OpenCL object support on the RHS reshaped matrix + * + * @param[in] tensor_reshaped_info TensorInfo for the RHS reshaped matrix + * @param[in] rhs_info @ref GEMMRHSMatrixInfo + * + * @return Status reporting if we can use the image2d OpenCL object on the RHS reshaped matrix + */ +Status validate_image2d_support_on_rhs(const ITensorInfo &tensor_reshaped_info, const GEMMRHSMatrixInfo &rhs_info); +} // namespace gemm +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_GEMM_HELPERS_H */ diff --git a/src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h b/src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h new file mode 100644 index 0000000000..a49836cfda --- /dev/null +++ b/src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ */ +#ifndef ARM_COMPUTE_ICL_GEMM_KERNEL_CONFIG_H +#define ARM_COMPUTE_ICL_GEMM_KERNEL_CONFIG_H + +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/Types.h" +#include "src/core/common/Macros.h" + +#include <array> + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace gemm +{ +/** Basic container for the OpenCL GEMM configuration functions */ +template <class T> +class CLGEMMConfigArray +{ +public: + /** Alias for F32 index */ + static constexpr size_t DT_F32 = 0; + /** Alias for F16 index */ + static constexpr size_t DT_F16 = 1; + /** Alias for Int8 index */ + static constexpr size_t DT_INT8 = 2; + + /** Constructor + * + * @param[in] func_f32 Function to call for GEMM F32 + * @param[in] func_f16 Function to call for GEMM F16 + * @param[in] func_int8 Function to call for GEMM Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL) + * + */ + CLGEMMConfigArray(T func_f32, T func_f16, T func_int8) + : _configs{ func_f32, func_f16, func_int8 } + { + } + + /** Method to return the GEMM configuration function based on data type + * + * @param[in] data_type Input data type + * + * @return the valid function otherwise it returns nullptr if the data type is not valid + */ + T get_function(DataType data_type) + { + switch(data_type) + { + case DataType::F32: + return _configs.at(DT_F32); + case DataType::F16: + return _configs.at(DT_F16); + case DataType::QASYMM8: + case DataType::QASYMM8_SIGNED: + case DataType::QSYMM8_PER_CHANNEL: + return _configs.at(DT_INT8); + default: + return nullptr; + } + } + +private: + std::array<T, 3> _configs; +}; + +/** Basic interface for the GEMM kernel configuration */ +class IClGemmKernelConfig +{ +public: + /** Constructor + * + * @param[in] arch GPU target + */ + IClGemmKernelConfig(GPUTarget arch) + : _target(arch) + { + } + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClGemmKernelConfig); + /** Virtual destructor */ + virtual ~IClGemmKernelConfig() = default; + /** Given M, N, K and B, this method returns the @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo to be used + * + * @param[in] m Number of rows LHS matrix + * @param[in] n Number of columns RHS matrix + * @param[in] k Number of columns LHS matrix or number of rows RHS matrix + * @param[in] b Batch size + * @param[in] data_type Data type + */ + virtual std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) = 0; + +protected: + GPUTarget _target; +}; +} // namespace gemm +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_ICL_GEMM_KERNEL_CONFIG_H */
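CLGEMMConfigArray above is simply a fixed three-slot lookup table keyed by data type; the per-architecture heuristics store pointers to their member functions in it and invoke whichever entry matches the input type. The following is a minimal self-contained sketch of the same dispatch pattern, with made-up names and free functions standing in for the member-function pointers used by the real heuristics.

#include <array>
#include <cstdio>

// Simplified stand-in for the library's DataType enum (illustration only).
enum class DataType { F32, F16, QASYMM8 };

using ConfigFn = void (*)(); // stand-in for the GEMM configuration function type

void config_f32()  { std::puts("F32 heuristic"); }
void config_f16()  { std::puts("F16 heuristic"); }
void config_int8() { std::puts("Int8 heuristic"); }

// Three-slot table with the same F32/F16/Int8 layout as CLGEMMConfigArray.
ConfigFn get_function(const std::array<ConfigFn, 3> &configs, DataType dt)
{
    switch(dt)
    {
        case DataType::F32:     return configs[0];
        case DataType::F16:     return configs[1];
        case DataType::QASYMM8: return configs[2];
        default:                return nullptr;
    }
}

int main()
{
    const std::array<ConfigFn, 3> table{ config_f32, config_f16, config_int8 };
    if(auto fn = get_function(table, DataType::F16))
    {
        fn(); // prints "F16 heuristic"
    }
    return 0;
}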
diff --git a/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp b/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp new file mode 100644 index 0000000000..9d11006703 --- /dev/null +++ b/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp @@ -0,0 +1,246 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/GPUTarget.h" +#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h" + +#include <utility> + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace gemm +{ +ClGemmDefaultConfigNativeBifrost::ClGemmDefaultConfigNativeBifrost(GPUTarget gpu) + : IClGemmKernelConfig(gpu) +{ +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) +{ + using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (ClGemmDefaultConfigNativeBifrost::*)(unsigned int m, unsigned int n, unsigned int k, + unsigned int b); + + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G71(&ClGemmDefaultConfigNativeBifrost::configure_G71_f32, + &ClGemmDefaultConfigNativeBifrost::configure_G71_f32, // We use the F32 heuristic + &ClGemmDefaultConfigNativeBifrost::configure_G71_u8); + + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G76(&ClGemmDefaultConfigNativeBifrost::configure_G76_f32, + &ClGemmDefaultConfigNativeBifrost::configure_G76_f32, // We use the F32 heuristic + &ClGemmDefaultConfigNativeBifrost::configure_G76_u8); + + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x(&ClGemmDefaultConfigNativeBifrost::configure_default_f32, + &ClGemmDefaultConfigNativeBifrost::configure_default_f32, // We use the F32 heuristic + &ClGemmDefaultConfigNativeBifrost::configure_default_u8); + + ConfigurationFunctionExecutorPtr func = nullptr; + + switch(_target) + { + case GPUTarget::G76: + func = configs_G76.get_function(data_type); + break; + case GPUTarget::G71: + func = configs_G71.get_function(data_type); + break; + default: + func = configs_G7x.get_function(data_type); + break; + } + + ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not support for GEMM"); + return (this->*func)(m, n, k, b); +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure_G71_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + if(m == 1) + { + if(n < 2048) + { + return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false); + } + else if(n >= 2048 && n < 8192) + { + return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 1, false, false, false, false); + } + else + { + return configure_lhs_rhs_info(m, n, 1, 8, 4, 1, 1, false,
false, false, false); + } + } + else + { + return configure_lhs_rhs_info(m, n, 5, 4, 2, 1, 1, false, false, false, false); + } +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure_G71_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + if(dot8_supported(CLKernelLibrary::get().get_device())) + { + if(m == 1) + { + if(n < 2048) + { + return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 1, false, false, false, false); + } + else if(n >= 2048 && n < 16384) + { + return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false); + } + else + { + return configure_lhs_rhs_info(m, n, 1, 8, 16, 1, 1, false, false, false, false); + } + } + else + { + if(m < 64) + { + return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, 1, false, false, false, false); + } + else + { + return configure_lhs_rhs_info(m, n, 5, 2, 16, 1, 1, false, false, false, false); + } + } + } + else + { + if(m == 1) + { + if(n < 8192) + { + return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false); + } + else + { + return configure_lhs_rhs_info(m, n, 1, 8, 16, 1, 1, false, false, false, false); + } + } + else + { + return configure_lhs_rhs_info(m, n, 2, 8, 16, 1, 1, false, false, false, false); + } + } +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + if(m == 1) + { + if(n > 4196) + { + return configure_lhs_rhs_info(m, n, 1, 4, 2, 1, 1, false, false, false, false); + } + else + { + if(k < 2048) + { + return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 1, false, false, false, false); + } + else if(k >= 2048 && k < 16384) + { + return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false); + } + else + { + return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 1, false, false, false, false); + } + } + } + else + { + return configure_lhs_rhs_info(m, n, 2, 8, 2, 1, 1, false, false, false, false); + } +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + if(m == 1) + { + if(n < 2048) + { + return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 1, false, false, false, false); + } + else if(n >= 2048 && n < 16384) + { + return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false); + } + else + { + return configure_lhs_rhs_info(m, n, 1, 8, 16, 1, 1, false, false, false, false); + } + } + else + { + if(m < 64) + { + return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, 1, false, false, false, false); + } + else + { + return configure_lhs_rhs_info(m, n, 5, 2, 16, 1, 1, false, false, false, false); + } + } +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure_default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + return configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 1, false, false, false, false); +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure_default_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + return configure_lhs_rhs_info(m, n, 5, 2, 16, 1, 1, false, false, false, false); +} +} // namespace gemm +} // namespace kernels +} // namespace opencl +} // namespace arm_compute \ No newline at end of file
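To make the branch structure above concrete, here is a hedged usage sketch of the Bifrost native heuristic; the GPU target and GEMM shape are illustrative values only, not taken from any real call site.

// Hedged usage sketch of ClGemmDefaultConfigNativeBifrost (example shapes only).
#include "src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h"

using namespace arm_compute;
using namespace arm_compute::opencl::kernels::gemm;

void example()
{
    ClGemmDefaultConfigNativeBifrost config(GPUTarget::G76);

    // m == 1 and k < 2048, so configure_G76_f32 above picks
    // configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 1, false, false, false, false):
    const auto info = config.configure(1 /*m*/, 1024 /*n*/, 1024 /*k*/, 1 /*b*/, DataType::F32);

    const GEMMLHSMatrixInfo lhs = info.first;  // m0 = 1, k0 = 2, v0 = 1
    const GEMMRHSMatrixInfo rhs = info.second; // n0 = 2, k0 = 2, h0 = 1
}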
diff --git a/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h b/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h new file mode 100644 index 0000000000..385b96e40e --- /dev/null +++ b/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_BIFROST_H +#define ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_BIFROST_H + +#include "src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace gemm +{ +/** Bifrost based OpenCL GEMMNative configuration */ +class ClGemmDefaultConfigNativeBifrost final : public IClGemmKernelConfig +{ +public: + /** Constructor + * + * @param[in] gpu GPU target + */ + ClGemmDefaultConfigNativeBifrost(GPUTarget gpu); + + // Inherited overridden method + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; + +private: + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G71_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G71_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_default_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); +}; +} // namespace gemm +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_BIFROST_H */ diff --git a/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp b/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp new file mode 100644 index 0000000000..e3c129e3be --- /dev/null +++ b/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2020-2021 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/GPUTarget.h" +#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h" + +#include + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace gemm +{ +ClGemmDefaultConfigNativeMidgard::ClGemmDefaultConfigNativeMidgard(GPUTarget gpu) + : IClGemmKernelConfig(gpu) +{ +} + +std::pair ClGemmDefaultConfigNativeMidgard::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) +{ + using ConfigurationFunctionExecutorPtr = std::pair (ClGemmDefaultConfigNativeMidgard::*)(unsigned int m, unsigned int n, unsigned int k, + unsigned int b); + + CLGEMMConfigArray configs_default(nullptr, + nullptr, + &ClGemmDefaultConfigNativeMidgard::default_q8); + + auto func = configs_default.get_function(data_type); + ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not support for GEMM"); + return (this->*func)(m, n, k, b); +} + +std::pair ClGemmDefaultConfigNativeMidgard::default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + const unsigned int m0 = std::min(m, static_cast(4)); + const unsigned int n0 = std::min(n, static_cast(4)); + + return configure_lhs_rhs_info(m, n, m0, n0, 2, 1, 1, false, false, false, false); +} +} // namespace gemm +} // namespace kernels +} // namespace opencl +} // namespace arm_compute \ No newline at end of file diff --git a/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h b/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h new file mode 100644 index 0000000000..0ff5471f7c --- /dev/null +++ b/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_MIDGARD_H +#define ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_MIDGARD_H + +#include "src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace gemm +{ +/** Midgard based OpenCL GEMMNative configuration */ +class ClGemmDefaultConfigNativeMidgard final : public IClGemmKernelConfig +{ +public: + /** Constructor + * + * @param[in] gpu GPU target + */ + ClGemmDefaultConfigNativeMidgard(GPUTarget gpu); + + // Inherited overridden method + std::pair configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; + +private: + std::pair default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); +}; +} // namespace gemm +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_MIDGARD_H */ diff --git a/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp b/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp new file mode 100644 index 0000000000..92767aca52 --- /dev/null +++ b/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp @@ -0,0 +1,168 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/GPUTarget.h" +#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h" + +#include + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace gemm +{ +ClGemmDefaultConfigNativeValhall::ClGemmDefaultConfigNativeValhall(GPUTarget gpu) + : IClGemmKernelConfig(gpu) +{ +} + +std::pair ClGemmDefaultConfigNativeValhall::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) +{ + using ConfigurationFunctionExecutorPtr = std::pair (ClGemmDefaultConfigNativeValhall::*)(unsigned int m, unsigned int n, unsigned int k, + unsigned int b); + + CLGEMMConfigArray configs_default(&ClGemmDefaultConfigNativeValhall::configure_G77_f32, + &ClGemmDefaultConfigNativeValhall::configure_G77_f16, + &ClGemmDefaultConfigNativeValhall::configure_G77_u8); + + auto func = configs_default.get_function(data_type); + ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not support for GEMM"); + return (this->*func)(m, n, k, b); +} + +std::pair ClGemmDefaultConfigNativeValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + if(m == 1) + { + if(n < 2048) + { + return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false); + } + else if(n >= 2048 && n < 8192) + { + return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 1, false, false, false, false); + } + else + { + return configure_lhs_rhs_info(m, n, 1, 8, 4, 1, 1, false, false, false, false); + } + } + else + { + return configure_lhs_rhs_info(m, n, 5, 4, 2, 1, 1, false, false, false, false); + } +} + +std::pair ClGemmDefaultConfigNativeValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + if(m == 1) + { + if(n < 2048) + { + return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false); + } + else if(n >= 2048 && n < 8192) + { + return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 1, false, false, false, false); + } + else + { + return configure_lhs_rhs_info(m, n, 1, 8, 4, 1, 1, false, false, false, false); + } + } + else + { + return configure_lhs_rhs_info(m, n, 4, 8, 2, 1, 1, false, false, false, false); + } +} + +std::pair ClGemmDefaultConfigNativeValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + if(dot8_supported(CLKernelLibrary::get().get_device())) + { + if(m == 1) + { + if(n < 2048) + { + return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 1, false, false, false, false); + } + else if(n >= 2048 && n < 16384) + { + return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false); + } + else + { + return configure_lhs_rhs_info(m, n, 1, 8, 16, 1, 1, false, false, false, false); + } + } + else + { + if(m < 64) + { + return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, 1, false, false, false, false); + } + else + { + return configure_lhs_rhs_info(m, n, 5, 2, 16, 1, 1, false, false, false, false); + } + } + } + else + { + if(m == 
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    if(dot8_supported(CLKernelLibrary::get().get_device()))
+    {
+        if(m == 1)
+        {
+            if(n < 2048)
+            {
+                return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 1, false, false, false, false);
+            }
+            else if(n >= 2048 && n < 16384)
+            {
+                return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false);
+            }
+            else
+            {
+                return configure_lhs_rhs_info(m, n, 1, 8, 16, 1, 1, false, false, false, false);
+            }
+        }
+        else
+        {
+            if(m < 64)
+            {
+                return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, 1, false, false, false, false);
+            }
+            else
+            {
+                return configure_lhs_rhs_info(m, n, 5, 2, 16, 1, 1, false, false, false, false);
+            }
+        }
+    }
+    else
+    {
+        if(m == 1)
+        {
+            if(n < 8192)
+            {
+                return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false);
+            }
+            else
+            {
+                return configure_lhs_rhs_info(m, n, 1, 8, 16, 1, 1, false, false, false, false);
+            }
+        }
+        else
+        {
+            return configure_lhs_rhs_info(m, n, 2, 8, 16, 1, 1, false, false, false, false);
+        }
+    }
+}
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h b/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h
new file mode 100644
index 0000000000..17e4c9d339
--- /dev/null
+++ b/src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_VALHALL_H
+#define ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_VALHALL_H
+
+#include "src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+/** Valhall based OpenCL GEMMNative configuration */
+class ClGemmDefaultConfigNativeValhall final : public IClGemmKernelConfig
+{
+public:
+    /** Constructor
+     *
+     * @param[in] gpu GPU target
+     */
+    ClGemmDefaultConfigNativeValhall(GPUTarget gpu);
+
+    // Inherited overridden method
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
+
+private:
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+};
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_VALHALL_H */
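A minimal usage sketch for the configuration classes above (illustrative only; the sizes, target, and variable names are hypothetical): a caller obtains the per-architecture heuristic through the factory defined in the next file and queries it for a given workload shape.

    // Select the native-GEMM heuristic for the device architecture, then ask it
    // for the LHS/RHS block sizes to use for an m x n x k workload with batch b.
    using namespace arm_compute;
    auto config = opencl::kernels::gemm::ClGemmNativeKernelConfigurationFactory::create(GPUTarget::G77);
    GEMMLHSMatrixInfo lhs_info;
    GEMMRHSMatrixInfo rhs_info;
    std::tie(lhs_info, rhs_info) = config->configure(64U, 512U, 128U, 1U, DataType::F32);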
diff --git a/src/core/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h b/src/core/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h
new file mode 100644
index 0000000000..ff6a0128af
--- /dev/null
+++ b/src/core/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMM_NATIVE_KERNEL_CONFIGURATION_H
+#define ARM_COMPUTE_CL_GEMM_NATIVE_KERNEL_CONFIGURATION_H
+
+#include "src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
+#include "src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h"
+#include "src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h"
+#include "src/core/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+/** CLGEMMNative factory class */
+class ClGemmNativeKernelConfigurationFactory final
+{
+public:
+    /** Static method to construct a CLGEMMNative kernel configuration object according to the GPU target
+     *
+     * @param[in] gpu GPU target
+     *
+     * @return CLGEMMNative kernel configuration class
+     */
+    static std::unique_ptr<IClGemmKernelConfig> create(GPUTarget gpu)
+    {
+        switch(get_arch_from_target(gpu))
+        {
+            case GPUTarget::MIDGARD:
+                return std::make_unique<ClGemmDefaultConfigNativeMidgard>(gpu);
+            case GPUTarget::BIFROST:
+                return std::make_unique<ClGemmDefaultConfigNativeBifrost>(gpu);
+            case GPUTarget::VALHALL:
+                return std::make_unique<ClGemmDefaultConfigNativeValhall>(gpu);
+            default:
+                ARM_COMPUTE_ERROR("Not supported GPU target");
+        }
+    }
+};
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CL_GEMM_NATIVE_KERNEL_CONFIGURATION_H */
diff --git a/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp b/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp
new file mode 100644
index 0000000000..b030913a87
--- /dev/null
+++ b/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp
@@ -0,0 +1,356 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/TensorShape.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h" + +#include + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace gemm +{ +using namespace arm_compute::misc::shape_calculator; + +ClGemmDefaultConfigReshapedBifrost::ClGemmDefaultConfigReshapedBifrost(GPUTarget gpu) + : IClGemmKernelConfig(gpu) +{ +} + +std::pair ClGemmDefaultConfigReshapedBifrost::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) +{ + using ConfigurationFunctionExecutorPtr = std::pair (ClGemmDefaultConfigReshapedBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + + CLGEMMConfigArray configs_G7x(&ClGemmDefaultConfigReshapedBifrost::configure_G7x_f32, + &ClGemmDefaultConfigReshapedBifrost::configure_G7x_f16, + &ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8); + + CLGEMMConfigArray configs_G52(&ClGemmDefaultConfigReshapedBifrost::configure_G52_f32, + &ClGemmDefaultConfigReshapedBifrost::configure_G52_f16, + &ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8); + + CLGEMMConfigArray configs_G76(&ClGemmDefaultConfigReshapedBifrost::configure_G76_f32, + &ClGemmDefaultConfigReshapedBifrost::configure_G76_f16, + &ClGemmDefaultConfigReshapedBifrost::configure_G76_u8); + + ConfigurationFunctionExecutorPtr func = nullptr; + + switch(_target) + { + case GPUTarget::G76: + func = configs_G76.get_function(data_type); + break; + case GPUTarget::G52: + func = configs_G52.get_function(data_type); + break; + default: + func = configs_G7x.get_function(data_type); + break; + } + + ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not support for GEMM"); + return (this->*func)(m, n, k, b); +} + +std::pair ClGemmDefaultConfigReshapedBifrost::configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + if(n <= 4) + { + return configure_lhs_rhs_info(m, n, 4, 2, 8, 16, 16, true, false, false, true); + } + else + { + return configure_lhs_rhs_info(m, n, 5, 4, 4, 2, 16, false, true, false, true); + } +} + +std::pair ClGemmDefaultConfigReshapedBifrost::configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + if(n <= 4) + { + return configure_lhs_rhs_info(m, n, 4, 2, 8, 8, 2, true, true, true, false); + } + else + { + return configure_lhs_rhs_info(m, n, 4, 8, 4, 4, 2, true, true, true, false); + } +} + +std::pair ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + 
if(dot8_supported(CLKernelLibrary::get().get_device())) + { + if(n <= 4) + { + return configure_lhs_rhs_info(m, n, 4, 2, 16, 2, 2, true, false, false, true); + } + else + { + return configure_lhs_rhs_info(m, n, 4, 4, 16, 2, 2, true, false, false, true); + } + } + else + { + if(n <= 4) + { + return configure_lhs_rhs_info(m, n, 4, 2, 8, 2, 2, true, false, false, true); + } + else + { + return configure_lhs_rhs_info(m, n, 6, 4, 4, 2, 2, true, true, false, true); + } + } +} + +std::pair ClGemmDefaultConfigReshapedBifrost::configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + const float r_mn = static_cast(m) / static_cast(n); + const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; + const float r_mk = static_cast(m) / static_cast(k); + const float r_nk = static_cast(n) / static_cast(k); + + GEMMLHSMatrixInfo lhs_info_buf; + GEMMRHSMatrixInfo rhs_info_buf; + GEMMLHSMatrixInfo lhs_info_img; + GEMMRHSMatrixInfo rhs_info_img; + + if(workload <= 274.4000f) + { + if(r_nk <= 0.7461f) + { + if(r_mn <= 21.1667f) + { + return configure_lhs_rhs_info(m, n, 4, 2, 4, 4, 4, false, true, true, false, false); + } + else + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F32); + } + } + else + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F32); + } + } + else + { + if(r_mk <= 17.3926f) + { + if(workload <= 542.4000f) + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F32); + } + else + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, false); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F32); + } + } + else + { + if(r_nk <= 0.5463f) + { + if(workload <= 11767.6001f) + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F32); + } + else + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, false); + + return 
select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F32); + } + } + else + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F32); + } + } + } +} + +std::pair ClGemmDefaultConfigReshapedBifrost::configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + + const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; + + if(workload <= 323.4000f) + { + return configure_lhs_rhs_info(m, n, 2, 2, 8, 4, 8, false, false, false, true, false); + } + else + { + return configure_lhs_rhs_info(m, n, 4, 8, 4, 2, 2, true, true, true, false, false); + } +} + +std::pair ClGemmDefaultConfigReshapedBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + GEMMLHSMatrixInfo lhs_info_buf; + GEMMRHSMatrixInfo rhs_info_buf; + GEMMLHSMatrixInfo lhs_info_img; + GEMMRHSMatrixInfo rhs_info_img; + + // Get lhs_info/rhs_info in case of OpenCL buffer + if(n <= 4) + { + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 2, 8, 16, 16, true, false, false, true); + } + else + { + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 2, 8, 16, false, false, false, true); + } + + // Get lhs_info/rhs_info in case of OpenCL image + // Condition on the GPU workload + if((m / 4) * (n / 4) >= 2560) + { + // Big workload + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 8, true, true, true, false, true); + } + else + { + // Small workload + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 1, true, true, true, false, true); + } + + const TensorInfo tensor_rhs_info(TensorShape(n, k, b), 1, DataType::F32); + const TensorShape shape = compute_rhs_reshaped_shape(tensor_rhs_info, rhs_info_img); + const TensorInfo tensor_reshaped_info(shape, 1, DataType::F32); + + // In case of vector by matrix with few work-items, we use the OpenCL buffer rather than the OpenCL image2d + const bool use_cl_image2d = (n <= 4) ? 
false : true;
+
+    if(bool(validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info_img)) && use_cl_image2d)
+    {
+        return std::make_pair(lhs_info_img, rhs_info_img);
+    }
+    else
+    {
+        return std::make_pair(lhs_info_buf, rhs_info_buf);
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
+    const float r_mk     = static_cast<float>(m) / static_cast<float>(k);
+
+    if(workload <= 1595.2000f)
+    {
+        if(r_mk <= 2.1044f)
+        {
+            if(workload <= 870.4000f)
+            {
+                return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 2, true, false, true, false, false);
+            }
+            else
+            {
+                return configure_lhs_rhs_info(m, n, 4, 2, 4, 2, 2, false, false, true, false, false);
+            }
+        }
+        else
+        {
+            return configure_lhs_rhs_info(m, n, 4, 2, 4, 2, 2, false, false, true, false, false);
+        }
+    }
+    else
+    {
+        return configure_lhs_rhs_info(m, n, 4, 8, 4, 4, 2, true, true, true, false, false);
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    if(n <= 4)
+    {
+        return configure_lhs_rhs_info(m, n, 4, 2, 16, 4, 1, false, false, false, true);
+    }
+    else
+    {
+        return configure_lhs_rhs_info(m, n, 4, 4, 16, 2, 2, false, true, false, true);
+    }
+}
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
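For context on the buffer-versus-image choice made above: select_lhs_rhs_info() keeps the image2d configuration only when the reshaped RHS matrix can legally be exported to a cl_image on the current device, and otherwise falls back to the buffer configuration. A condensed sketch of that helper, reconstructed from its call sites (the authoritative definition lives in ClGemmHelpers.cpp and may differ in detail):

    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> select_lhs_rhs_info(std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_img,
                                                                        std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_buf,
                                                                        unsigned int n, unsigned int k, unsigned int b, DataType data_type)
    {
        // Build the TensorInfo the reshaped RHS would have and keep the image2d
        // configuration only if cl_image export is valid for it.
        const TensorInfo  tensor_rhs_info(TensorShape(n, k, b), 1, data_type);
        const TensorShape shape = misc::shape_calculator::compute_rhs_reshaped_shape(tensor_rhs_info, info_img.second);
        const TensorInfo  tensor_reshaped_info(shape, 1, data_type);
        return bool(validate_image2d_support_on_rhs(tensor_reshaped_info, info_img.second)) ? info_img : info_buf;
    }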
diff --git a/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h b/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h
new file mode 100644
index 0000000000..52e6ce3f48
--- /dev/null
+++ b/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_BIFROST_H
+#define ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_BIFROST_H
+
+#include "src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+/** Bifrost based OpenCL GEMMReshaped configuration */
+class ClGemmDefaultConfigReshapedBifrost final : public IClGemmKernelConfig
+{
+public:
+    /** Constructor
+     *
+     * @param[in] gpu GPU target
+     */
+    ClGemmDefaultConfigReshapedBifrost(GPUTarget gpu);
+
+    // Inherited overridden method
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
+
+private:
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+};
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_BIFROST_H */
diff --git a/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp b/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp
new file mode 100644
index 0000000000..57e42c92b3
--- /dev/null
+++ b/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp
@@ -0,0 +1,538 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ +#include "src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/GPUTarget.h" +#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h" + +#include + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace gemm +{ +ClGemmDefaultConfigReshapedValhall::ClGemmDefaultConfigReshapedValhall(GPUTarget gpu) + : IClGemmKernelConfig(gpu) +{ +} + +std::pair ClGemmDefaultConfigReshapedValhall::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) +{ + using ConfigurationFunctionExecutorPtr = std::pair (ClGemmDefaultConfigReshapedValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + + CLGEMMConfigArray configs_G77(&ClGemmDefaultConfigReshapedValhall::configure_G77_f32, + &ClGemmDefaultConfigReshapedValhall::configure_G77_f16, + &ClGemmDefaultConfigReshapedValhall::configure_G77_u8); + + CLGEMMConfigArray configs_G78(&ClGemmDefaultConfigReshapedValhall::configure_G78_f32, + &ClGemmDefaultConfigReshapedValhall::configure_G78_f16, + &ClGemmDefaultConfigReshapedValhall::configure_G77_u8); + + ConfigurationFunctionExecutorPtr func = nullptr; + + switch(_target) + { + case GPUTarget::G78: + func = configs_G78.get_function(data_type); + break; + case GPUTarget::G77: + default: + func = configs_G77.get_function(data_type); + break; + } + + ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not support for GEMM"); + return (this->*func)(m, n, k, b); +} + +std::pair ClGemmDefaultConfigReshapedValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + if(n <= 4) + { + return configure_lhs_rhs_info(m, n, 4, 2, 8, 16, 16, 1, 0, 0, 1); + } + else + { + return configure_lhs_rhs_info(m, n, 5, 4, 4, 2, 16, 0, 1, 0, 1); + } +} + +std::pair ClGemmDefaultConfigReshapedValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + const float r_mn = static_cast(m) / static_cast(n); + const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; + const float r_mk = static_cast(m) / static_cast(k); + const float r_nk = static_cast(n) / static_cast(k); + + GEMMLHSMatrixInfo lhs_info_buf; + GEMMRHSMatrixInfo rhs_info_buf; + GEMMLHSMatrixInfo lhs_info_img; + GEMMRHSMatrixInfo rhs_info_img; + + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 0); + + if(r_mk <= 0.11824845522642136) + { + if(workload <= 880.0) + { + return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 4, 0, 0, 1, 0, 0); + } + else + { + if(r_nk <= 0.42521367967128754) + { + if(workload <= 1726.4000244140625) + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 0); + } + else + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F16); + } + } + else + { + if(workload <= 1241.6000366210938) + { + return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 4, 0, 0, 1, 0, 0); + } + else + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 0); + } + } + } + } + else + { + if(workload <= 11404.7998046875) + { + if(r_mk <= 1.0126488208770752) + { + if(r_mn <= 2.545312523841858) + { + 
std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F16); + } + else + { + return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 4, 0, 0, 1, 0, 0); + } + } + else + { + if(workload <= 2881.199951171875) + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, 0, 0, 1, 0, 1); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F16); + } + else + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F16); + } + } + } + else + { + if(r_nk <= 0.5765306055545807) + { + if(r_mn <= 6.010416746139526) + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F16); + } + else + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 1, 0, 1, 0, 1); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F16); + } + } + else + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 1, 0, 1, 0, 1); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F16); + } + } + } +} + +std::pair ClGemmDefaultConfigReshapedValhall::configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + const float r_mn = static_cast(m) / static_cast(n); + const float r_mk = static_cast(m) / static_cast(k); + const float r_nk = static_cast(n) / static_cast(k); + const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; + + if(workload <= 1288.0000f) + { + if(workload <= 505.6000f) + { + if(r_mn <= 0.4466f) + { + if(r_nk <= 0.2384f) + { + return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1); + } + else + { + return configure_lhs_rhs_info(m, n, 2, 2, 4, 2, 2, 0, 0, 1, 0, 0); + } + } + else + { + return configure_lhs_rhs_info(m, n, 2, 2, 4, 2, 2, 0, 0, 1, 0, 0); + } + } + else + { + if(r_mn <= 0.2250f) + { + if(r_mn <= 0.1599f) + { + return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1); + } + else + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); + } + } + else + { + if(r_mk <= 0.7609f) + { + if(r_mn <= 2.5453f) + { + if(workload <= 1089.6000f) + { + return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1); + } + else + { + return configure_lhs_rhs_info(m, n, 2, 4, 8, 2, 4, 0, 0, 1, 0, 1); + } + } + else + { + return configure_lhs_rhs_info(m, n, 2, 4, 16, 4, 4, 0, 0, 1, 0, 1); + } + } + else + { + return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1); + } + } + } + } + else + { + if(workload <= 5434.4001f) + { + if(workload <= 1603.2000f) + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); + } + else + { + if(r_nk <= 0.6192f) + { + if(r_mn <= 16.1016f) + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); + } + else + { + if(workload <= 2750.0000f) + { + return 
configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); + } + else + { + if(r_mk <= 6.3151f) + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 0, 1, 1); + } + else + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); + } + } + } + } + else + { + if(r_mk <= 0.0387f) + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 1); + } + else + { + if(r_mk <= 2.5859f) + { + if(r_mk <= 0.2734f) + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 1); + } + else + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); + } + } + else + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); + } + } + } + } + } + else + { + if(r_mk <= 25.7500f) + { + if(r_mk <= 0.3615f) + { + if(r_mn <= 0.0913f) + { + if(r_mk <= 0.0683f) + { + return configure_lhs_rhs_info(m, n, 8, 4, 4, 4, 2, 0, 0, 1, 0, 1); + } + else + { + return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1); + } + } + else + { + return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1); + } + } + else + { + if(workload <= 11174.3999f) + { + if(r_mk <= 0.8047f) + { + return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1); + } + else + { + if(workload <= 7185.5999f) + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 1); + } + else + { + return configure_lhs_rhs_info(m, n, 8, 4, 4, 4, 2, 0, 0, 1, 0, 1); + } + } + } + else + { + if(workload <= 17917.5000f) + { + if(r_mk <= 1.5078f) + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); + } + else + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 1); + } + } + else + { + if(workload <= 34449.6016f) + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); + } + else + { + return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 4, 0, 0, 1, 0, 1); + } + } + } + } + } + else + { + if(r_mk <= 331.1111f) + { + if(workload <= 53397.5996f) + { + if(r_mn <= 57.8063f) + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); + } + else + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 0, 1, 1); + } + } + else + { + if(r_nk <= 0.9211f) + { + return configure_lhs_rhs_info(m, n, 8, 4, 4, 4, 2, 0, 0, 1, 0, 1); + } + else + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 0, 1, 1); + } + } + } + else + { + if(workload <= 38070.4004f) + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 0, 1, 1); + } + else + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); + } + } + } + } + } +} + +std::pair ClGemmDefaultConfigReshapedValhall::configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + const float r_mn = static_cast(m) / static_cast(n); + const float r_nk = static_cast(n) / static_cast(k); + const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; + + if(workload <= 801.6000f) + { + return configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, 0, 0, 1, 0, 1); + } + else + { + if(r_mn <= 0.1211f) + { + if(workload <= 3296.0000f) + { + return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1); + } + else + { + if(r_nk <= 1.0625f) + { + return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1); + } + else + { + return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 4, 0, 0, 1, 0, 1); + } + } + } + else + { + if(workload <= 5068.8000f) + { + return configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, 0, 0, 1, 0, 1); + } + else + { + if(r_nk <= 0.2361f) + { + if(workload <= 12630.0000f) + { + return 
configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, 0, 0, 1, 0, 1); + } + else + { + return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 1, 0, 0, 1, 0, 1); + } + } + else + { + if(workload <= 178790.3984f) + { + return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1); + } + else + { + return configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, 0, 0, 1, 0, 1); + } + } + } + } + } +} + +std::pair ClGemmDefaultConfigReshapedValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + if(n <= 4) + { + return configure_lhs_rhs_info(m, n, 4, 2, 16, 4, 1, 0, 0, 0, 1); + } + else + { + return configure_lhs_rhs_info(m, n, 4, 4, 16, 2, 2, 0, 1, 0, 1); + } +} +} // namespace gemm +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h b/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h new file mode 100644 index 0000000000..588cd64e0e --- /dev/null +++ b/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#ifndef ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_VALHALL_H
+#define ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_VALHALL_H
+
+#include "src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+/** Valhall based OpenCL GEMMReshaped configuration */
+class ClGemmDefaultConfigReshapedValhall final : public IClGemmKernelConfig
+{
+public:
+    /** Constructor
+     *
+     * @param[in] gpu GPU target
+     */
+    ClGemmDefaultConfigReshapedValhall(GPUTarget gpu);
+
+    // Inherited overridden method
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
+
+private:
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+};
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_VALHALL_H */
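All of the configure() methods in these classes dispatch through CLGEMMConfigArray, the small per-data-type function table declared in IClGemmKernelConfig.h. A sketch of its likely shape, reconstructed from the call sites (slot order assumed to be F32, F16, 8-bit quantized; it assumes arm_compute's DataType and <array>):

    template <class T>
    class CLGEMMConfigArray
    {
    public:
        CLGEMMConfigArray(T func_f32, T func_f16, T func_int8)
            : _configs{ func_f32, func_f16, func_int8 }
        {
        }

        // Returns nullptr for unsupported data types; the callers turn that
        // into an ARM_COMPUTE_ERROR_ON_MSG failure.
        T get_function(DataType data_type)
        {
            switch(data_type)
            {
                case DataType::F32:
                    return _configs[0];
                case DataType::F16:
                    return _configs[1];
                case DataType::QASYMM8:
                case DataType::QASYMM8_SIGNED:
                    return _configs[2];
                default:
                    return nullptr;
            }
        }

    private:
        std::array<T, 3> _configs;
    };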
diff --git a/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h b/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h
new file mode 100644
index 0000000000..c990c89a91
--- /dev/null
+++ b/src/core/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMM_RESHAPED_KERNEL_CONFIGURATION_H
+#define ARM_COMPUTE_CL_GEMM_RESHAPED_KERNEL_CONFIGURATION_H
+
+#include "src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
+#include "src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h"
+#include "src/core/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+/** CLGEMMReshaped factory class */
+class ClGemmReshapedKernelConfigurationFactory final
+{
+public:
+    /** Static method to call the CLGEMMReshaped kernel configuration class according to the GPU target
+     *
+     * @param[in] gpu GPU target
+     *
+     * @return CLGEMMReshaped kernel configuration class
+     */
+    static std::unique_ptr<IClGemmKernelConfig> create(GPUTarget gpu)
+    {
+        switch(get_arch_from_target(gpu))
+        {
+            case GPUTarget::MIDGARD:
+            case GPUTarget::BIFROST:
+                return std::make_unique<ClGemmDefaultConfigReshapedBifrost>(gpu);
+            case GPUTarget::VALHALL:
+                return std::make_unique<ClGemmDefaultConfigReshapedValhall>(gpu);
+            default:
+                ARM_COMPUTE_ERROR("Not supported GPU target");
+        }
+    }
+};
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMM_RESHAPED_KERNEL_CONFIGURATION_H */
diff --git a/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp b/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp
new file mode 100644
index 0000000000..7ed6b39f3e
--- /dev/null
+++ b/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp
@@ -0,0 +1,518 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ +#include "src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/TensorShape.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h" + +#include + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace gemm +{ +using namespace arm_compute::misc::shape_calculator; + +ClGemmDefaultConfigReshapedRhsOnlyBifrost::ClGemmDefaultConfigReshapedRhsOnlyBifrost(GPUTarget gpu) + : IClGemmKernelConfig(gpu) +{ +} + +std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) +{ + using ConfigurationFunctionExecutorPtr = std::pair (ClGemmDefaultConfigReshapedRhsOnlyBifrost::*)(unsigned int m, unsigned int n, unsigned int k, + unsigned int b); + + CLGEMMConfigArray configs_G51(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f32, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f16, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_u8); + + CLGEMMConfigArray configs_G52(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f32, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f16, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8); + + CLGEMMConfigArray configs_G76(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f32, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f16, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_u8); + + CLGEMMConfigArray configs_G7x(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8); + + ConfigurationFunctionExecutorPtr func = nullptr; + + switch(_target) + { + case GPUTarget::G76: + func = configs_G76.get_function(data_type); + break; + case GPUTarget::G51: + func = configs_G51.get_function(data_type); + break; + case GPUTarget::G52: + func = configs_G52.get_function(data_type); + break; + default: + func = configs_G7x.get_function(data_type); + break; + } + + ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not support for GEMM"); + return (this->*func)(m, n, k, b); +} + +std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + if(m == 1) + { + if(n <= 2548) + { + return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 4, false, true, false, true, false); + } + else + { + return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 8, false, true, false, true, false); + } + } + else + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 4, false, true, false, true); + } +} + +std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + GEMMLHSMatrixInfo lhs_info_buf; + GEMMRHSMatrixInfo rhs_info_buf; + GEMMLHSMatrixInfo lhs_info_img; + GEMMRHSMatrixInfo rhs_info_img; + + const bool is_workload_big = ((m * n * b) / 16) >= 2048; + + if(m == 1) + { + if(n >= 8192) + { + const unsigned int h0 = std::max(n / 4, 1U); + return configure_lhs_rhs_info(m, n, 1, 4, 8, 1, h0, false, true, false, true, 
false); + } + else + { + const unsigned int h0 = std::max(n / 2, 1U); + if(n <= 204) + { + return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, false, true, false, true, false); + } + else + { + return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, h0, false, true, false, true, false); + } + } + } + else + { + const int h0 = std::max(std::min(static_cast(n / 4), static_cast(16)), static_cast(1)); + if(is_workload_big) + { + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, true); + } + else + { + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, false, true, false, true); + } + } + + // Get lhs_info/rhs_info in case of OpenCL image + const int h0 = std::max(std::min(static_cast(n / 4), static_cast(16)), static_cast(1)); + if(is_workload_big) + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, false, true); + } + else + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, false, true, false, true, true); + } + + const TensorInfo tensor_rhs_info(TensorShape(n, k, b), 1, DataType::F32); + const TensorShape shape = compute_rhs_reshaped_shape(tensor_rhs_info, rhs_info_img); + const TensorInfo tensor_reshaped_info(shape, 1, DataType::F32); + + // In case of vector by matrix or small workloads, we use the OpenCL buffer rather than the OpenCL image2d + const bool use_cl_image2d = ((m == 1) || ((((m * n * b) / 16) < 2048) && n < 128)) ? false : true; + + if(bool(validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info_img)) && use_cl_image2d) + { + return std::make_pair(lhs_info_img, rhs_info_img); + } + else + { + return std::make_pair(lhs_info_buf, rhs_info_buf); + } +} + +std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; + const float r_nk = static_cast(n) / static_cast(k); + + GEMMLHSMatrixInfo lhs_info_buf; + GEMMRHSMatrixInfo rhs_info_buf; + GEMMLHSMatrixInfo lhs_info_img; + GEMMRHSMatrixInfo rhs_info_img; + + if(m == 1) + { + if(r_nk <= 0.4664f) + { + return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 16, false, true, false, true, false); + } + else + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, false, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, false, true, false, true, false); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F32); + } + } + else + { + if(workload <= 274.4000f) + { + return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 16, false, false, false, true, false); + } + else + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, false, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, false, false, true, false); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F32); + } + } +} + +std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + if(m == 1) + { + const unsigned int n0 = n < 1280 ? 
2 : 4; + const unsigned int h0 = std::max(n / n0, 1U); + return configure_lhs_rhs_info(m, n, 1, n0, 4, 1, h0, false, true, false, true); + } + else + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, true, false, true); + } +} + +std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + if(m == 1) + { + if(n > 2048) + { + const unsigned int h0 = std::max(n / 4, 1U); + return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, h0, false, true, false, true); + } + else + { + const unsigned int h0 = std::max(n / 2, 1U); + return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, h0, false, true, false, true); + } + } + else + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 4, false, true, false, true); + } +} + +std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + const float r_mn = static_cast(m) / static_cast(n); + const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; + const float r_mk = static_cast(m) / static_cast(k); + const float r_nk = static_cast(n) / static_cast(k); + + GEMMLHSMatrixInfo lhs_info_buf; + GEMMRHSMatrixInfo rhs_info_buf; + GEMMLHSMatrixInfo lhs_info_img; + GEMMRHSMatrixInfo rhs_info_img; + + if(m == 1) + { + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, false); + + if(r_mk <= 0.0026f) + { + if(r_nk <= 0.4664f) + { + return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, false, true, false, true, false); + } + else + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, true); + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F16); + } + } + else + { + if(r_mk <= 0.0148f) + { + return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, false, true, false, true, false); + } + else + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, true); + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F16); + } + } + } + else + { + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 5, 8, 4, 1, 2, false, false, false, false, false); + + if(workload <= 362.6000f) + { + return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 16, false, false, false, true, false); + } + else + { + if(r_mn <= 22.6067f) + { + if(workload <= 708.8000f) + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, false, false, false, true); + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F16); + } + else + { + return configure_lhs_rhs_info(m, n, 5, 8, 2, 1, 16, false, false, false, false, false); + } + } + else + { + if(r_nk <= 0.0917f) + { + return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 16, false, false, false, true, false); + } + else + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, false, false, false, true); + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F16); + } + } + } + } +} + +std::pair 
ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + + if(m == 1) + { + return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, false, true, false, true, false); + } + else + { + const float r_mn = static_cast(m) / static_cast(n); + const float workload = (static_cast(m) * static_cast(n) * static_cast(b)) / 20.0f; + + if(workload <= 7449.60f) + { + if(workload <= 691.60f) + { + return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 8, false, false, false, false, false); + } + else + { + if(workload <= 4155.20f) + { + return configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false); + } + else + { + return configure_lhs_rhs_info(m, n, 5, 8, 2, 1, 32, false, false, false, false, false); + } + } + } + else + { + if(workload <= 16300.80f) + { + if(r_mn <= 44.56f) + { + GEMMLHSMatrixInfo lhs_info_buf; + GEMMRHSMatrixInfo rhs_info_buf; + GEMMLHSMatrixInfo lhs_info_img; + GEMMRHSMatrixInfo rhs_info_img; + + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, false, true, false, false, true); + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F16); + } + else + { + return configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false); + } + } + else + { + GEMMLHSMatrixInfo lhs_info_buf; + GEMMRHSMatrixInfo rhs_info_buf; + GEMMLHSMatrixInfo lhs_info_img; + GEMMRHSMatrixInfo rhs_info_img; + + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, true, false, false, true); + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F16); + } + } + } +} + +std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + if(m == 1) + { + const unsigned int n0 = n < 1280 ? 
2 : 4; + const unsigned int h0 = std::max(n / n0, 1U); + return configure_lhs_rhs_info(m, n, 1, n0, 8, 1, h0, false, true, false, true); + } + else + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, true, false, true); + } +} + +std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + if(dot8_supported(CLKernelLibrary::get().get_device())) + { + if(m == 1) + { + const unsigned int h0 = std::max(n / 2, 1U); + return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, false, true, false, true); + } + else + { + const unsigned int h0 = std::max(n / 4, 1U); + return configure_lhs_rhs_info(m, n, 4, 4, 16, 1, h0, false, true, false, true); + } + } + else + { + const int h0 = std::max(std::min(static_cast(n / 2), static_cast(128)), static_cast(1)); + if(m == 1) + { + return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, h0, false, true, false, true); + } + else + { + return configure_lhs_rhs_info(m, n, 4, 2, 16, 1, h0, false, true, false, true); + } + } +} + +std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + if(m == 1) + { + const unsigned int h0 = std::max(n / 2, 1U); + return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, false, true, false, true); + } + else + { + return configure_lhs_rhs_info(m, n, 4, 4, 16, 1, 2, false, true, false, true); + } +} + +std::pair ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + if(m == 1) + { + const unsigned int h0 = std::max(n / 2, 1U); + return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, h0, false, true, false, true); + } + else + { + const unsigned int h0 = std::max(n / 2, 1U); + return configure_lhs_rhs_info(m, n, 4, 2, 16, 1, h0, false, true, false, true); + } +} + +} // namespace gemm +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h b/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h new file mode 100644 index 0000000000..7b1a1fb04d --- /dev/null +++ b/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
diff --git a/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h b/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h
new file mode 100644
index 0000000000..7b1a1fb04d
--- /dev/null
+++ b/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_RHS_ONLY_BIFROST_H
+#define ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_RHS_ONLY_BIFROST_H
+
+#include "src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+/** Bifrost based OpenCL GEMMReshapedOnlyRHS configuration */
+class ClGemmDefaultConfigReshapedRhsOnlyBifrost final : public IClGemmKernelConfig
+{
+public:
+    /** Constructor
+     *
+     * @param[in] gpu GPU target
+     */
+    ClGemmDefaultConfigReshapedRhsOnlyBifrost(GPUTarget gpu);
+
+    // Inherited overridden method
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
+
+private:
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G51_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G51_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G51_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+};
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_RHS_ONLY_BIFROST_H */
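For orientation, a hedged usage sketch of the class declared above; the G76 target and the GEMM shape are illustrative, and production code reaches this class through the factory header at the end of the patch:

    // Pick the Bifrost heuristic for a Mali-G76 and query the block configuration
    // for a 64x256 GEMM with k = 128 and a single batch of F16 data.
    ClGemmDefaultConfigReshapedRhsOnlyBifrost config(GPUTarget::G76);
    GEMMLHSMatrixInfo lhs_info;
    GEMMRHSMatrixInfo rhs_info;
    std::tie(lhs_info, rhs_info) = config.configure(64, 256, 128, 1, DataType::F16);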
diff --git a/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp b/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp
new file mode 100644
index 0000000000..4c6e633896
--- /dev/null
+++ b/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp
@@ -0,0 +1,570 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+using namespace arm_compute::misc::shape_calculator;
+
+ClGemmDefaultConfigReshapedRhsOnlyValhall::ClGemmDefaultConfigReshapedRhsOnlyValhall(GPUTarget gpu)
+    : IClGemmKernelConfig(gpu)
+{
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
+{
+    using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (ClGemmDefaultConfigReshapedRhsOnlyValhall::*)(unsigned int m, unsigned int n, unsigned int k,
+                                                                                                                                            unsigned int b);
+
+    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G77(&ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32,
+                                                                    &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f16,
+                                                                    &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8);
+
+    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G78(&ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f32,
+                                                                    &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f16,
+                                                                    &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8);
+
+    ConfigurationFunctionExecutorPtr func = nullptr;
+
+    switch(_target)
+    {
+        case GPUTarget::G78:
+            func = configs_G78.get_function(data_type);
+            break;
+        case GPUTarget::G77:
+        default:
+            func = configs_G77.get_function(data_type);
+            break;
+    }
+
+    ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for GEMM");
+    return (this->*func)(m, n, k, b);
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    if(m == 1)
+    {
+        const float r_mn = static_cast<float>(m) / static_cast<float>(n);
+        const float r_mk = static_cast<float>(m) / static_cast<float>(k);
+
+        if(r_mk <= 0.0064484127797186375)
+        {
+            if(r_mn <= 0.0028273810748942196)
+            {
+                GEMMLHSMatrixInfo lhs_info_buf;
+                GEMMRHSMatrixInfo rhs_info_buf;
+                GEMMLHSMatrixInfo lhs_info_img;
+                GEMMRHSMatrixInfo rhs_info_img;
+
+                const unsigned int h0 = std::max(n / 4, 1U);
+                std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, 0, 1, 0, 0, 1);
+                std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 4, 4, 1, h0, 0, 1, 0, 1, 0);
+
+                return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+                                           std::make_pair(lhs_info_buf, rhs_info_buf),
+                                           n, k, b, DataType::F32);
+            }
+            else
+            {
+                return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 8, 0, 1, 0, 0, 0);
+            }
+        }
+        else
+        {
+            if(r_mk <= 0.020312500186264515)
+            {
+                return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 4, 0, 1, 0, 0, 0);
+            }
+            else
+            {
+                return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, 0, 1, 0, 1, 0);
+            }
+        }
+    }
+    else
+    {
+        const float r_mn     = static_cast<float>(m) / static_cast<float>(n);
+        const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
+        const float r_mk     = static_cast<float>(m) / static_cast<float>(k);
+
+        if(workload <= 1999.2000122070312)
+        {
+            if(workload <= 747.1999816894531)
+            {
+                return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0);
+            }
+            else
+            {
+                GEMMLHSMatrixInfo lhs_info_buf;
+                GEMMRHSMatrixInfo rhs_info_buf;
+                GEMMLHSMatrixInfo lhs_info_img;
+                GEMMRHSMatrixInfo rhs_info_img;
+                std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 2, 4, 8, 1, 2, 0, 0, 0, 1, 1);
+                std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0);
+
+                return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+                                           std::make_pair(lhs_info_buf, rhs_info_buf),
+                                           n, k, b, DataType::F32);
+            }
+        }
+        else
+        {
+            if(r_mn <= 0.03348214365541935)
+            {
+                if(r_mk <= 0.028125000186264515)
+                {
+                    return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0);
+                }
+                else
+                {
+                    GEMMLHSMatrixInfo lhs_info_buf;
+                    GEMMRHSMatrixInfo rhs_info_buf;
+                    GEMMLHSMatrixInfo lhs_info_img;
+                    GEMMRHSMatrixInfo rhs_info_img;
+                    std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 2, 4, 8, 1, 2, 0, 0, 0, 1, 1);
+                    std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0);
+
+                    return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+                                               std::make_pair(lhs_info_buf, rhs_info_buf),
+                                               n, k, b, DataType::F32);
+                }
+            }
+            else
+            {
+                GEMMLHSMatrixInfo lhs_info_buf;
+                GEMMRHSMatrixInfo rhs_info_buf;
+                GEMMLHSMatrixInfo lhs_info_img;
+                GEMMRHSMatrixInfo rhs_info_img;
+                std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, 0, 1, 0, 0, 1);
+                std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 16, 0, 1, 0, 1, 0);
+
+                return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+                                           std::make_pair(lhs_info_buf, rhs_info_buf),
+                                           n, k, b, DataType::F32);
+            }
+        }
+    }
+}
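+
+// Example of how the ratios above steer the m == 1 path: for m = 1, n = 1000,
+// k = 512, r_mk = 1 / 512 ~= 0.00195 and r_mn = 1 / 1000 = 0.001, so both
+// thresholds pass; an image2d candidate with h0 = 16 and a buffer candidate with
+// h0 = max(1000 / 4, 1) = 250 are built, and select_lhs_rhs_info() keeps the
+// image2d pair only if the reshaped RHS qualifies for cl_image import.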
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    if(m == 1)
+    {
+        const unsigned int h0 = std::max(n / 2, 1U);
+        if(n <= 836.0)
+        {
+            return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, 0, 1, 0, 1, 0);
+        }
+        else
+        {
+            return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, h0, 0, 1, 0, 1, 0);
+        }
+    }
+    else if(m < 128)
+    {
+        const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(256)), static_cast<int>(1));
+        if(k >= 512)
+        {
+            return configure_lhs_rhs_info(m, n, 2, 4, 16, 1, h0, 0, 1, 0, 0);
+        }
+        else
+        {
+            return configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, 0, 1, 0, 0);
+        }
+    }
+    else
+    {
+        const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(256)), static_cast<int>(1));
+        if(n >= 64)
+        {
+            return configure_lhs_rhs_info(m, n, 4, 8, 4, 1, h0, 0, 1, 0, 0);
+        }
+        else
+        {
+            if(k >= 512)
+            {
+                return configure_lhs_rhs_info(m, n, 2, 4, 16, 1, h0, 0, 1, 0, 0);
+            }
+            else
+            {
+                return configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, 0, 1, 0, 0);
+            }
+        }
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    if(m == 1)
+    {
+        const unsigned int h0 = std::max(n / 2, 1U);
+        return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, h0, 0, 1, 0, 1);
+    }
+    else
+    {
+        const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(256)), static_cast<int>(1));
+        if(m >= 28)
+        {
+            return configure_lhs_rhs_info(m, n, 4, 4, 16, 1, h0, 0, 1, 0, 1);
+        }
+        else
+        {
+            return configure_lhs_rhs_info(m, n, 2, 4, 16, 1, h0, 0, 1, 0, 1);
+        }
+    }
+}
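+
+// The h0 clamp used above, std::max(std::min(static_cast<int>(n / 4), static_cast<int>(256)), static_cast<int>(1)),
+// keeps the RHS h0 factor in [1, 256]: e.g. n = 2048 gives min(512, 256) = 256,
+// while n = 8 gives max(min(2, 256), 1) = 2.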
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    const float r_mn     = static_cast<float>(m) / static_cast<float>(n);
+    const float r_mk     = static_cast<float>(m) / static_cast<float>(k);
+    const float r_nk     = static_cast<float>(n) / static_cast<float>(k);
+    const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
+
+    if(m == 1)
+    {
+        if(workload <= 278.7000f)
+        {
+            if(workload <= 7.5000f)
+            {
+                return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0);
+            }
+            else
+            {
+                if(r_mn <= 0.0031f)
+                {
+                    if(workload <= 256.6000f)
+                    {
+                        if(workload <= 16.7500f)
+                        {
+                            if(r_nk <= 1.6671f)
+                            {
+                                return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0);
+                            }
+                            else
+                            {
+                                return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0);
+                            }
+                        }
+                        else
+                        {
+                            return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0);
+                        }
+                    }
+                    else
+                    {
+                        return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0);
+                    }
+                }
+                else
+                {
+                    if(r_mk <= 0.0027f)
+                    {
+                        if(r_mk <= 0.0014f)
+                        {
+                            return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0);
+                        }
+                        else
+                        {
+                            if(workload <= 8.9500f)
+                            {
+                                return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0);
+                            }
+                            else
+                            {
+                                return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0);
+                            }
+                        }
+                    }
+                    else
+                    {
+                        if(workload <= 14.1500f)
+                        {
+                            return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0);
+                        }
+                        else
+                        {
+                            if(r_mk <= 0.0041f)
+                            {
+                                return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0);
+                            }
+                            else
+                            {
+                                return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        else
+        {
+            if(workload <= 363.7000f)
+            {
+                if(r_mk <= 0.0031f)
+                {
+                    return configure_lhs_rhs_info(m, n, 1, 4, 2, 1, 32, 0, 1, 0, 1, 0);
+                }
+                else
+                {
+                    return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 32, 0, 1, 0, 1, 0);
+                }
+            }
+            else
+            {
+                return configure_lhs_rhs_info(m, n, 1, 4, 2, 1, 32, 0, 1, 0, 1, 0);
+            }
+        }
+    }
+    else
+    {
+        if(workload <= 1384.8000f)
+        {
+            if(workload <= 704.0000f)
+            {
+                return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 32, 0, 1, 0, 1, 0);
+            }
+            else
+            {
+                return configure_lhs_rhs_info(m, n, 2, 4, 8, 1, 4, 0, 0, 0, 1, 1);
+            }
+        }
+        else
+        {
+            if(workload <= 16761.6006f)
+            {
+                if(r_mn <= 187.1250f)
+                {
+                    return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 16, 0, 0, 0, 1, 1);
+                }
+                else
+                {
+                    return configure_lhs_rhs_info(m, n, 2, 4, 8, 1, 4, 0, 0, 0, 1, 1);
+                }
+            }
+            else
+            {
+                if(r_mk <= 432.4630f)
+                {
+                    return configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 16, 0, 0, 0, 1, 1);
+                }
+                else
+                {
+                    return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 16, 0, 1, 0, 1, 1);
+                }
+            }
+        }
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    const float r_mn     = static_cast<float>(m) / static_cast<float>(n);
+    const float r_mk     = static_cast<float>(m) / static_cast<float>(k);
+    const float r_nk     = static_cast<float>(n) / static_cast<float>(k);
+    const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
+
+    if(m == 1)
+    {
+        if(r_mn <= 0.0038f)
+        {
+            if(workload <= 353.9000f)
+            {
+                if(workload <= 278.7000f)
+                {
+                    return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 32, 0, 0, 1, 0, 0);
+                }
+                else
+                {
+                    if(r_mk <= 0.0004f)
+                    {
+                        return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 32, 0, 0, 1, 0, 0);
+                    }
+                    else
+                    {
+                        if(r_mk <= 0.0030f)
+                        {
+                            return configure_lhs_rhs_info(m, n, 1, 8, 4, 1, 8, 0, 1, 1, 0, 1);
+                        }
+                        else
+                        {
+                            return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 32, 0, 0, 1, 0, 0);
+                        }
+                    }
+                }
+            }
+            else
+            {
+                if(r_nk <= 1.9384f)
+                {
+                    return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 32, 0, 0, 1, 0, 0);
+                }
+                else
+                {
+                    return configure_lhs_rhs_info(m, n, 1, 8, 4, 1, 8, 0, 1, 1, 0, 1);
+                }
+            }
+        }
+        else
+        {
+            if(r_nk <= 1.0368f)
+            {
+                return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, 0, 0, 1, 0, 0);
+            }
+            else
+            {
+                return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 32, 0, 0, 1, 0, 0);
+            }
+        }
+    }
+    else
+    {
+        if(workload <= 1422.4000f)
+        {
+            if(workload <= 704.0000f)
+            {
+                return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 32, 0, 0, 1, 0, 0);
+            }
+            else
+            {
+                if(workload <= 1197.6000f)
+                {
+                    return configure_lhs_rhs_info(m, n, 2, 4, 8, 1, 8, 0, 1, 1, 0, 1);
+                }
+                else
+                {
+                    if(workload <= 1241.6000f)
+                    {
+                        return configure_lhs_rhs_info(m, n, 2, 8, 8, 1, 16, 0, 1, 1, 0, 0);
+                    }
+                    else
+                    {
+                        return configure_lhs_rhs_info(m, n, 2, 4, 8, 1, 8, 0, 1, 1, 0, 1);
+                    }
+                }
+            }
+        }
+        else
+        {
+            if(workload <= 2769.6000f)
+            {
+                if(workload <= 1846.4000f)
+                {
+                    if(r_mn <= 2.4927f)
+                    {
+                        return configure_lhs_rhs_info(m, n, 2, 8, 8, 1, 16, 0, 1, 1, 0, 0);
+                    }
+                    else
+                    {
+                        return configure_lhs_rhs_info(m, n, 4, 4, 8, 1, 32, 0, 1, 1, 0, 0);
+                    }
+                }
+                else
+                {
+                    if(r_mn <= 0.6261f)
+                    {
+                        return configure_lhs_rhs_info(m, n, 4, 4, 8, 1, 32, 0, 1, 1, 0, 0);
+                    }
+                    else
+                    {
+                        if(r_mk <= 3.4453f)
+                        {
+                            if(r_mn <= 1.4135f)
+                            {
+                                return configure_lhs_rhs_info(m, n, 2, 8, 8, 1, 16, 0, 1, 1, 0, 0);
+                            }
+                            else
+                            {
+                                return configure_lhs_rhs_info(m, n, 4, 4, 8, 1, 32, 0, 1, 1, 0, 0);
+                            }
+                        }
+                        else
+                        {
+                            return configure_lhs_rhs_info(m, n, 2, 8, 8, 1, 16, 0, 1, 1, 0, 0);
+                        }
+                    }
+                }
+            }
+            else
+            {
+                if(r_nk <= 0.0302f)
+                {
+                    return configure_lhs_rhs_info(m, n, 2, 4, 8, 1, 8, 0, 1, 1, 0, 1);
+                }
+                else
+                {
+                    if(r_mk <= 181.3750f)
+                    {
+                        return configure_lhs_rhs_info(m, n, 4, 4, 8, 1, 32, 0, 1, 1, 0, 0);
+                    }
+                    else
+                    {
+                        if(workload <= 28035.2002f)
+                        {
+                            return configure_lhs_rhs_info(m, n, 2, 8, 8, 1, 16, 0, 1, 1, 0, 0);
+                        }
+                        else
+                        {
+                            if(r_mk <= 808.6667f)
+                            {
+                                return configure_lhs_rhs_info(m, n, 4, 4, 8, 1, 32, 0, 1, 1, 0, 0);
+                            }
+                            else
+                            {
+                                return configure_lhs_rhs_info(m, n, 2, 8, 8, 1, 16, 0, 1, 1, 0, 0);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
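+
+// The nested float thresholds in the two functions above act as a pre-tuned
+// decision tree over (workload, r_mn, r_mk, r_nk). Example for configure_G78_f32:
+// m = 1, n = 128, b = 1 gives workload = (1 * 128 * 1) / 20.0f = 6.4f <= 7.5f, so
+// the 1x2 block with k0 = 8 and h0 = 2 is selected without evaluating any ratio test.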
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h b/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h
new file mode 100644
index 0000000000..6a11ddb748
--- /dev/null
+++ b/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_RHS_ONLY_VALHALL_H
+#define ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_RHS_ONLY_VALHALL_H
+
+#include "src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+/** Valhall based OpenCL GEMMReshapedOnlyRHS configuration */
+class ClGemmDefaultConfigReshapedRhsOnlyValhall final : public IClGemmKernelConfig
+{
+public:
+    /** Constructor
+     *
+     * @param[in] gpu GPU target
+     */
+    ClGemmDefaultConfigReshapedRhsOnlyValhall(GPUTarget gpu);
+
+    // Inherited overridden method
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
+
+private:
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+};
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_RHS_ONLY_VALHALL_H */
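Both configuration files defer the final buffer-versus-cl_image choice to select_lhs_rhs_info() from ClGemmHelpers.h. A hedged sketch of that selection, modelled on the equivalent inlined logic in configure_G76_f32 earlier in the patch (the helper's exact signature in the library is an assumption):

    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> select_lhs_rhs_info_sketch(
        std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_img,
        std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_buf,
        unsigned int n, unsigned int k, unsigned int b, DataType data_type)
    {
        // Shape of the reshaped RHS matrix the image2d configuration would produce.
        const TensorInfo  tensor_rhs_info(TensorShape(n, k, b), 1, data_type);
        const TensorShape shape = compute_rhs_reshaped_shape(tensor_rhs_info, info_img.second);
        const TensorInfo  tensor_reshaped_info(shape, 1, data_type);
        // Fall back to the plain buffer configuration when cl_image import is not possible.
        if(bool(validate_image2d_support_on_rhs(tensor_reshaped_info, info_img.second)))
        {
            return info_img;
        }
        return info_buf;
    }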
diff --git a/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h b/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h
new file mode 100644
index 0000000000..8fd71276a0
--- /dev/null
+++ b/src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMM_RESHAPED_ONLY_RHS_KERNEL_CONFIGURATION_H
+#define ARM_COMPUTE_CL_GEMM_RESHAPED_ONLY_RHS_KERNEL_CONFIGURATION_H
+
+#include "src/core/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
+#include "src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h"
+#include "src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+/** CLGEMMReshapedOnlyRHS factory class */
+class ClGemmReshapedOnlyRhsKernelConfigurationFactory final
+{
+public:
+    /** Static method to call the CLGEMMReshapedOnlyRHS kernel configuration class accordingly with the GPU target
+     *
+     * @param[in] gpu GPU target
+     *
+     * @return CLGEMMReshapedOnlyRHS kernel configuration class
+     */
+    static std::unique_ptr<IClGemmKernelConfig> create(GPUTarget gpu)
+    {
+        switch(get_arch_from_target(gpu))
+        {
+            case GPUTarget::MIDGARD:
+            case GPUTarget::BIFROST:
+                return std::make_unique<ClGemmDefaultConfigReshapedRhsOnlyBifrost>(gpu);
+            case GPUTarget::VALHALL:
+                return std::make_unique<ClGemmDefaultConfigReshapedRhsOnlyValhall>(gpu);
+            default:
+                ARM_COMPUTE_ERROR("Not supported GPU target");
+        }
+    }
+};
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMM_RESHAPED_ONLY_RHS_KERNEL_CONFIGURATION_H */
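A brief, hedged end-to-end usage sketch of the factory above; the scheduler call and the GEMM shape are illustrative scaffolding, not part of this patch:

    using namespace arm_compute;
    using namespace arm_compute::opencl::kernels::gemm;

    // Resolve the per-architecture heuristic for the current device, then ask it
    // for the LHS/RHS block configuration of a 256x512 F32 GEMM with k = 64, b = 1.
    const GPUTarget target = CLScheduler::get().target();
    auto            config = ClGemmReshapedOnlyRhsKernelConfigurationFactory::create(target);
    GEMMLHSMatrixInfo lhs_info;
    GEMMRHSMatrixInfo rhs_info;
    std::tie(lhs_info, rhs_info) = config->configure(256, 512, 64, 1, DataType::F32);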