From 1ed6a144b1396297b813457016d545af1bb9d823 Mon Sep 17 00:00:00 2001 From: Jakub Sujak Date: Thu, 13 Apr 2023 21:14:42 +0100 Subject: Align naming convention of ClMatMul Ensure naming of MatMul on GPU conforms to the naming convention i.e. ClMatMul operator with the backend ClMatMulNativeKernel. Resolves: COMPMID-6015 Change-Id: I021d235b023ad17fe97bd6913e6a50d0ba4b194e Signed-off-by: Jakub Sujak Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9443 Reviewed-by: Gunes Bayir Comments-Addressed: Arm Jenkins Benchmark: Arm Jenkins Tested-by: Arm Jenkins --- Android.bp | 2 +- filelist.json | 2 +- src/gpu/cl/kernels/ClMatMulNativeKernel.cpp | 252 ++++++++++++++++++++++++ src/gpu/cl/kernels/ClMatMulNativeKernel.h | 71 +++++++ src/gpu/cl/kernels/ClNativeMatMulKernel.cpp | 252 ------------------------ src/gpu/cl/kernels/ClNativeMatMulKernel.h | 71 ------- src/gpu/cl/operators/ClMatMul.cpp | 6 +- src/gpu/cl/operators/ClMatMul.h | 6 +- tests/validation/CL/MatMulKernel.cpp | 10 +- tests/validation/fixtures/MatMulKernelFixture.h | 4 +- 10 files changed, 338 insertions(+), 338 deletions(-) create mode 100644 src/gpu/cl/kernels/ClMatMulNativeKernel.cpp create mode 100644 src/gpu/cl/kernels/ClMatMulNativeKernel.h delete mode 100644 src/gpu/cl/kernels/ClNativeMatMulKernel.cpp delete mode 100644 src/gpu/cl/kernels/ClNativeMatMulKernel.h diff --git a/Android.bp b/Android.bp index e38ea65d55..4bd307447b 100644 --- a/Android.bp +++ b/Android.bp @@ -695,8 +695,8 @@ cc_library_static { "src/gpu/cl/kernels/ClIm2ColKernel.cpp", "src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.cpp", "src/gpu/cl/kernels/ClIndirectConv2dKernel.cpp", + "src/gpu/cl/kernels/ClMatMulNativeKernel.cpp", "src/gpu/cl/kernels/ClMulKernel.cpp", - "src/gpu/cl/kernels/ClNativeMatMulKernel.cpp", "src/gpu/cl/kernels/ClPermuteKernel.cpp", "src/gpu/cl/kernels/ClPool2dKernel.cpp", "src/gpu/cl/kernels/ClPool3dKernel.cpp", diff --git a/filelist.json b/filelist.json index cf1c63b883..5418c2bfd0 
100644 --- a/filelist.json +++ b/filelist.json @@ -512,7 +512,7 @@ "MatMul": { "files": { "common": [ - "src/gpu/cl/kernels/ClNativeMatMulKernel.cpp", + "src/gpu/cl/kernels/ClMatMulNativeKernel.cpp", "src/gpu/cl/operators/ClMatMul.cpp", "src/runtime/CL/functions/CLMatMul.cpp" ] diff --git a/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp b/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp new file mode 100644 index 0000000000..47dba22e8f --- /dev/null +++ b/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp @@ -0,0 +1,252 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/ITensorPack.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" + +#include "src/common/utils/Log.h" +#include "src/core/CL/CLUtils.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" + +#include "support/Cast.h" +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +Status validate_matmul_kernel_info(const MatMulKernelInfo &matmul_kernel_info) +{ + const bool adj_lhs = matmul_kernel_info.adj_lhs; + const bool adj_rhs = matmul_kernel_info.adj_rhs; + const int m0 = matmul_kernel_info.m0; + const int n0 = matmul_kernel_info.n0; + const int k0 = matmul_kernel_info.k0; + + // Validate M0 + ARM_COMPUTE_RETURN_ERROR_ON_MSG(m0 < 1, "Only positive integers are supported for M0"); + + if(adj_lhs) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((m0 & (m0 - 1)) && (m0 != 3)) || (m0 > 16), "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed"); + } + + // Validate N0 + ARM_COMPUTE_RETURN_ERROR_ON_MSG(n0 < 1, "Only positive integers are supported for N0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((n0 & (n0 - 1)) && (n0 != 3)) || (n0 > 16), "Only 1,2,3,4,8,16 are supported for N0"); + + // Validate K0 + ARM_COMPUTE_RETURN_ERROR_ON_MSG(k0 < 1, "Only positive integers are supported for K0"); + if(!adj_lhs || adj_rhs) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((k0 & (k0 - 1)) && (k0 != 3)) || (k0 > 16), "Only 1,2,3,4,8,16 are supported for K0"); + } + + return Status{}; +} + +Status validate_input_shapes(const TensorShape &lhs_shape, const TensorShape &rhs_shape, const MatMulKernelInfo &matmul_kernel_info) +{ + const size_t lhs_k = matmul_kernel_info.adj_lhs ? 
lhs_shape.y() : lhs_shape.x(); + const size_t rhs_k = matmul_kernel_info.adj_rhs ? rhs_shape.x() : rhs_shape.y(); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_k != rhs_k, "K dimension in Lhs and Rhs matrices must match."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_shape.total_size() == 0, "Lhs tensor can't be empty"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_shape.total_size() == 0, "Rhs tensor can't be empty"); + + constexpr size_t batch_dim_start = 2; + for(size_t i = batch_dim_start; i < Coordinates::num_max_dimensions; ++i) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_shape[i] != rhs_shape[i], "Batch dimension broadcasting is not supported"); + } + + return Status{}; +} + +Status validate_export_to_cl_image(const ITensorInfo *rhs, const MatMulKernelInfo &matmul_kernel_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON(matmul_kernel_info.export_rhs_to_cl_image && rhs->lock_paddings()); + if(matmul_kernel_info.export_rhs_to_cl_image) + { + if(matmul_kernel_info.adj_rhs) + { + const int k0 = matmul_kernel_info.k0; + ARM_COMPUTE_RETURN_ERROR_ON_MSG(k0 != 4 && k0 != 8 && k0 != 16, "K0 can only be: 4, 8, and 16 for Rhs transposed"); + } + else + { + const int n0 = matmul_kernel_info.n0; + ARM_COMPUTE_RETURN_ERROR_ON_MSG(n0 != 4 && n0 != 8 && n0 != 16, "N0 can only be: 4, 8, and 16 for Rhs non-transposed"); + } + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!export_to_cl_image(rhs), "Export to CLImage is not supported for this device/configuration"); + } + + return Status {}; +} +} +ClMatMulNativeKernel::ClMatMulNativeKernel() +{ + _type = CLKernelType::GEMM; +} +Status ClMatMulNativeKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulKernelInfo &matmul_kernel_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs); + 
ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_kernel_info(matmul_kernel_info)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_input_shapes(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_export_to_cl_image(rhs, matmul_kernel_info)); + + if(output->total_size() != 0) + { + const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, output); + } + + return Status{}; +} +void ClMatMulNativeKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *output, const MatMulKernelInfo &matmul_kernel_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, output, &compile_context, &matmul_kernel_info); + ARM_COMPUTE_LOG_PARAMS(lhs, rhs, output, matmul_kernel_info); + ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, output, matmul_kernel_info)); + + // output tensor auto initialization if not yet initialized + auto_init_if_empty(*output, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info))); + + const int m = output->dimension(1); + const int n = output->dimension(0); + const int k = matmul_kernel_info.adj_lhs ? lhs->tensor_shape().y() : lhs->tensor_shape().x(); + const bool adj_lhs = matmul_kernel_info.adj_lhs; + + int m0 = adj_lhs ? 
adjust_vec_size(matmul_kernel_info.m0, m) : std::min(matmul_kernel_info.m0, m); + int n0 = adjust_vec_size(matmul_kernel_info.n0, n); + + _export_rhs_to_cl_image = matmul_kernel_info.export_rhs_to_cl_image && !rhs->lock_paddings(); + + // Configure kernel window + Window win = calculate_max_window(*output, Steps(n0, m0)); + win = win.collapse(win, Window::DimZ); + IClKernel::configure_internal(win); + + // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding. + const unsigned int partial_store_m0 = m % m0; + const unsigned int partial_store_n0 = n % n0; + + CLBuildOptions build_opts; + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(lhs->data_type())); + build_opts.add_option("-DM0=" + support::cpp11::to_string(m0)); + build_opts.add_option("-DN0=" + support::cpp11::to_string(n0)); + build_opts.add_option("-DK0=" + support::cpp11::to_string(matmul_kernel_info.k0)); + build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); + build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); + build_opts.add_option("-DK=" + support::cpp11::to_string(k)); + build_opts.add_option_if_else(_export_rhs_to_cl_image, "-DRHS_TENSOR_TYPE=IMAGE", "-DRHS_TENSOR_TYPE=BUFFER"); + + std::string kernel_name("mat_mul_native"); + kernel_name += matmul_kernel_info.adj_lhs ? "_t" : "_nt"; + kernel_name += matmul_kernel_info.adj_rhs ?
"_t" : "_nt"; + + // A macro guard to compile ONLY the kernel of interest + build_opts.add_option("-D" + upper_string(kernel_name)); + + if(_export_rhs_to_cl_image) + { + gemm::update_padding_for_cl_image(rhs); + } + + // Create kernel + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + + // Set config_id for enabling LWS tuning + _config_id = kernel_name; + _config_id += "_"; + _config_id += lower_string(string_from_data_type(lhs->data_type())); + _config_id += "_"; + _config_id += support::cpp11::to_string(m); + _config_id += "_"; + _config_id += support::cpp11::to_string(n); + _config_id += "_"; + _config_id += support::cpp11::to_string(k); + _config_id += "_"; + _config_id += support::cpp11::to_string(output->dimension(2)); + _config_id += "_"; + _config_id += support::cpp11::to_string(_export_rhs_to_cl_image); + _config_id += "_"; + _config_id += support::cpp11::to_string(m0); + _config_id += "_"; + _config_id += support::cpp11::to_string(n0); + _config_id += "_"; + _config_id += support::cpp11::to_string(matmul_kernel_info.k0); +} + +void ClMatMulNativeKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const ICLTensor *lhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const ICLTensor *rhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + ICLTensor *output = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, output); + ARM_COMPUTE_LOG_PARAMS(lhs, rhs, output); + + unsigned int idx = 0; + Window window_collapsed = window.collapse(ICLKernel::window(), Window::DimZ); + + add_3d_tensor_nhw_argument(idx, lhs); + + cl::Image2D rhs_cl_image; + if(_export_rhs_to_cl_image) + { + const size_t image_w = rhs->info()->dimension(0) / 4; + const
size_t image_h = rhs->info()->tensor_shape().total_size() / rhs->info()->dimension(0); + const TensorShape shape2d(image_w, image_h); + const size_t image_row_pitch = rhs->info()->strides_in_bytes()[1]; + + // Export cl_buffer to cl_image + rhs_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), rhs->cl_buffer(), shape2d, rhs->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); + _kernel.setArg(idx++, rhs_cl_image); + } + + add_3d_tensor_nhw_argument(idx, rhs); + add_3d_tensor_nhw_argument(idx, output); + + enqueue(queue, *this, window_collapsed, lws_hint()); +} + +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/ClMatMulNativeKernel.h b/src/gpu/cl/kernels/ClMatMulNativeKernel.h new file mode 100644 index 0000000000..50aa3b70e4 --- /dev/null +++ b/src/gpu/cl/kernels/ClMatMulNativeKernel.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_GPU_CL_KERNELS_CLMATMULNATIVEKERNEL +#define ACL_SRC_GPU_CL_KERNELS_CLMATMULNATIVEKERNEL + +#include "arm_compute/core/KernelDescriptors.h" +#include "src/core/common/Macros.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +class ClMatMulNativeKernel : public IClKernel +{ +public: + ClMatMulNativeKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClMatMulNativeKernel); + /** Initialise the kernel's input and output. + * + * @param[in] compile_context The compile context to be used. + * @param[in] lhs Input tensor for the LHS matrix. Data type supported: F32/F16. + * Dimensions above 2 are collapsed onto dimension 2 and represent the batch. + * @param[in] rhs Input tensor for the RHS matrix. Data type supported: same as @p lhs. + * Dimensions above 2 are collapsed onto dimension 2 and represent the batch. + * @param[out] output Output tensor info. 
Data type supported: same as @p lhs + * @param[in] matmul_info Attributes for Batch MatMul Kernel + */ + void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *output, const MatMulKernelInfo &matmul_info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClMatMulNativeKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulKernelInfo &matmul_info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; + +private: + bool _export_rhs_to_cl_image{ false }; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ACL_SRC_GPU_CL_KERNELS_CLMATMULNATIVEKERNEL */ diff --git a/src/gpu/cl/kernels/ClNativeMatMulKernel.cpp b/src/gpu/cl/kernels/ClNativeMatMulKernel.cpp deleted file mode 100644 index c1f150d7aa..0000000000 --- a/src/gpu/cl/kernels/ClNativeMatMulKernel.cpp +++ /dev/null @@ -1,252 +0,0 @@ -/* - * Copyright (c) 2023 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/gpu/cl/kernels/ClNativeMatMulKernel.h" - -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/ITensorPack.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" - -#include "src/common/utils/Log.h" -#include "src/core/CL/CLUtils.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" -#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" - -#include "support/Cast.h" -#include "support/StringSupport.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -namespace -{ -Status validate_matmul_kernel_info(const MatMulKernelInfo &matmul_kernel_info) -{ - const bool adj_lhs = matmul_kernel_info.adj_lhs; - const bool adj_rhs = matmul_kernel_info.adj_rhs; - const int m0 = matmul_kernel_info.m0; - const int n0 = matmul_kernel_info.n0; - const int k0 = matmul_kernel_info.k0; - - // Validate M0 - ARM_COMPUTE_RETURN_ERROR_ON_MSG(m0 < 1, "Only positive integers are supported for M0"); - - if(adj_lhs) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((m0 & (m0 - 1)) && (m0 != 3)) || (m0 > 16), "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed"); - } - - // Validate N0 - ARM_COMPUTE_RETURN_ERROR_ON_MSG(n0 < 1, "Only positive integers are supported for N0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((n0 & (n0 - 1)) && (n0 != 3)) || (n0 > 16), "Only 1,2,3,4,8,16 are supported for N0"); - - // Validate K0 - 
ARM_COMPUTE_RETURN_ERROR_ON_MSG(k0 < 1, "Only positive integers are supported for K0"); - if(!adj_lhs || adj_rhs) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((k0 & (k0 - 1)) && (k0 != 3)) || (k0 > 16), "Only 1,2,3,4,8,16 are supported for K0"); - } - - return Status{}; -} - -Status validate_input_shapes(const TensorShape &lhs_shape, const TensorShape &rhs_shape, const MatMulKernelInfo &matmul_kernel_info) -{ - const size_t lhs_k = matmul_kernel_info.adj_lhs ? lhs_shape.y() : lhs_shape.x(); - const size_t rhs_k = matmul_kernel_info.adj_rhs ? rhs_shape.x() : rhs_shape.y(); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_k != rhs_k, "K dimension in Lhs and Rhs matrices must match."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_shape.total_size() == 0, "Lhs tensor can't be empty"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_shape.total_size() == 0, "Rhs tensor can't be empty"); - - constexpr size_t batch_dim_start = 2; - for(size_t i = batch_dim_start; i < Coordinates::num_max_dimensions; ++i) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_shape[i] != rhs_shape[i], "Batch dimension broadcasting is not supported"); - } - - return Status{}; -} - -Status validate_export_to_cl_image(const ITensorInfo *rhs, const MatMulKernelInfo &matmul_kernel_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON(matmul_kernel_info.export_rhs_to_cl_image && rhs->lock_paddings()); - if(matmul_kernel_info.export_rhs_to_cl_image) - { - if(matmul_kernel_info.adj_rhs) - { - const int k0 = matmul_kernel_info.k0; - ARM_COMPUTE_RETURN_ERROR_ON_MSG(k0 != 4 && k0 != 8 && k0 != 16, "K0 can only be: 4, 8, and 16 for Rhs transposed"); - } - else - { - const int n0 = matmul_kernel_info.n0; - ARM_COMPUTE_RETURN_ERROR_ON_MSG(n0 != 4 && n0 != 8 && n0 != 16, "N0 can only be: 4, 8, and 16 for Rhs non-transposed"); - } - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!export_to_cl_image(rhs), "Export to CLImage is not supported for this device/configuration"); - } - - return Status {}; -} -} -ClNativeMatMulKernel::ClNativeMatMulKernel() -{ - _type = 
CLKernelType::GEMM; -} -Status ClNativeMatMulKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulKernelInfo &matmul_kernel_info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs); - ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_kernel_info(matmul_kernel_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_input_shapes(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_export_to_cl_image(rhs, matmul_kernel_info)); - - if(output->total_size() != 0) - { - const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, output); - } - - return Status{}; -} -void ClNativeMatMulKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *output, const MatMulKernelInfo &matmul_kernel_info) -{ - ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, output, &compile_context, &matmul_kernel_info); - ARM_COMPUTE_LOG_PARAMS(lhs, rhs, output, matmul_kernel_info); - ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, output, matmul_kernel_info)); - - // output tensor auto initialization if not yet initialized - auto_init_if_empty(*output, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info))); - - const int m = output->dimension(1); - const int n = output->dimension(0); - const int k = matmul_kernel_info.adj_lhs ? lhs->tensor_shape().y() : lhs->tensor_shape().x(); - const bool adj_lhs = matmul_kernel_info.adj_lhs; - - int m0 = adj_lhs ? 
adjust_vec_size(matmul_kernel_info.m0, m) : std::min(matmul_kernel_info.m0, m); - int n0 = adjust_vec_size(matmul_kernel_info.n0, n); - - _export_rhs_to_cl_image = matmul_kernel_info.export_rhs_to_cl_image && !rhs->lock_paddings(); - - // Configure kernel window - Window win = calculate_max_window(*output, Steps(n0, m0)); - win = win.collapse(win, Window::DimZ); - IClKernel::configure_internal(win); - - // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding. - const unsigned int partial_store_m0 = m % m0; - const unsigned int partial_store_n0 = n % n0; - - CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(lhs->data_type())); - build_opts.add_option("-DM0=" + support::cpp11::to_string(m0)); - build_opts.add_option("-DN0=" + support::cpp11::to_string(n0)); - build_opts.add_option("-DK0=" + support::cpp11::to_string(matmul_kernel_info.k0)); - build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); - build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); - build_opts.add_option("-DK=" + support::cpp11::to_string(k)); - build_opts.add_option_if_else(_export_rhs_to_cl_image, "-DRHS_TENSOR_TYPE=IMAGE", "-DRHS_TENSOR_TYPE=BUFFER"); - - std::string kernel_name("mat_mul_native"); - kernel_name += matmul_kernel_info.adj_lhs ? "_t" : "_nt"; - kernel_name += matmul_kernel_info.adj_rhs ?
"_t" : "_nt"; - - // A macro guard to compile ONLY the kernel of interest - build_opts.add_option("-D" + upper_string(kernel_name)); - - if(_export_rhs_to_cl_image) - { - gemm::update_padding_for_cl_image(rhs); - } - - // Create kernel - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - - // Set config_id for enabling LWS tuning - _config_id = kernel_name; - _config_id += "_"; - _config_id += lower_string(string_from_data_type(lhs->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(m); - _config_id += "_"; - _config_id += support::cpp11::to_string(n); - _config_id += "_"; - _config_id += support::cpp11::to_string(k); - _config_id += "_"; - _config_id += support::cpp11::to_string(output->dimension(2)); - _config_id += "_"; - _config_id += support::cpp11::to_string(_export_rhs_to_cl_image); - _config_id += "_"; - _config_id += support::cpp11::to_string(m0); - _config_id += "_"; - _config_id += support::cpp11::to_string(n0); - _config_id += "_"; - _config_id += support::cpp11::to_string(matmul_kernel_info.k0); -} - -void ClNativeMatMulKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) -{ - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - - const ICLTensor *lhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const ICLTensor *rhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - ICLTensor *output = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); - ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, output); - ARM_COMPUTE_LOG_PARAMS(lhs, rhs, output); - - unsigned int idx = 0; - Window window_collapsed = window.collapse(ICLKernel::window(), Window::DimZ); - - add_3d_tensor_nhw_argument(idx, lhs); - - cl::Image2D rhs_cl_image; - if(_export_rhs_to_cl_image) - { - const size_t image_w = rhs->info()->dimension(0) / 4; - const
size_t image_h = rhs->info()->tensor_shape().total_size() / rhs->info()->dimension(0); - const TensorShape shape2d(image_w, image_h); - const size_t image_row_pitch = rhs->info()->strides_in_bytes()[1]; - - // Export cl_buffer to cl_image - rhs_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), rhs->cl_buffer(), shape2d, rhs->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); - _kernel.setArg(idx++, rhs_cl_image); - } - - add_3d_tensor_nhw_argument(idx, rhs); - add_3d_tensor_nhw_argument(idx, output); - - enqueue(queue, *this, window_collapsed, lws_hint()); -} - -} // namespace kernels -} // namespace opencl -} // namespace arm_compute diff --git a/src/gpu/cl/kernels/ClNativeMatMulKernel.h b/src/gpu/cl/kernels/ClNativeMatMulKernel.h deleted file mode 100644 index 3d0f18ec84..0000000000 --- a/src/gpu/cl/kernels/ClNativeMatMulKernel.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright (c) 2023 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ACL_SRC_GPU_CL_KERNELS_CLNATIVEMATMULKERNEL -#define ACL_SRC_GPU_CL_KERNELS_CLNATIVEMATMULKERNEL - -#include "arm_compute/core/KernelDescriptors.h" -#include "src/core/common/Macros.h" -#include "src/gpu/cl/ClCompileContext.h" -#include "src/gpu/cl/IClKernel.h" - -namespace arm_compute -{ -namespace opencl -{ -namespace kernels -{ -class ClNativeMatMulKernel : public IClKernel -{ -public: - ClNativeMatMulKernel(); - ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClNativeMatMulKernel); - /** Initialise the kernel's input and output. - * - * @param[in] compile_context The compile context to be used. - * @param[in] lhs Input tensor for the LHS matrix. Data type supported: F32/F16. - * Dimensions above 2 are collapsed onto dimension 2 and represent the batch. - * @param[in] rhs Input tensor for the RHS matrix. Data type supported: same as @p lhs. - * Dimensions above 2 are collapsed onto dimension 2 and represent the batch. - * @param[out] output Output tensor info. 
Data type supported: same as @p lhs - * @param[in] matmul_info Attributes for Batch MatMul Kernel - */ - void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *output, const MatMulKernelInfo &matmul_info); - /** Static function to check if given info will lead to a valid configuration - * - * Similar to @ref ClNativeMatMulKernel::configure() - * - * @return a status - */ - static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulKernelInfo &matmul_info); - - // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; - -private: - bool _export_rhs_to_cl_image{ false }; -}; -} // namespace kernels -} // namespace opencl -} // namespace arm_compute -#endif /* ACL_SRC_GPU_CL_KERNELS_CLNATIVEMATMULKERNEL */ diff --git a/src/gpu/cl/operators/ClMatMul.cpp b/src/gpu/cl/operators/ClMatMul.cpp index dadaa1f779..3ad6d914c7 100644 --- a/src/gpu/cl/operators/ClMatMul.cpp +++ b/src/gpu/cl/operators/ClMatMul.cpp @@ -25,7 +25,7 @@ #include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "src/common/utils/Log.h" -#include "src/gpu/cl/kernels/ClNativeMatMulKernel.h" +#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h" namespace arm_compute { @@ -33,7 +33,7 @@ namespace opencl { using namespace arm_compute::opencl::kernels; ClMatMul::ClMatMul() - : _native_matmul_kernel(std::make_unique<ClNativeMatMulKernel>()) + : _native_matmul_kernel(std::make_unique<ClMatMulNativeKernel>()) { } ClMatMul::~ClMatMul() @@ -44,7 +44,7 @@ Status ClMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const MatMulKernelInfo kernel_info; kernel_info.adj_lhs = matmul_info.adj_lhs(); kernel_info.adj_rhs = matmul_info.adj_rhs(); - return ClNativeMatMulKernel::validate(lhs, rhs, output, kernel_info); + return ClMatMulNativeKernel::validate(lhs, rhs, output, kernel_info); } void ClMatMul::configure(const CLCompileContext
&compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *output, const MatMulInfo &matmul_info) { diff --git a/src/gpu/cl/operators/ClMatMul.h b/src/gpu/cl/operators/ClMatMul.h index 894b8d5816..20beda91ce 100644 --- a/src/gpu/cl/operators/ClMatMul.h +++ b/src/gpu/cl/operators/ClMatMul.h @@ -25,7 +25,7 @@ #define ARM_COMPUTE_SRC_GPU_CL_OPERATORS_ClMatMul #include "src/gpu/cl/IClOperator.h" -#include "src/gpu/cl/kernels/ClNativeMatMulKernel.h" +#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h" #include <memory> namespace arm_compute @@ -34,7 +34,7 @@ namespace opencl { /** Basic operator to execute BatchMatMul on OpenCL. This operator calls the following OpenCL kernels: * - * -# @ref kernels::ClNativeMatMulKernel + * -# @ref kernels::ClMatMulNativeKernel */ class ClMatMul : public IClOperator { @@ -77,7 +77,7 @@ public: void run(ITensorPack &tensors) override; private: - std::unique_ptr<kernels::ClNativeMatMulKernel> _native_matmul_kernel; + std::unique_ptr<kernels::ClMatMulNativeKernel> _native_matmul_kernel; }; } // namespace opencl } // namespace arm_compute diff --git a/tests/validation/CL/MatMulKernel.cpp b/tests/validation/CL/MatMulKernel.cpp index 962542400e..9c19e42d04 100644 --- a/tests/validation/CL/MatMulKernel.cpp +++ b/tests/validation/CL/MatMulKernel.cpp @@ -23,7 +23,7 @@ */ #include "arm_compute/runtime/CL/CLTensor.h" -#include "src/gpu/cl/kernels/ClNativeMatMulKernel.h" +#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h" #include "tests/datasets/LargeMatMulDataset.h" #include "tests/datasets/SmallMatMulDataset.h" #include "tests/framework/Macros.h" @@ -162,7 +162,7 @@ TEST_CASE(SupportedBlockSizes, framework::DatasetMode::ALL) for(auto &pair : supported_block_sizes) { TensorInfo output_info; - Status status = ClNativeMatMulKernel::validate(&lhs_info, &rhs_info, &output_info, pair.first); + Status status = ClMatMulNativeKernel::validate(&lhs_info, &rhs_info, &output_info, pair.first); if(!pair.first.export_rhs_to_cl_image || export_to_cl_image_supported) { @@ -219,7 +219,7 @@ TEST_CASE(ExportToCLImage,
framework::DatasetMode::ALL) const MatMulKernelInfo matmul_kernel_info {adj_lhs, adj_rhs, 4, 4, 4, true /* export_rhs_to_cl_image */}; TensorInfo output_info; - Status status = ClNativeMatMulKernel::validate(&lhs_info, &rhs_info, &output_info, matmul_kernel_info); + Status status = ClMatMulNativeKernel::validate(&lhs_info, &rhs_info, &output_info, matmul_kernel_info); const bool expected = std::get<4>(tuple); ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS); @@ -276,7 +276,7 @@ TEST_CASE(ValidateInputShapes, framework::DatasetMode::ALL) MatMulKernelInfo matmul_kernel_info{ adj_lhs, adj_rhs, 1, 1, 1, false /* export_rhs_to_cl_image */ }; - Status status = ClNativeMatMulKernel::validate(&lhs_info, &rhs_info, &output_info, matmul_kernel_info); + Status status = ClMatMulNativeKernel::validate(&lhs_info, &rhs_info, &output_info, matmul_kernel_info); ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS); } } @@ -319,7 +319,7 @@ TEST_CASE(ValidateDataTypes, framework::DatasetMode::ALL) const TensorInfo rhs_info(shape, 1, std::get<1>(tuple)); TensorInfo output_info(shape, 1, std::get<2>(tuple)); - Status status = ClNativeMatMulKernel::validate(&lhs_info, &rhs_info, &output_info, matmul_kernel_info); + Status status = ClMatMulNativeKernel::validate(&lhs_info, &rhs_info, &output_info, matmul_kernel_info); ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS); } } diff --git a/tests/validation/fixtures/MatMulKernelFixture.h b/tests/validation/fixtures/MatMulKernelFixture.h index c131fea7fa..10e2a0659a 100644 --- a/tests/validation/fixtures/MatMulKernelFixture.h +++ b/tests/validation/fixtures/MatMulKernelFixture.h @@ -25,7 +25,7 @@ #define ACL_TESTS_VALIDATION_FIXTURES_MATMULKERNELFIXTURE #include "arm_compute/core/KernelDescriptors.h" -#include "src/gpu/cl/kernels/ClNativeMatMulKernel.h" +#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h" #include "tests/CL/CLAccessor.h" #include "tests/CL/Helper.h" #include
"tests/framework/Fixture.h" @@ -101,7 +101,7 @@ protected: CLTensor b = create_tensor<CLTensor>(shape_b, data_type, 1); CLTensor dst = create_tensor<CLTensor>(output_shape, data_type, 1); - CLSynthetizeOperator<ClNativeMatMulKernel> matMul{}; + CLSynthetizeOperator<ClMatMulNativeKernel> matMul{}; MatMulKernelInfo matmul_info; matmul_info.adj_lhs = pretranspose_a; matmul_info.adj_rhs = pretranspose_b; -- cgit v1.2.1