diff options
Diffstat (limited to 'src/gpu/cl/kernels/gemm')
20 files changed, 813 insertions, 756 deletions
diff --git a/src/gpu/cl/kernels/gemm/ClGemmHelpers.cpp b/src/gpu/cl/kernels/gemm/ClGemmHelpers.cpp index 9350bf74bb..b5ebac3b49 100644 --- a/src/gpu/cl/kernels/gemm/ClGemmHelpers.cpp +++ b/src/gpu/cl/kernels/gemm/ClGemmHelpers.cpp @@ -39,14 +39,24 @@ namespace kernels { namespace gemm { -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_lhs_rhs_info(unsigned int m, unsigned int n, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0, - bool lhs_interleave, bool rhs_interleave, bool lhs_transpose, bool rhs_transpose, bool export_to_cl_image) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_lhs_rhs_info(unsigned int m, + unsigned int n, + unsigned int m0, + unsigned int n0, + unsigned int k0, + unsigned int v0, + unsigned int h0, + bool lhs_interleave, + bool rhs_interleave, + bool lhs_transpose, + bool rhs_transpose, + bool export_to_cl_image) { ARM_COMPUTE_ERROR_ON(m0 == 0 || n0 == 0); ARM_COMPUTE_ERROR_ON(v0 == 0); v0 = std::max(std::min(static_cast<int>(m / m0), static_cast<int>(v0)), static_cast<int>(1)); - if(h0 == 0) + if (h0 == 0) { // When h0 is 0, we should take the maximum H0 possible h0 = std::max(n / n0, 1U); @@ -62,17 +72,22 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_lhs_rhs_info(unsigned return std::make_pair(lhs_info, rhs_info); } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> select_lhs_rhs_info(std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_img, - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_buf, - unsigned int n, unsigned int k, unsigned int b, DataType data_type) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +select_lhs_rhs_info(std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_img, + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_buf, + unsigned int n, + unsigned int k, + unsigned int b, + DataType data_type) { - ARM_COMPUTE_ERROR_ON_MSG(info_buf.second.export_to_cl_image == true, "The fallback GeMM configuration cannot have export_to_cl_image = true"); + ARM_COMPUTE_ERROR_ON_MSG(info_buf.second.export_to_cl_image == true, + "The fallback GeMM configuration cannot have export_to_cl_image = true"); const TensorInfo tensor_rhs_info(TensorShape(n, k, b), 1, data_type); const TensorShape shape = misc::shape_calculator::compute_rhs_reshaped_shape(tensor_rhs_info, info_img.second); const TensorInfo tensor_reshaped_info(shape, 1, data_type); - if(bool(validate_image2d_support_on_rhs(tensor_reshaped_info, info_img.second))) + if (bool(validate_image2d_support_on_rhs(tensor_reshaped_info, info_img.second))) { return info_img; } @@ -90,42 +105,56 @@ void update_padding_for_cl_image(ITensorInfo *tensor) const unsigned int pixel_alignment = get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()); ARM_COMPUTE_ERROR_ON_MSG(pixel_alignment == 0, "Cannot retrieve cl_image pitch alignment"); - if(pixel_alignment == 0) + if (pixel_alignment == 0) { return; } const unsigned int row_pitch_alignment = pixel_alignment * num_floats_per_pixel; - const unsigned int round_up_width = ((stride_y_in_elements + row_pitch_alignment - 1) / row_pitch_alignment) * row_pitch_alignment; - const unsigned int padding = round_up_width - stride_y_in_elements; + const unsigned int round_up_width = + ((stride_y_in_elements + row_pitch_alignment - 1) / row_pitch_alignment) * row_pitch_alignment; + const unsigned int padding = round_up_width - stride_y_in_elements; tensor->extend_padding(PaddingSize(0, tensor->padding().right + padding, 0, 0)); } Status validate_image2d_support_on_rhs(const ITensorInfo &tensor_reshaped_info, const GEMMRHSMatrixInfo &rhs_info) { - if(rhs_info.export_to_cl_image) + if (rhs_info.export_to_cl_image) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 == 2) || (rhs_info.n0 == 3)) && rhs_info.transpose == false, "Export to cl_image only supported with n0 = 4, 8 or 16"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 == 2) || (rhs_info.k0 == 3)) && rhs_info.transpose == true, "Export to cl_image only supported with k0 = 4, 8 or 16"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 == 2) || (rhs_info.n0 == 3)) && rhs_info.transpose == false, + "Export to cl_image only supported with n0 = 4, 8 or 16"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 == 2) || (rhs_info.k0 == 3)) && rhs_info.transpose == true, + "Export to cl_image only supported with k0 = 4, 8 or 16"); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(&tensor_reshaped_info, DataType::F32, DataType::F16); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!image2d_from_buffer_supported(CLKernelLibrary::get().get_device()), "The extension cl_khr_image2d_from_buffer is not supported on the target platform"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0, "Impossible to retrieve the cl_image pitch alignment"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !image2d_from_buffer_supported(CLKernelLibrary::get().get_device()), + "The extension cl_khr_image2d_from_buffer is not supported on the target platform"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0, + "Impossible to retrieve the cl_image pitch alignment"); // Check the width and height of the output tensor. // Since we cannot create a 3d image from a buffer, the third dimension is collapsed on the second dimension const size_t max_image_w = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>(); const size_t max_image_h = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>(); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(tensor_reshaped_info.tensor_shape()[0] > max_image_w * 4, "Not supported width for cl_image"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(tensor_reshaped_info.tensor_shape()[1] * tensor_reshaped_info.tensor_shape()[2] > max_image_h, "Not supported height for cl_image"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(tensor_reshaped_info.tensor_shape()[0] > max_image_w * 4, + "Not supported width for cl_image"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + tensor_reshaped_info.tensor_shape()[1] * tensor_reshaped_info.tensor_shape()[2] > max_image_h, + "Not supported height for cl_image"); } return Status{}; } -bool is_mmul_kernel_preferred(const unsigned int m, const unsigned int n, const unsigned int k, const unsigned int b, - const DataType data_type, unsigned int &best_m0, unsigned int &best_n0) +bool is_mmul_kernel_preferred(const unsigned int m, + const unsigned int n, + const unsigned int k, + const unsigned int b, + const DataType data_type, + unsigned int &best_m0, + unsigned int &best_n0) { ARM_COMPUTE_UNUSED(n, k, b, data_type); @@ -141,7 +170,8 @@ bool is_mmul_kernel_preferred(const unsigned int m, const unsigned int n, const return ((k % mmul_k0) == 0) && (gws_y > 4); } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> find_lhs_rhs_info(const GeMMConfigsMatrix &configs, unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +find_lhs_rhs_info(const GeMMConfigsMatrix &configs, unsigned int m, unsigned int n, unsigned int k, unsigned int b) { size_t min_acc = std::numeric_limits<size_t>::max(); size_t min_idx = 0; @@ -150,12 +180,13 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> find_lhs_rhs_info(const GeMMConf const size_t num_rows = configs.size(); const size_t num_cols = configs[0].size(); - ARM_COMPUTE_ERROR_ON_MSG(num_cols != 14U, "The entry should have 14 integer values representing: M, N, K, B, M0, N0. K0, V0, H0, INT_LHS, INT_RHS, TRA_LHS, TRA_RHS, IMG_RHS"); + ARM_COMPUTE_ERROR_ON_MSG(num_cols != 14U, "The entry should have 14 integer values representing: M, N, K, B, M0, " + "N0. K0, V0, H0, INT_LHS, INT_RHS, TRA_LHS, TRA_RHS, IMG_RHS"); ARM_COMPUTE_UNUSED(num_cols); // Find nearest GeMM workload // Note: the workload does not depend on the K dimension - for(size_t y = 0; y < num_rows; ++y) + for (size_t y = 0; y < num_rows; ++y) { size_t mc0 = static_cast<size_t>(configs[y][0]); size_t nc0 = static_cast<size_t>(configs[y][1]); @@ -168,7 +199,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> find_lhs_rhs_info(const GeMMConf acc += (k - kc0) * (k - kc0); acc += (b - bc0) * (b - bc0); acc = std::sqrt(acc); - if(acc < min_acc) + if (acc < min_acc) { min_acc = acc; min_idx = y; diff --git a/src/gpu/cl/kernels/gemm/ClGemmHelpers.h b/src/gpu/cl/kernels/gemm/ClGemmHelpers.h index 6689b10e69..84776fb207 100644 --- a/src/gpu/cl/kernels/gemm/ClGemmHelpers.h +++ b/src/gpu/cl/kernels/gemm/ClGemmHelpers.h @@ -54,8 +54,18 @@ using GeMMConfigsMatrix = std::vector<std::vector<int32_t>>; * * @return @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo */ -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_lhs_rhs_info(unsigned int m, unsigned int n, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0, - bool lhs_interleave, bool rhs_interleave, bool lhs_transpose, bool rhs_transpose, bool export_to_cl_image = false); +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_lhs_rhs_info(unsigned int m, + unsigned int n, + unsigned int m0, + unsigned int n0, + unsigned int k0, + unsigned int v0, + unsigned int h0, + bool lhs_interleave, + bool rhs_interleave, + bool lhs_transpose, + bool rhs_transpose, + bool export_to_cl_image = false); /** Select @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo * @@ -72,9 +82,13 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_lhs_rhs_info(unsigned * * @return @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo */ -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> select_lhs_rhs_info(std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_img, - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_buf, - unsigned int n, unsigned int k, unsigned int b, DataType data_type); +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +select_lhs_rhs_info(std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_img, + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_buf, + unsigned int n, + unsigned int k, + unsigned int b, + DataType data_type); /** Update padding required to export the OpenCL buffer to OpenCL image2d * @@ -103,8 +117,13 @@ Status validate_image2d_support_on_rhs(const ITensorInfo &tensor_reshaped_info, * * @return true if MMUL kernel is preferred over kernels w/o MMUL, false otherwise */ -bool is_mmul_kernel_preferred(const unsigned int m, const unsigned int n, const unsigned int k, const unsigned int b, - const DataType data_type, unsigned int &best_m0, unsigned int &best_n0); +bool is_mmul_kernel_preferred(const unsigned int m, + const unsigned int n, + const unsigned int k, + const unsigned int b, + const DataType data_type, + unsigned int &best_m0, + unsigned int &best_n0); /** Find the preferred configurations for the LHS and RHS tensor using the GeMMConfigsMatrix provided by the user * @@ -116,7 +135,8 @@ bool is_mmul_kernel_preferred(const unsigned int m, const unsigned int n, const * * @return @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo */ -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> find_lhs_rhs_info(const GeMMConfigsMatrix &configs, unsigned int m, unsigned int n, unsigned int k, unsigned int b); +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +find_lhs_rhs_info(const GeMMConfigsMatrix &configs, unsigned int m, unsigned int n, unsigned int k, unsigned int b); } // namespace gemm } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h b/src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h index a49836cfda..9d08633963 100644 --- a/src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h +++ b/src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h @@ -26,6 +26,7 @@ #include "arm_compute/core/GPUTarget.h" #include "arm_compute/core/Types.h" + #include "src/core/common/Macros.h" #include <array> @@ -56,8 +57,7 @@ public: * @param[in] func_int8 Function to call for GEMM Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL) * */ - CLGEMMConfigArray(T func_f32, T func_f16, T func_int8) - : _configs{ func_f32, func_f16, func_int8 } + CLGEMMConfigArray(T func_f32, T func_f16, T func_int8) : _configs{func_f32, func_f16, func_int8} { } @@ -69,7 +69,7 @@ public: */ T get_function(DataType data_type) { - switch(data_type) + switch (data_type) { case DataType::F32: return _configs.at(DT_F32); @@ -96,8 +96,7 @@ public: * * @param[in] arch GPU target */ - IClGemmKernelConfig(GPUTarget arch) - : _target(arch) + IClGemmKernelConfig(GPUTarget arch) : _target(arch) { } ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClGemmKernelConfig); @@ -111,7 +110,8 @@ public: * @param[in] b Batch size * @param[in] data_type Data type */ - virtual std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) = 0; + virtual std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) = 0; protected: GPUTarget _target; diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp index d74c7fac9b..2f37eef31f 100644 --- a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp +++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/GPUTarget.h" + #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include <utility> @@ -38,31 +39,34 @@ namespace kernels { namespace gemm { -ClGemmDefaultConfigNativeBifrost::ClGemmDefaultConfigNativeBifrost(GPUTarget gpu) - : IClGemmKernelConfig(gpu) +ClGemmDefaultConfigNativeBifrost::ClGemmDefaultConfigNativeBifrost(GPUTarget gpu) : IClGemmKernelConfig(gpu) { } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) { - using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (ClGemmDefaultConfigNativeBifrost::*)(unsigned int m, unsigned int n, unsigned int k, - unsigned int b); + using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ( + ClGemmDefaultConfigNativeBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G71(&ClGemmDefaultConfigNativeBifrost::configure_G71_f32, - &ClGemmDefaultConfigNativeBifrost::configure_G71_f32, // We use the F32 heuristic - &ClGemmDefaultConfigNativeBifrost::configure_G71_u8); + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G71( + &ClGemmDefaultConfigNativeBifrost::configure_G71_f32, + &ClGemmDefaultConfigNativeBifrost::configure_G71_f32, // We use the F32 heuristic + &ClGemmDefaultConfigNativeBifrost::configure_G71_u8); - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G76(&ClGemmDefaultConfigNativeBifrost::configure_G76_f32, - &ClGemmDefaultConfigNativeBifrost::configure_G76_f32, // We use the F32 heuristic - &ClGemmDefaultConfigNativeBifrost::configure_G76_u8); + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G76( + &ClGemmDefaultConfigNativeBifrost::configure_G76_f32, + &ClGemmDefaultConfigNativeBifrost::configure_G76_f32, // We use the F32 heuristic + &ClGemmDefaultConfigNativeBifrost::configure_G76_u8); - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x(&ClGemmDefaultConfigNativeBifrost::configure_default_f32, - &ClGemmDefaultConfigNativeBifrost::configure_default_f32, // We use the F32 heuristic - &ClGemmDefaultConfigNativeBifrost::configure_default_u8); + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x( + &ClGemmDefaultConfigNativeBifrost::configure_default_f32, + &ClGemmDefaultConfigNativeBifrost::configure_default_f32, // We use the F32 heuristic + &ClGemmDefaultConfigNativeBifrost::configure_default_u8); ConfigurationFunctionExecutorPtr func = nullptr; - switch(_target) + switch (_target) { case GPUTarget::G76: func = configs_G76.get_function(data_type); @@ -79,18 +83,19 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost return (this->*func)(m, n, k, b); } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure_G71_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigNativeBifrost::configure_G71_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { - if(n < 2048) + if (n < 2048) { return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false); } - else if(n >= 2048 && n < 8192) + else if (n >= 2048 && n < 8192) { return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 1, false, false, false, false); } @@ -105,20 +110,21 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure_G71_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigNativeBifrost::configure_G71_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(dot8_supported(CLKernelLibrary::get().get_device())) + if (dot8_supported(CLKernelLibrary::get().get_device())) { - if(m == 1) + if (m == 1) { - if(n < 2048) + if (n < 2048) { return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 1, false, false, false, false); } - else if(n >= 2048 && n < 16384) + else if (n >= 2048 && n < 16384) { return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false); } @@ -129,7 +135,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost } else { - if(m < 64) + if (m < 64) { return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, 1, false, false, false, false); } @@ -141,9 +147,9 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost } else { - if(m == 1) + if (m == 1) { - if(n < 8192) + if (n < 8192) { return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false); } @@ -159,24 +165,25 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigNativeBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { - if(n > 4196) + if (n > 4196) { return configure_lhs_rhs_info(m, n, 1, 4, 2, 1, 1, false, false, false, false); } else { - if(k < 2048) + if (k < 2048) { return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 1, false, false, false, false); } - else if(k >= 2048 && k < 16384) + else if (k >= 2048 && k < 16384) { return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false); } @@ -192,18 +199,19 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigNativeBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { - if(n < 2048) + if (n < 2048) { return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 1, false, false, false, false); } - else if(n >= 2048 && n < 16384) + else if (n >= 2048 && n < 16384) { return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false); } @@ -214,7 +222,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost } else { - if(m < 64) + if (m < 64) { return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, 1, false, false, false, false); } @@ -225,7 +233,8 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure_default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigNativeBifrost::configure_default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); @@ -233,7 +242,8 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost return configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 1, false, false, false, false); } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure_default_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigNativeBifrost::configure_default_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); @@ -243,4 +253,4 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost } // namespace gemm } // namespace kernels } // namespace opencl -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h index 9af5dc4135..f822daae53 100644 --- a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h +++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h @@ -45,15 +45,22 @@ public: ClGemmDefaultConfigNativeBifrost(GPUTarget gpu); // Inherited overridden method - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; private: - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G71_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G71_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_default_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G71_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G71_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_default_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); }; } // namespace gemm } // namespace kernels diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp index b9f36c7210..f87fb1b659 100644 --- a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp +++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/GPUTarget.h" + #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include <utility> @@ -38,18 +39,17 @@ namespace kernels { namespace gemm { -ClGemmDefaultConfigNativeMidgard::ClGemmDefaultConfigNativeMidgard(GPUTarget gpu) - : IClGemmKernelConfig(gpu) +ClGemmDefaultConfigNativeMidgard::ClGemmDefaultConfigNativeMidgard(GPUTarget gpu) : IClGemmKernelConfig(gpu) { } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeMidgard::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeMidgard::configure( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) { - using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (ClGemmDefaultConfigNativeMidgard::*)(unsigned int m, unsigned int n, unsigned int k, - unsigned int b); + using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ( + ClGemmDefaultConfigNativeMidgard::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_default(nullptr, - nullptr, + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_default(nullptr, nullptr, &ClGemmDefaultConfigNativeMidgard::default_q8); auto func = configs_default.get_function(data_type); @@ -57,7 +57,8 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeMidgard return (this->*func)(m, n, k, b); } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeMidgard::default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigNativeMidgard::default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); @@ -70,4 +71,4 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeMidgard } // namespace gemm } // namespace kernels } // namespace opencl -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h index c055753c48..fa76c5dba7 100644 --- a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h +++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h @@ -45,10 +45,12 @@ public: ClGemmDefaultConfigNativeMidgard(GPUTarget gpu); // Inherited overridden method - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; private: - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); }; } // namespace gemm } // namespace kernels diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp index 95a4d2bd69..97a1298b0a 100644 --- a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp +++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/GPUTarget.h" + #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include <utility> @@ -38,37 +39,38 @@ namespace kernels { namespace gemm { -ClGemmDefaultConfigNativeValhall::ClGemmDefaultConfigNativeValhall(GPUTarget gpu) - : IClGemmKernelConfig(gpu) +ClGemmDefaultConfigNativeValhall::ClGemmDefaultConfigNativeValhall(GPUTarget gpu) : IClGemmKernelConfig(gpu) { } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall::configure( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) { - using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (ClGemmDefaultConfigNativeValhall::*)(unsigned int m, unsigned int n, unsigned int k, - unsigned int b); + using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ( + ClGemmDefaultConfigNativeValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_default(&ClGemmDefaultConfigNativeValhall::configure_G77_f32, - &ClGemmDefaultConfigNativeValhall::configure_G77_f16, - &ClGemmDefaultConfigNativeValhall::configure_G77_u8); + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_default( + &ClGemmDefaultConfigNativeValhall::configure_G77_f32, &ClGemmDefaultConfigNativeValhall::configure_G77_f16, + &ClGemmDefaultConfigNativeValhall::configure_G77_u8); auto func = configs_default.get_function(data_type); ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not support for GEMM"); return (this->*func)(m, n, k, b); } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigNativeValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { - if(n < 2048) + if (n < 2048) { return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false); } - else if(n >= 2048 && n < 8192) + else if (n >= 2048 && n < 8192) { return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 1, false, false, false, false); } @@ -83,18 +85,19 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigNativeValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { - if(n < 2048) + if (n < 2048) { return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false); } - else if(n >= 2048 && n < 8192) + else if (n >= 2048 && n < 8192) { return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 1, false, false, false, false); } @@ -109,20 +112,21 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigNativeValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(dot8_supported(CLKernelLibrary::get().get_device())) + if (dot8_supported(CLKernelLibrary::get().get_device())) { - if(m == 1) + if (m == 1) { - if(n < 2048) + if (n < 2048) { return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 1, false, false, false, false); } - else if(n >= 2048 && n < 16384) + else if (n >= 2048 && n < 16384) { return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false); } @@ -133,7 +137,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall } else { - if(m < 64) + if (m < 64) { return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, 1, false, false, false, false); } @@ -145,9 +149,9 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall } else { - if(m == 1) + if (m == 1) { - if(n < 8192) + if (n < 8192) { return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false); } @@ -165,4 +169,4 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall } // namespace gemm } // namespace kernels } // namespace opencl -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h index f0f812fd46..c91b095279 100644 --- a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h +++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h @@ -45,12 +45,16 @@ public: ClGemmDefaultConfigNativeValhall(GPUTarget gpu); // Inherited overridden method - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; private: - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); }; } // namespace gemm } // namespace kernels diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h b/src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h index cf8412830b..955bb3c01a 100644 --- a/src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h +++ b/src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h @@ -51,7 +51,7 @@ public: */ static std::unique_ptr<IClGemmKernelConfig> create(GPUTarget gpu) { - switch(get_arch_from_target(gpu)) + switch (get_arch_from_target(gpu)) { case GPUTarget::MIDGARD: return std::make_unique<ClGemmDefaultConfigNativeMidgard>(gpu); diff --git a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp index 657018eb53..c956c347ef 100644 --- a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp +++ b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp @@ -29,6 +29,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include <utility> @@ -43,30 +44,31 @@ namespace gemm { using namespace arm_compute::misc::shape_calculator; -ClGemmDefaultConfigReshapedBifrost::ClGemmDefaultConfigReshapedBifrost(GPUTarget gpu) - : IClGemmKernelConfig(gpu) +ClGemmDefaultConfigReshapedBifrost::ClGemmDefaultConfigReshapedBifrost(GPUTarget gpu) : IClGemmKernelConfig(gpu) { } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) { - using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (ClGemmDefaultConfigReshapedBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ( + ClGemmDefaultConfigReshapedBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x(&ClGemmDefaultConfigReshapedBifrost::configure_G7x_f32, - &ClGemmDefaultConfigReshapedBifrost::configure_G7x_f16, - &ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8); + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x( + &ClGemmDefaultConfigReshapedBifrost::configure_G7x_f32, &ClGemmDefaultConfigReshapedBifrost::configure_G7x_f16, + &ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8); - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G52(&ClGemmDefaultConfigReshapedBifrost::configure_G52_f32, - &ClGemmDefaultConfigReshapedBifrost::configure_G52_f16, - &ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8); + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G52( + &ClGemmDefaultConfigReshapedBifrost::configure_G52_f32, &ClGemmDefaultConfigReshapedBifrost::configure_G52_f16, + &ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8); - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G76(&ClGemmDefaultConfigReshapedBifrost::configure_G76_f32, - &ClGemmDefaultConfigReshapedBifrost::configure_G76_f16, - &ClGemmDefaultConfigReshapedBifrost::configure_G76_u8); + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G76( + &ClGemmDefaultConfigReshapedBifrost::configure_G76_f32, &ClGemmDefaultConfigReshapedBifrost::configure_G76_f16, + &ClGemmDefaultConfigReshapedBifrost::configure_G76_u8); ConfigurationFunctionExecutorPtr func = nullptr; - switch(_target) + switch (_target) { case GPUTarget::G76: func = configs_G76.get_function(data_type); @@ -83,12 +85,13 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifro return (this->*func)(m, n, k, b); } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigReshapedBifrost::configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(n <= 4) + if (n <= 4) { return configure_lhs_rhs_info(m, n, 4, 2, 8, 16, 16, true, false, false, true); } @@ -98,12 +101,13 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifro } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigReshapedBifrost::configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(n <= 4) + if (n <= 4) { return configure_lhs_rhs_info(m, n, 4, 2, 8, 8, 2, true, true, true, false); } @@ -113,14 +117,15 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifro } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(dot8_supported(CLKernelLibrary::get().get_device())) + if (dot8_supported(CLKernelLibrary::get().get_device())) { - if(n <= 4) + if (n <= 4) { return configure_lhs_rhs_info(m, n, 4, 2, 16, 2, 2, true, false, false, true); } @@ -131,7 +136,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifro } else { - if(n <= 4) + if (n <= 4) { return configure_lhs_rhs_info(m, n, 4, 2, 8, 2, 2, true, false, false, true); } @@ -142,7 +147,8 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifro } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigReshapedBifrost::configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { const float r_mn = static_cast<float>(m) / static_cast<float>(n); const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f; @@ -154,100 +160,108 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifro GEMMLHSMatrixInfo lhs_info_img; GEMMRHSMatrixInfo rhs_info_img; - if(workload <= 274.4000f) + if (workload <= 274.4000f) { - if(r_nk <= 0.7461f) + if (r_nk <= 0.7461f) { - if(r_mn <= 21.1667f) + if (r_mn <= 21.1667f) { return configure_lhs_rhs_info(m, n, 4, 2, 4, 4, 4, false, true, true, false, false); } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } } else { - if(r_mk <= 17.3926f) + if (r_mk <= 17.3926f) { - if(workload <= 542.4000f) + if (workload <= 542.4000f) { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, false); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } } else { - if(r_nk <= 0.5463f) + if (r_nk <= 0.5463f) { - if(workload <= 11767.6001f) + if (workload <= 11767.6001f) { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, false); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } } } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigReshapedBifrost::configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f; - if(workload <= 323.4000f) + if (workload <= 323.4000f) { return configure_lhs_rhs_info(m, n, 2, 2, 8, 4, 8, false, false, false, true, false); } @@ -257,7 +271,8 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifro } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigReshapedBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); @@ -268,7 +283,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifro GEMMRHSMatrixInfo rhs_info_img; // Get lhs_info/rhs_info in case of OpenCL buffer - if(n <= 4) + if (n <= 4) { std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 2, 8, 16, 16, true, false, false, true); } @@ -279,15 +294,17 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifro // Get lhs_info/rhs_info in case of OpenCL image // Condition on the GPU workload - if((m / 4) * (n / 4) >= 2560) + if ((m / 4) * (n / 4) >= 2560) { // Big workload - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 8, true, true, true, false, true); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 8, true, true, true, false, true); } else { // Small workload - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 1, true, true, true, false, true); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 1, true, true, true, false, true); } const TensorInfo tensor_rhs_info(TensorShape(n, k, b), 1, DataType::F32); @@ -297,7 +314,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifro // In case of vector by matrix with few work-items, we use the OpenCL buffer rather than the OpenCL image2d const bool use_cl_image2d = (n <= 4) ? false : true; - if(bool(validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info_img)) && use_cl_image2d) + if (bool(validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info_img)) && use_cl_image2d) { return std::make_pair(lhs_info_img, rhs_info_img); } @@ -307,16 +324,17 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifro } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigReshapedBifrost::configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f; const float r_mk = static_cast<float>(m) / static_cast<float>(k); - if(workload <= 1595.2000f) + if (workload <= 1595.2000f) { - if(r_mk <= 2.1044f) + if (r_mk <= 2.1044f) { - if(workload <= 870.4000f) + if (workload <= 870.4000f) { return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 2, true, false, true, false, false); } @@ -336,12 +354,13 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifro } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigReshapedBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(n <= 4) + if (n <= 4) { return configure_lhs_rhs_info(m, n, 4, 2, 16, 4, 1, false, false, false, true); } diff --git a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h index d86d1ba0a7..9227ec2551 100644 --- a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h +++ b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h @@ -45,17 +45,26 @@ public: ClGemmDefaultConfigReshapedBifrost(GPUTarget gpu); // Inherited overridden method - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; private: - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); }; } // namespace gemm } // namespace kernels diff --git a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp index 58d0873b86..70b324eb5a 100644 --- a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp +++ b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/GPUTarget.h" + #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include <utility> @@ -38,26 +39,27 @@ namespace kernels { namespace gemm { -ClGemmDefaultConfigReshapedValhall::ClGemmDefaultConfigReshapedValhall(GPUTarget gpu) - : IClGemmKernelConfig(gpu) +ClGemmDefaultConfigReshapedValhall::ClGemmDefaultConfigReshapedValhall(GPUTarget gpu) : IClGemmKernelConfig(gpu) { } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValhall::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValhall::configure( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) { - using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (ClGemmDefaultConfigReshapedValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ( + ClGemmDefaultConfigReshapedValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G77(&ClGemmDefaultConfigReshapedValhall::configure_G77_f32, - &ClGemmDefaultConfigReshapedValhall::configure_G77_f16, - &ClGemmDefaultConfigReshapedValhall::configure_G77_u8); + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G77( + &ClGemmDefaultConfigReshapedValhall::configure_G77_f32, &ClGemmDefaultConfigReshapedValhall::configure_G77_f16, + &ClGemmDefaultConfigReshapedValhall::configure_G77_u8); - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G78(&ClGemmDefaultConfigReshapedValhall::configure_G78_f32, - &ClGemmDefaultConfigReshapedValhall::configure_G78_f16, - &ClGemmDefaultConfigReshapedValhall::configure_G77_u8); + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G78( + &ClGemmDefaultConfigReshapedValhall::configure_G78_f32, &ClGemmDefaultConfigReshapedValhall::configure_G78_f16, + &ClGemmDefaultConfigReshapedValhall::configure_G77_u8); ConfigurationFunctionExecutorPtr func = nullptr; - switch(_target) + switch (_target) { case GPUTarget::G78: func = configs_G78.get_function(data_type); @@ -72,12 +74,13 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha return (this->*func)(m, n, k, b); } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigReshapedValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(n <= 4) + if (n <= 4) { return configure_lhs_rhs_info(m, n, 4, 2, 8, 16, 16, 1, 0, 0, 1); } @@ -87,7 +90,8 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigReshapedValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); @@ -104,17 +108,17 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 0); - if(r_mk <= 0.11824845522642136) + if (r_mk <= 0.11824845522642136) { - if(workload <= 880.0) + if (workload <= 880.0) { return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 4, 0, 0, 1, 0, 0); } else { - if(r_nk <= 0.42521367967128754) + if (r_nk <= 0.42521367967128754) { - if(workload <= 1726.4000244140625) + if (workload <= 1726.4000244140625) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 0); } @@ -123,13 +127,12 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } } else { - if(workload <= 1241.6000366210938) + if (workload <= 1241.6000366210938) { return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 4, 0, 0, 1, 0, 0); } @@ -142,17 +145,16 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } else { - if(workload <= 11404.7998046875) + if (workload <= 11404.7998046875) { - if(r_mk <= 1.0126488208770752) + if (r_mk <= 1.0126488208770752) { - if(r_mn <= 2.545312523841858) + if (r_mn <= 2.545312523841858) { std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } else { @@ -161,43 +163,39 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } else { - if(workload <= 2881.199951171875) + if (workload <= 2881.199951171875) { std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, 0, 0, 1, 0, 1); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } else { std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } } } else { - if(r_nk <= 0.5765306055545807) + if (r_nk <= 0.5765306055545807) { - if(r_mn <= 6.010416746139526) + if (r_mn <= 6.010416746139526) { std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } else { std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 1, 0, 1, 0, 1); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } } else @@ -205,27 +203,27 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 1, 0, 1, 0, 1); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } } } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValhall::configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigReshapedValhall::configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { const float r_mn = static_cast<float>(m) / static_cast<float>(n); const float r_mk = static_cast<float>(m) / static_cast<float>(k); const float r_nk = static_cast<float>(n) / static_cast<float>(k); const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f; - if(workload <= 1288.0000f) + if (workload <= 1288.0000f) { - if(workload <= 505.6000f) + if (workload <= 505.6000f) { - if(r_mn <= 0.4466f) + if (r_mn <= 0.4466f) { - if(r_nk <= 0.2384f) + if (r_nk <= 0.2384f) { return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1); } @@ -241,9 +239,9 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } else { - if(r_mn <= 0.2250f) + if (r_mn <= 0.2250f) { - if(r_mn <= 0.1599f) + if (r_mn <= 0.1599f) { return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1); } @@ -254,11 +252,11 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } else { - if(r_mk <= 0.7609f) + if (r_mk <= 0.7609f) { - if(r_mn <= 2.5453f) + if (r_mn <= 2.5453f) { - if(workload <= 1089.6000f) + if (workload <= 1089.6000f) { return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1); } @@ -281,29 +279,29 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } else { - if(workload <= 5434.4001f) + if (workload <= 5434.4001f) { - if(workload <= 1603.2000f) + if (workload <= 1603.2000f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); } else { - if(r_nk <= 0.6192f) + if (r_nk <= 0.6192f) { - if(r_mn <= 16.1016f) + if (r_mn <= 16.1016f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); } else { - if(workload <= 2750.0000f) + if (workload <= 2750.0000f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); } else { - if(r_mk <= 6.3151f) + if (r_mk <= 6.3151f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 0, 1, 1); } @@ -316,15 +314,15 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } else { - if(r_mk <= 0.0387f) + if (r_mk <= 0.0387f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 1); } else { - if(r_mk <= 2.5859f) + if (r_mk <= 2.5859f) { - if(r_mk <= 0.2734f) + if (r_mk <= 0.2734f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 1); } @@ -343,13 +341,13 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } else { - if(r_mk <= 25.7500f) + if (r_mk <= 25.7500f) { - if(r_mk <= 0.3615f) + if (r_mk <= 0.3615f) { - if(r_mn <= 0.0913f) + if (r_mn <= 0.0913f) { - if(r_mk <= 0.0683f) + if (r_mk <= 0.0683f) { return configure_lhs_rhs_info(m, n, 8, 4, 4, 4, 2, 0, 0, 1, 0, 1); } @@ -365,15 +363,15 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } else { - if(workload <= 11174.3999f) + if (workload <= 11174.3999f) { - if(r_mk <= 0.8047f) + if (r_mk <= 0.8047f) { return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1); } else { - if(workload <= 7185.5999f) + if (workload <= 7185.5999f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 1); } @@ -385,9 +383,9 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } else { - if(workload <= 17917.5000f) + if (workload <= 17917.5000f) { - if(r_mk <= 1.5078f) + if (r_mk <= 1.5078f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); } @@ -398,7 +396,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } else { - if(workload <= 34449.6016f) + if (workload <= 34449.6016f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); } @@ -412,11 +410,11 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } else { - if(r_mk <= 331.1111f) + if (r_mk <= 331.1111f) { - if(workload <= 53397.5996f) + if (workload <= 53397.5996f) { - if(r_mn <= 57.8063f) + if (r_mn <= 57.8063f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); } @@ -427,7 +425,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } else { - if(r_nk <= 0.9211f) + if (r_nk <= 0.9211f) { return configure_lhs_rhs_info(m, n, 8, 4, 4, 4, 2, 0, 0, 1, 0, 1); } @@ -439,7 +437,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } else { - if(workload <= 38070.4004f) + if (workload <= 38070.4004f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 0, 1, 1); } @@ -453,27 +451,28 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValhall::configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigReshapedValhall::configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { const float r_mn = static_cast<float>(m) / static_cast<float>(n); const float r_nk = static_cast<float>(n) / static_cast<float>(k); const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f; - if(workload <= 801.6000f) + if (workload <= 801.6000f) { return configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, 0, 0, 1, 0, 1); } else { - if(r_mn <= 0.1211f) + if (r_mn <= 0.1211f) { - if(workload <= 3296.0000f) + if (workload <= 3296.0000f) { return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1); } else { - if(r_nk <= 1.0625f) + if (r_nk <= 1.0625f) { return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1); } @@ -485,15 +484,15 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } else { - if(workload <= 5068.8000f) + if (workload <= 5068.8000f) { return configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, 0, 0, 1, 0, 1); } else { - if(r_nk <= 0.2361f) + if (r_nk <= 0.2361f) { - if(workload <= 12630.0000f) + if (workload <= 12630.0000f) { return configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, 0, 0, 1, 0, 1); } @@ -504,7 +503,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } else { - if(workload <= 178790.3984f) + if (workload <= 178790.3984f) { return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1); } @@ -518,12 +517,13 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigReshapedValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(n <= 4) + if (n <= 4) { return configure_lhs_rhs_info(m, n, 4, 2, 16, 4, 1, 0, 0, 0, 1); } diff --git a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h index 466eda00a6..5f62efb59e 100644 --- a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h +++ b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h @@ -45,14 +45,20 @@ public: ClGemmDefaultConfigReshapedValhall(GPUTarget gpu); // Inherited overridden method - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; private: - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); }; } // namespace gemm } // namespace kernels diff --git a/src/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h b/src/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h index 1c32f1358b..83928b3f4f 100644 --- a/src/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h +++ b/src/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h @@ -50,7 +50,7 @@ public: */ static std::unique_ptr<IClGemmKernelConfig> create(GPUTarget gpu) { - switch(get_arch_from_target(gpu)) + switch (get_arch_from_target(gpu)) { case GPUTarget::MIDGARD: case GPUTarget::BIFROST: diff --git a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp index 9c23d9c998..c4825bfbeb 100644 --- a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp +++ b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp @@ -29,7 +29,9 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" + #include <utility> namespace arm_compute @@ -47,33 +49,39 @@ ClGemmDefaultConfigReshapedRhsOnlyBifrost::ClGemmDefaultConfigReshapedRhsOnlyBif { } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) { - using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (ClGemmDefaultConfigReshapedRhsOnlyBifrost::*)(unsigned int m, unsigned int n, unsigned int k, - unsigned int b); - - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G51(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f32, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f16, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_u8); - - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G52(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f32, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f16, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8); - - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G31(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G31_u8); - - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G76(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f32, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f16, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_u8); - - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8); + using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ( + ClGemmDefaultConfigReshapedRhsOnlyBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G51( + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f32, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f16, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_u8); + + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G52( + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f32, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f16, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8); + + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G31( + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G31_u8); + + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G76( + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f32, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f16, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_u8); + + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x( + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8); ConfigurationFunctionExecutorPtr func = nullptr; - switch(_target) + switch (_target) { case GPUTarget::G76: func = configs_G76.get_function(data_type); @@ -96,14 +104,15 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn return (this->*func)(m, n, k, b); } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { - if(n <= 2548) + if (n <= 2548) { return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 4, false, true, false, true, false); } @@ -118,12 +127,13 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G31_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G31_u8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { const unsigned int h0 = std::max(n / 2, 1U); return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, h0, 0, 1, 0, 1); @@ -131,7 +141,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn else { const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(256)), static_cast<int>(1)); - if(m >= 28) + if (m >= 28) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, 0, 1, 0, 1); } @@ -142,7 +152,8 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); @@ -154,9 +165,9 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn const bool is_workload_big = ((m * n * b) / 16) >= 2048; - if(m == 1) + if (m == 1) { - if(n >= 8192) + if (n >= 8192) { const unsigned int h0 = std::max(n / 4, 1U); return configure_lhs_rhs_info(m, n, 1, 4, 8, 1, h0, false, true, false, true, false); @@ -164,7 +175,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn else { const unsigned int h0 = std::max(n / 2, 1U); - if(n <= 204) + if (n <= 204) { return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, false, true, false, true, false); } @@ -177,25 +188,29 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn else { const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(16)), static_cast<int>(1)); - if(is_workload_big) + if (is_workload_big) { - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, true); } else { - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, false, true, false, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, false, true, false, true); } } // Get lhs_info/rhs_info in case of OpenCL image const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(16)), static_cast<int>(1)); - if(is_workload_big) + if (is_workload_big) { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, false, true); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, false, true); } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, false, true, false, true, true); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, false, true, false, true, true); } const TensorInfo tensor_rhs_info(TensorShape(n, k, b), 1, DataType::F32); @@ -205,7 +220,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn // In case of vector by matrix or small workloads, we use the OpenCL buffer rather than the OpenCL image2d const bool use_cl_image2d = ((m == 1) || ((((m * n * b) / 16) < 2048) && n < 128)) ? false : true; - if(bool(validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info_img)) && use_cl_image2d) + if (bool(validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info_img)) && use_cl_image2d) { return std::make_pair(lhs_info_img, rhs_info_img); } @@ -215,7 +230,8 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f; const float r_nk = static_cast<float>(n) / static_cast<float>(k); @@ -225,46 +241,49 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn GEMMLHSMatrixInfo lhs_info_img; GEMMRHSMatrixInfo rhs_info_img; - if(m == 1) + if (m == 1) { - if(r_nk <= 0.4664f) + if (r_nk <= 0.4664f) { return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 16, false, true, false, true, false); } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, false, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, false, true, false, true, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, false, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, false, true, false, true, false); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } } else { - if(workload <= 274.4000f) + if (workload <= 274.4000f) { return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 16, false, false, false, true, false); } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, false, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, false, false, true, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, false, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, false, false, true, false); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { const unsigned int n0 = n < 1280 ? 2 : 4; const unsigned int h0 = std::max(n / n0, 1U); @@ -276,14 +295,15 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { - if(n > 2048) + if (n > 2048) { const unsigned int h0 = std::max(n / 4, 1U); return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, h0, false, true, false, true); @@ -300,7 +320,8 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { const float r_mn = static_cast<float>(m) / static_cast<float>(n); const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f; @@ -312,57 +333,59 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn GEMMLHSMatrixInfo lhs_info_img; GEMMRHSMatrixInfo rhs_info_img; - if(m == 1) + if (m == 1) { - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, false); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, false); - if(r_mk <= 0.0026f) + if (r_mk <= 0.0026f) { - if(r_nk <= 0.4664f) + if (r_nk <= 0.4664f) { return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, false, true, false, true, false); } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, true); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, true); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } } else { - if(r_mk <= 0.0148f) + if (r_mk <= 0.0148f) { return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, false, true, false, true, false); } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, true); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, true); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } } } else { - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 5, 8, 4, 1, 2, false, false, false, false, false); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 5, 8, 4, 1, 2, false, false, false, false, false); - if(workload <= 362.6000f) + if (workload <= 362.6000f) { return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 16, false, false, false, true, false); } else { - if(r_mn <= 22.6067f) + if (r_mn <= 22.6067f) { - if(workload <= 708.8000f) + if (workload <= 708.8000f) { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, false, false, false, true); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, false, false, false, true); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } else { @@ -371,27 +394,28 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } else { - if(r_nk <= 0.0917f) + if (r_nk <= 0.0917f) { return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 16, false, false, false, true, false); } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, false, false, false, true); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, false, false, false, true); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } } } } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); - if(m == 1) + if (m == 1) { return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, false, true, false, true, false); } @@ -400,15 +424,15 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn const float r_mn = static_cast<float>(m) / static_cast<float>(n); const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f; - if(workload <= 7449.60f) + if (workload <= 7449.60f) { - if(workload <= 691.60f) + if (workload <= 691.60f) { return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 8, false, false, false, false, false); } else { - if(workload <= 4155.20f) + if (workload <= 4155.20f) { return configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false); } @@ -420,21 +444,22 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } else { - if(workload <= 16300.80f) + if (workload <= 16300.80f) { - if(r_mn <= 44.56f) + if (r_mn <= 44.56f) { GEMMLHSMatrixInfo lhs_info_buf; GEMMRHSMatrixInfo rhs_info_buf; GEMMLHSMatrixInfo lhs_info_img; GEMMRHSMatrixInfo rhs_info_img; - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, false, true, false, false, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, false, true, false, false, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } else { @@ -448,23 +473,25 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn GEMMLHSMatrixInfo lhs_info_img; GEMMRHSMatrixInfo rhs_info_img; - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, true, false, false, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, true, false, false, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } } } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { const unsigned int n0 = n < 1280 ? 2 : 4; const unsigned int h0 = std::max(n / n0, 1U); @@ -476,14 +503,15 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(dot8_supported(CLKernelLibrary::get().get_device())) + if (dot8_supported(CLKernelLibrary::get().get_device())) { - if(m == 1) + if (m == 1) { const unsigned int h0 = std::max(n / 2, 1U); return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, false, true, false, true); @@ -497,7 +525,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn else { const int h0 = std::max(std::min(static_cast<int>(n / 2), static_cast<int>(128)), static_cast<int>(1)); - if(m == 1) + if (m == 1) { return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, h0, false, true, false, true); } @@ -508,12 +536,13 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_u8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { const unsigned int h0 = std::max(n / 2, 1U); return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, false, true, false, true); @@ -524,12 +553,13 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_u8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { const unsigned int h0 = std::max(n / 2, 1U); return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, h0, false, true, false, true); diff --git a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h index 321cbb5250..77c0c8d500 100644 --- a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h +++ b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h @@ -45,21 +45,34 @@ public: ClGemmDefaultConfigReshapedRhsOnlyBifrost(GPUTarget gpu); // Inherited overridden method - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; private: - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G51_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G51_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G51_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G31_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G51_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G51_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G51_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G31_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); }; } // namespace gemm } // namespace kernels diff --git a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp index d08bf84c72..da3e2ec912 100644 --- a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp +++ b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp @@ -50,30 +50,35 @@ ClGemmDefaultConfigReshapedRhsOnlyValhall::ClGemmDefaultConfigReshapedRhsOnlyVal { } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) { - using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (ClGemmDefaultConfigReshapedRhsOnlyValhall::*)(unsigned int m, unsigned int n, unsigned int k, - unsigned int b); + using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ( + ClGemmDefaultConfigReshapedRhsOnlyValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G77(&ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32, - &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f16, - &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8); + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G77( + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32, + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f16, + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8); - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G78(&ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f32, - &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f16, - &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8); + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G78( + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f32, + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f16, + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8); - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G710(&ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32, - &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G710_f16, - &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8); + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G710( + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32, + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G710_f16, + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8); - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G715(&ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f32, - &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f16, - &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8); + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G715( + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f32, + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f16, + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8); ConfigurationFunctionExecutorPtr func = nullptr; - switch(_target) + switch (_target) { case GPUTarget::G78: func = configs_G78.get_function(data_type); @@ -96,29 +101,29 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn return (this->*func)(m, n, k, b); } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { - if(m == 1) + if (m == 1) { const float r_mn = static_cast<float>(m) / static_cast<float>(n); const float r_mk = static_cast<float>(m) / static_cast<float>(k); - if(r_mk <= 0.0064484127797186375) + if (r_mk <= 0.0064484127797186375) { - if(r_mn <= 0.0028273810748942196) + if (r_mn <= 0.0028273810748942196) { GEMMLHSMatrixInfo lhs_info_buf; GEMMRHSMatrixInfo rhs_info_buf; GEMMLHSMatrixInfo lhs_info_img; GEMMRHSMatrixInfo rhs_info_img; - const unsigned int h0 = std::max(n / 4, 1U); + const unsigned int h0 = std::max(n / 4, 1U); std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, 0, 1, 0, 0, 1); std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 4, 4, 1, h0, 0, 1, 0, 1, 0); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } else { @@ -127,7 +132,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } else { - if(r_mk <= 0.020312500186264515) + if (r_mk <= 0.020312500186264515) { return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 4, 0, 1, 0, 0, 0); } @@ -143,9 +148,9 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f; const float r_mk = static_cast<float>(m) / static_cast<float>(k); - if(workload <= 1999.2000122070312) + if (workload <= 1999.2000122070312) { - if(workload <= 747.1999816894531) + if (workload <= 747.1999816894531) { return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0); } @@ -159,15 +164,14 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } } else { - if(r_mn <= 0.03348214365541935) + if (r_mn <= 0.03348214365541935) { - if(r_mk <= 0.028125000186264515) + if (r_mk <= 0.028125000186264515) { return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0); } @@ -181,8 +185,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } } else @@ -195,168 +198,112 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 16, 0, 1, 0, 1, 0); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } } } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { - const GeMMConfigsMatrix configs_1nkb_best = - { - { 1, 8984, 640, 1, 1, 8, 8, 1, 0, 1, 1, 1, 1, 0 }, - { 1, 420, 392, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 644, 5288, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 6512, 6404, 1, 1, 4, 8, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 5304, 640, 1, 1, 4, 4, 1, 0, 1, 0, 1, 1, 0 }, - { 1, 1352, 1520, 1, 1, 2, 8, 1, 0, 1, 1, 1, 1, 0 }, - { 1, 4096, 25088, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 732, 8988, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0 } - }; - - const GeMMConfigsMatrix configs_mnkb_n_small_best = - { - { 102400, 4, 96, 1, 2, 2, 16, 1, 4, 1, 1, 1, 1, 0 }, - { 102400, 2, 96, 1, 1, 2, 16, 1, 0, 1, 0, 1, 1, 1 }, - { 16384, 4, 128, 1, 1, 2, 16, 1, 0, 1, 0, 1, 1, 1 }, - { 16384, 2, 128, 1, 1, 2, 16, 1, 0, 1, 1, 1, 1, 1 } - }; - - const GeMMConfigsMatrix configs_mnkb_n_small_fallback = - { - { 102400, 4, 96, 1, 2, 2, 16, 1, 4, 1, 1, 1, 1, 0 }, - { 102400, 2, 96, 1, 1, 2, 16, 1, 0, 1, 1, 1, 1, 0 }, - { 16384, 4, 128, 1, 2, 2, 16, 1, 2, 1, 1, 1, 1, 0 }, - { 16384, 2, 128, 1, 1, 2, 16, 1, 0, 1, 1, 1, 1, 0 } - }; - - const GeMMConfigsMatrix configs_mnkb_m_gt_n_best = - { - { 25584, 88, 16, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0 }, - { 25584, 16, 68, 1, 4, 4, 8, 1, 16, 1, 1, 1, 0, 1 }, - { 369664, 32, 28, 1, 5, 4, 4, 1, 64, 1, 1, 1, 0, 1 }, - { 65792, 44, 24, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 23036, 56, 736, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 }, - { 90968, 40, 600, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 }, - { 8944, 32, 776, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 }, - { 50176, 64, 300, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 16544, 104, 160, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 }, - { 12604, 60, 160, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 }, - { 29584, 32, 28, 1, 4, 4, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 12544, 32, 27, 1, 2, 8, 8, 1, 128, 1, 1, 1, 0, 0 }, - { 2688, 136, 1492, 1, 8, 4, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 3728, 96, 196, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0 } - }; - - const GeMMConfigsMatrix configs_mnkb_m_gt_n_fallback = - { - { 25584, 88, 16, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0 }, - { 25584, 16, 68, 1, 2, 4, 8, 1, 4, 1, 1, 1, 0, 0 }, - { 369664, 32, 28, 1, 5, 4, 4, 1, 256, 1, 1, 1, 0, 0 }, - { 65792, 44, 24, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 23036, 56, 736, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 90968, 40, 600, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 8944, 32, 776, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 0 }, - { 50176, 64, 300, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 16544, 104, 160, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 0 }, - { 12604, 60, 160, 1, 4, 4, 8, 1, 256, 1, 1, 1, 0, 0 }, - { 29584, 32, 28, 1, 4, 4, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 12544, 32, 27, 1, 2, 8, 8, 1, 128, 1, 1, 1, 0, 0 }, - { 2688, 136, 1492, 1, 8, 4, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 3728, 96, 196, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0 } - }; - - const GeMMConfigsMatrix configs_mnkb_n_gt_m_best = - { - { 24, 488, 88, 1, 2, 4, 16, 1, 4, 1, 1, 1, 0, 0 }, - { 49, 1024, 512, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 1 }, - { 49, 1024, 1024, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 }, - }; - - const GeMMConfigsMatrix configs_mnkb_n_gt_m_fallback = - { - { 24, 488, 88, 1, 2, 4, 16, 1, 4, 1, 1, 1, 0, 0 }, - { 49, 1024, 512, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0 }, - { 49, 1024, 1024, 1, 4, 4, 8, 1, 256, 1, 1, 1, 0, 0 }, + const GeMMConfigsMatrix configs_1nkb_best = { + {1, 8984, 640, 1, 1, 8, 8, 1, 0, 1, 1, 1, 1, 0}, {1, 420, 392, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0}, + {1, 644, 5288, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0}, {1, 6512, 6404, 1, 1, 4, 8, 1, 0, 1, 0, 1, 0, 0}, + {1, 5304, 640, 1, 1, 4, 4, 1, 0, 1, 0, 1, 1, 0}, {1, 1352, 1520, 1, 1, 2, 8, 1, 0, 1, 1, 1, 1, 0}, + {1, 4096, 25088, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0}, {1, 732, 8988, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0}}; + + const GeMMConfigsMatrix configs_mnkb_n_small_best = {{102400, 4, 96, 1, 2, 2, 16, 1, 4, 1, 1, 1, 1, 0}, + {102400, 2, 96, 1, 1, 2, 16, 1, 0, 1, 0, 1, 1, 1}, + {16384, 4, 128, 1, 1, 2, 16, 1, 0, 1, 0, 1, 1, 1}, + {16384, 2, 128, 1, 1, 2, 16, 1, 0, 1, 1, 1, 1, 1}}; + + const GeMMConfigsMatrix configs_mnkb_n_small_fallback = {{102400, 4, 96, 1, 2, 2, 16, 1, 4, 1, 1, 1, 1, 0}, + {102400, 2, 96, 1, 1, 2, 16, 1, 0, 1, 1, 1, 1, 0}, + {16384, 4, 128, 1, 2, 2, 16, 1, 2, 1, 1, 1, 1, 0}, + {16384, 2, 128, 1, 1, 2, 16, 1, 0, 1, 1, 1, 1, 0}}; + + const GeMMConfigsMatrix configs_mnkb_m_gt_n_best = { + {25584, 88, 16, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0}, {25584, 16, 68, 1, 4, 4, 8, 1, 16, 1, 1, 1, 0, 1}, + {369664, 32, 28, 1, 5, 4, 4, 1, 64, 1, 1, 1, 0, 1}, {65792, 44, 24, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0}, + {23036, 56, 736, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1}, {90968, 40, 600, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1}, + {8944, 32, 776, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1}, {50176, 64, 300, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0}, + {16544, 104, 160, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1}, {12604, 60, 160, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1}, + {29584, 32, 28, 1, 4, 4, 4, 1, 128, 1, 1, 1, 0, 0}, {12544, 32, 27, 1, 2, 8, 8, 1, 128, 1, 1, 1, 0, 0}, + {2688, 136, 1492, 1, 8, 4, 4, 1, 128, 1, 1, 1, 0, 0}, {3728, 96, 196, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0}}; + + const GeMMConfigsMatrix configs_mnkb_m_gt_n_fallback = { + {25584, 88, 16, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0}, {25584, 16, 68, 1, 2, 4, 8, 1, 4, 1, 1, 1, 0, 0}, + {369664, 32, 28, 1, 5, 4, 4, 1, 256, 1, 1, 1, 0, 0}, {65792, 44, 24, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0}, + {23036, 56, 736, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {90968, 40, 600, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, + {8944, 32, 776, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 0}, {50176, 64, 300, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0}, + {16544, 104, 160, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 0}, {12604, 60, 160, 1, 4, 4, 8, 1, 256, 1, 1, 1, 0, 0}, + {29584, 32, 28, 1, 4, 4, 4, 1, 128, 1, 1, 1, 0, 0}, {12544, 32, 27, 1, 2, 8, 8, 1, 128, 1, 1, 1, 0, 0}, + {2688, 136, 1492, 1, 8, 4, 4, 1, 128, 1, 1, 1, 0, 0}, {3728, 96, 196, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0}}; + + const GeMMConfigsMatrix configs_mnkb_n_gt_m_best = { + {24, 488, 88, 1, 2, 4, 16, 1, 4, 1, 1, 1, 0, 0}, + {49, 1024, 512, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 1}, + {49, 1024, 1024, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1}, }; - const GeMMConfigsMatrix configs_mnkb_squared_best = - { - { 72, 92, 136, 1, 2, 2, 8, 1, 128, 1, 1, 1, 1, 0 }, - { 268, 824, 5076, 1, 4, 8, 4, 1, 256, 1, 1, 1, 0, 0 }, - { 180, 420, 952, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 }, - { 1000, 152, 304, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0 }, - { 272, 400, 2116, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 196, 512, 512, 1, 5, 4, 4, 1, 64, 1, 1, 1, 0, 1 }, - { 24, 88, 236, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0 }, - { 24, 88, 488, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0 } + const GeMMConfigsMatrix configs_mnkb_n_gt_m_fallback = { + {24, 488, 88, 1, 2, 4, 16, 1, 4, 1, 1, 1, 0, 0}, + {49, 1024, 512, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0}, + {49, 1024, 1024, 1, 4, 4, 8, 1, 256, 1, 1, 1, 0, 0}, }; - const GeMMConfigsMatrix configs_mnkb_squared_fallback = - { - { 72, 92, 136, 1, 2, 2, 8, 1, 128, 1, 1, 1, 1, 0 }, - { 268, 824, 5076, 1, 4, 8, 4, 1, 256, 1, 1, 1, 0, 0 }, - { 180, 420, 952, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0 }, - { 1000, 152, 304, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0 }, - { 272, 400, 2116, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 196, 512, 512, 1, 5, 4, 4, 1, 256, 1, 1, 1, 0, 0 }, - { 24, 88, 236, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0 }, - { 24, 88, 488, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0 } - }; - - const GeMMConfigsMatrix configs_mnkb_best_batched = - { - { 3136, 64, 64, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 4096, 48, 32, 36, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 }, - { 688, 92, 68, 32, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 24, 464, 412, 24, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0 }, - { 112, 184, 144, 28, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 5776, 64, 32, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 1568, 64, 40, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 2920, 64, 64, 24, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 } - }; - - const GeMMConfigsMatrix configs_mnkb_fallback_batched = - { - { 3136, 64, 64, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 4096, 48, 32, 36, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0 }, - { 688, 92, 68, 32, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 24, 464, 412, 24, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0 }, - { 112, 184, 144, 28, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 5776, 64, 32, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 1568, 64, 40, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 2920, 64, 64, 24, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 } - }; + const GeMMConfigsMatrix configs_mnkb_squared_best = { + {72, 92, 136, 1, 2, 2, 8, 1, 128, 1, 1, 1, 1, 0}, {268, 824, 5076, 1, 4, 8, 4, 1, 256, 1, 1, 1, 0, 0}, + {180, 420, 952, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1}, {1000, 152, 304, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0}, + {272, 400, 2116, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {196, 512, 512, 1, 5, 4, 4, 1, 64, 1, 1, 1, 0, 1}, + {24, 88, 236, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0}, {24, 88, 488, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0}}; + + const GeMMConfigsMatrix configs_mnkb_squared_fallback = { + {72, 92, 136, 1, 2, 2, 8, 1, 128, 1, 1, 1, 1, 0}, {268, 824, 5076, 1, 4, 8, 4, 1, 256, 1, 1, 1, 0, 0}, + {180, 420, 952, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0}, {1000, 152, 304, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0}, + {272, 400, 2116, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {196, 512, 512, 1, 5, 4, 4, 1, 256, 1, 1, 1, 0, 0}, + {24, 88, 236, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0}, {24, 88, 488, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0}}; + + const GeMMConfigsMatrix configs_mnkb_best_batched = { + {3136, 64, 64, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {4096, 48, 32, 36, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1}, + {688, 92, 68, 32, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {24, 464, 412, 24, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0}, + {112, 184, 144, 28, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {5776, 64, 32, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, + {1568, 64, 40, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {2920, 64, 64, 24, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}}; + + const GeMMConfigsMatrix configs_mnkb_fallback_batched = { + {3136, 64, 64, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {4096, 48, 32, 36, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0}, + {688, 92, 68, 32, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {24, 464, 412, 24, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0}, + {112, 184, 144, 28, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {5776, 64, 32, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, + {1568, 64, 40, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {2920, 64, 64, 24, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}}; const GeMMConfigsMatrix *configs_best_to_use = nullptr; const GeMMConfigsMatrix *configs_fallback_to_use = nullptr; - if(b == 1) + if (b == 1) { constexpr float ratio_m_gt_n = 10.f; constexpr float ratio_n_gt_m = 0.1f; constexpr unsigned int n_small_thr = 4; const float ratio = static_cast<float>(m) / static_cast<float>(n); - if(m == 1) + if (m == 1) { // We do not need fallback in this case, as we never use cl_image for the rhs tensor configs_best_to_use = &configs_1nkb_best; configs_fallback_to_use = &configs_1nkb_best; } - else if(n <= n_small_thr && ratio > ratio_m_gt_n) + else if (n <= n_small_thr && ratio > ratio_m_gt_n) { configs_best_to_use = &configs_mnkb_n_small_best; configs_fallback_to_use = &configs_mnkb_n_small_fallback; } - else if(ratio > ratio_m_gt_n) + else if (ratio > ratio_m_gt_n) { configs_best_to_use = &configs_mnkb_m_gt_n_best; configs_fallback_to_use = &configs_mnkb_m_gt_n_fallback; } - else if(ratio < ratio_n_gt_m) + else if (ratio < ratio_n_gt_m) { configs_best_to_use = &configs_mnkb_n_gt_m_best; configs_fallback_to_use = &configs_mnkb_n_gt_m_fallback; @@ -381,17 +328,17 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn std::tie(lhs_info0, rhs_info0) = find_lhs_rhs_info(*configs_best_to_use, m, n, k, b); std::tie(lhs_info1, rhs_info1) = find_lhs_rhs_info(*configs_fallback_to_use, m, n, k, b); - return select_lhs_rhs_info(std::make_pair(lhs_info0, rhs_info0), - std::make_pair(lhs_info1, rhs_info1), - n, k, b, DataType::F16); + return select_lhs_rhs_info(std::make_pair(lhs_info0, rhs_info0), std::make_pair(lhs_info1, rhs_info1), n, k, b, + DataType::F16); } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { const unsigned int h0 = std::max(n / 2, 1U); return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, h0, 0, 1, 0, 1); @@ -399,7 +346,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn else { const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(256)), static_cast<int>(1)); - if(m >= 28) + if (m >= 28) { return configure_lhs_rhs_info(m, n, 4, 4, 16, 1, h0, 0, 1, 0, 1); } @@ -410,30 +357,31 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { const float r_mn = static_cast<float>(m) / static_cast<float>(n); const float r_mk = static_cast<float>(m) / static_cast<float>(k); const float r_nk = static_cast<float>(n) / static_cast<float>(k); const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f; - if(m == 1) + if (m == 1) { - if(workload <= 278.7000f) + if (workload <= 278.7000f) { - if(workload <= 7.5000f) + if (workload <= 7.5000f) { return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0); } else { - if(r_mn <= 0.0031f) + if (r_mn <= 0.0031f) { - if(workload <= 256.6000f) + if (workload <= 256.6000f) { - if(workload <= 16.7500f) + if (workload <= 16.7500f) { - if(r_nk <= 1.6671f) + if (r_nk <= 1.6671f) { return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0); } @@ -454,15 +402,15 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } else { - if(r_mk <= 0.0027f) + if (r_mk <= 0.0027f) { - if(r_mk <= 0.0014f) + if (r_mk <= 0.0014f) { return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0); } else { - if(workload <= 8.9500f) + if (workload <= 8.9500f) { return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0); } @@ -474,13 +422,13 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } else { - if(workload <= 14.1500f) + if (workload <= 14.1500f) { return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0); } else { - if(r_mk <= 0.0041f) + if (r_mk <= 0.0041f) { return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0); } @@ -495,9 +443,9 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } else { - if(workload <= 363.7000f) + if (workload <= 363.7000f) { - if(r_mk <= 0.0031f) + if (r_mk <= 0.0031f) { return configure_lhs_rhs_info(m, n, 1, 4, 2, 1, 32, 0, 1, 0, 1, 0); } @@ -514,9 +462,9 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } else { - if(workload <= 1384.8000f) + if (workload <= 1384.8000f) { - if(workload <= 704.0000f) + if (workload <= 704.0000f) { return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 32, 0, 1, 0, 1, 0); } @@ -527,9 +475,9 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } else { - if(workload <= 16761.6006f) + if (workload <= 16761.6006f) { - if(r_mn <= 187.1250f) + if (r_mn <= 187.1250f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 16, 0, 0, 0, 1, 1); } @@ -540,7 +488,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } else { - if(r_mk <= 432.4630f) + if (r_mk <= 432.4630f) { return configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 16, 0, 0, 0, 1, 1); } @@ -553,42 +501,37 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f; const float r_mn = static_cast<float>(m) / static_cast<float>(n); const float r_mk = static_cast<float>(m) / static_cast<float>(k); const float r_nk = static_cast<float>(n) / static_cast<float>(k); - if(m == 1) + if (m == 1) { - const GeMMConfigsMatrix configs_mnkb_best = - { - { 1, 8984, 640, 1, 1, 4, 2, 1, 0, 1, 0, 1, 1, 0 }, - { 1, 420, 392, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 644, 5288, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 6512, 6404, 1, 1, 2, 2, 1, 0, 1, 0, 1, 1, 0 }, - { 1, 5304, 640, 1, 1, 2, 2, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 1352, 1520, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 4096, 25088, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 732, 8988, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 } - }; + const GeMMConfigsMatrix configs_mnkb_best = { + {1, 8984, 640, 1, 1, 4, 2, 1, 0, 1, 0, 1, 1, 0}, {1, 420, 392, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0}, + {1, 644, 5288, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0}, {1, 6512, 6404, 1, 1, 2, 2, 1, 0, 1, 0, 1, 1, 0}, + {1, 5304, 640, 1, 1, 2, 2, 1, 0, 1, 0, 1, 0, 0}, {1, 1352, 1520, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0}, + {1, 4096, 25088, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0}, {1, 732, 8988, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0}}; return find_lhs_rhs_info(configs_mnkb_best, m, n, k, b); } else { - if(workload <= 1384.8000f) + if (workload <= 1384.8000f) { - if(r_nk <= 0.8333f) + if (r_nk <= 0.8333f) { - if(r_mk <= 0.9119f) + if (r_mk <= 0.9119f) { return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, 4, 0, 1, 0, 1, 1); } else { - if(r_nk <= 0.1181f) + if (r_nk <= 0.1181f) { return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 32, 0, 0, 1, 0, 0); } @@ -600,7 +543,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } else { - if(r_mk <= 1.0013f) + if (r_mk <= 1.0013f) { return configure_lhs_rhs_info(m, n, 4, 4, 8, 1, 32, 0, 1, 1, 0, 1); } @@ -612,11 +555,11 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } else { - if(workload <= 11404.7998f) + if (workload <= 11404.7998f) { - if(r_mk <= 2.2884f) + if (r_mk <= 2.2884f) { - if(r_nk <= 0.9286f) + if (r_nk <= 0.9286f) { return configure_lhs_rhs_info(m, n, 4, 4, 8, 1, 4, 0, 1, 1, 0, 1); } @@ -632,9 +575,9 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } else { - if(r_nk <= 1.1926f) + if (r_nk <= 1.1926f) { - if(r_mn <= 1385.7917f) + if (r_mn <= 1385.7917f) { return configure_lhs_rhs_info(m, n, 6, 4, 8, 1, 4, 0, 1, 1, 0, 1); } @@ -652,12 +595,13 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { unsigned int best_m0; unsigned int best_n0; - if(is_mmul_kernel_preferred(m, n, k, b, DataType::F32, best_m0, best_n0)) + if (is_mmul_kernel_preferred(m, n, k, b, DataType::F32, best_m0, best_n0)) { return configure_lhs_rhs_info(m, n, best_m0, best_n0, 1, 1, 4, false, true, false, false, true); } @@ -667,153 +611,101 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G710_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { - const GeMMConfigsMatrix configs_1nkb_best = - { - { 1, 8984, 640, 1, 1, 2, 2, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 420, 392, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 644, 5288, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 6512, 6404, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 5304, 640, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 1352, 1520, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 4096, 25088, 1, 1, 2, 8, 1, 0, 1, 0, 1, 1, 0 }, - { 1, 732, 8988, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0 } + const GeMMConfigsMatrix configs_1nkb_best = { + {1, 8984, 640, 1, 1, 2, 2, 1, 0, 1, 0, 1, 0, 0}, {1, 420, 392, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0}, + {1, 644, 5288, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0}, {1, 6512, 6404, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0}, + {1, 5304, 640, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0}, {1, 1352, 1520, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0}, + {1, 4096, 25088, 1, 1, 2, 8, 1, 0, 1, 0, 1, 1, 0}, {1, 732, 8988, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0}}; + + const GeMMConfigsMatrix configs_mnkb_n_small_best = {{102400, 4, 96, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0}, + {102400, 2, 96, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0}, + {16384, 4, 128, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0}, + {16384, 2, 128, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0}}; + + const GeMMConfigsMatrix configs_mnkb_m_gt_n_best = { + {25584, 88, 16, 1, 4, 8, 4, 1, 4, 1, 1, 1, 0, 0}, {25584, 16, 68, 1, 2, 4, 16, 1, 8, 1, 1, 1, 0, 1}, + {369664, 32, 28, 1, 2, 8, 4, 1, 128, 1, 1, 1, 0, 0}, {65792, 44, 24, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0}, + {23036, 56, 736, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, {90968, 40, 600, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, + {8944, 32, 776, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, {2688, 136, 1492, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, + {50176, 64, 300, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 1}, {16544, 104, 160, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, + {12604, 60, 160, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, {3728, 96, 196, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, + {29584, 32, 28, 1, 2, 8, 4, 1, 16, 1, 1, 1, 0, 0}, {12544, 32, 27, 1, 2, 8, 8, 1, 16, 1, 1, 1, 0, 0}, }; - const GeMMConfigsMatrix configs_mnkb_n_small_best = - { - { 102400, 4, 96, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0 }, - { 102400, 2, 96, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0 }, - { 16384, 4, 128, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0 }, - { 16384, 2, 128, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0 } + const GeMMConfigsMatrix configs_mnkb_m_gt_n_fallback = { + {25584, 88, 16, 1, 4, 8, 4, 1, 4, 1, 1, 1, 0, 0}, {25584, 16, 68, 1, 2, 4, 8, 1, 4, 1, 1, 1, 1, 0}, + {369664, 32, 28, 1, 2, 8, 4, 1, 128, 1, 1, 1, 0, 0}, {65792, 44, 24, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0}, + {23036, 56, 736, 1, 4, 8, 4, 1, 16, 1, 1, 1, 0, 0}, {90968, 40, 600, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 0}, + {8944, 32, 776, 1, 2, 8, 8, 1, 16, 1, 1, 1, 0, 0}, {2688, 136, 1492, 1, 4, 4, 8, 1, 8, 1, 1, 1, 0, 0}, + {50176, 64, 300, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0}, {16544, 104, 160, 1, 4, 8, 4, 1, 16, 1, 1, 1, 0, 0}, + {12604, 60, 160, 1, 2, 8, 8, 1, 8, 1, 1, 1, 0, 0}, {3728, 96, 196, 1, 2, 8, 8, 1, 64, 1, 1, 1, 0, 0}, + {29584, 32, 28, 1, 2, 8, 4, 1, 16, 1, 1, 1, 0, 0}, {12544, 32, 27, 1, 2, 8, 8, 1, 16, 1, 1, 1, 0, 0}, }; - const GeMMConfigsMatrix configs_mnkb_m_gt_n_best = - { - { 25584, 88, 16, 1, 4, 8, 4, 1, 4, 1, 1, 1, 0, 0 }, - { 25584, 16, 68, 1, 2, 4, 16, 1, 8, 1, 1, 1, 0, 1 }, - { 369664, 32, 28, 1, 2, 8, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 65792, 44, 24, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0 }, - { 23036, 56, 736, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 90968, 40, 600, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 8944, 32, 776, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 2688, 136, 1492, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 50176, 64, 300, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 1 }, - { 16544, 104, 160, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 12604, 60, 160, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 3728, 96, 196, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 29584, 32, 28, 1, 2, 8, 4, 1, 16, 1, 1, 1, 0, 0 }, - { 12544, 32, 27, 1, 2, 8, 8, 1, 16, 1, 1, 1, 0, 0 }, - }; + const GeMMConfigsMatrix configs_mnkb_n_gt_m_best = {{24, 488, 88, 1, 2, 2, 8, 1, 8, 1, 1, 1, 1, 0}, + {49, 1024, 512, 1, 2, 4, 8, 1, 8, 1, 1, 1, 1, 0}, + {49, 1024, 1024, 1, 2, 4, 8, 1, 4, 1, 1, 1, 1, 0}}; - const GeMMConfigsMatrix configs_mnkb_m_gt_n_fallback = - { - { 25584, 88, 16, 1, 4, 8, 4, 1, 4, 1, 1, 1, 0, 0 }, - { 25584, 16, 68, 1, 2, 4, 8, 1, 4, 1, 1, 1, 1, 0 }, - { 369664, 32, 28, 1, 2, 8, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 65792, 44, 24, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0 }, - { 23036, 56, 736, 1, 4, 8, 4, 1, 16, 1, 1, 1, 0, 0 }, - { 90968, 40, 600, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 0 }, - { 8944, 32, 776, 1, 2, 8, 8, 1, 16, 1, 1, 1, 0, 0 }, - { 2688, 136, 1492, 1, 4, 4, 8, 1, 8, 1, 1, 1, 0, 0 }, - { 50176, 64, 300, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 16544, 104, 160, 1, 4, 8, 4, 1, 16, 1, 1, 1, 0, 0 }, - { 12604, 60, 160, 1, 2, 8, 8, 1, 8, 1, 1, 1, 0, 0 }, - { 3728, 96, 196, 1, 2, 8, 8, 1, 64, 1, 1, 1, 0, 0 }, - { 29584, 32, 28, 1, 2, 8, 4, 1, 16, 1, 1, 1, 0, 0 }, - { 12544, 32, 27, 1, 2, 8, 8, 1, 16, 1, 1, 1, 0, 0 }, - }; + const GeMMConfigsMatrix configs_mnkb_n_gt_m_fallback = {{24, 488, 88, 1, 2, 2, 8, 1, 8, 1, 1, 1, 1, 0}, + {49, 1024, 512, 1, 2, 4, 8, 1, 8, 1, 1, 1, 1, 0}, + {49, 1024, 1024, 1, 2, 4, 8, 1, 4, 1, 1, 1, 1, 0}}; - const GeMMConfigsMatrix configs_mnkb_n_gt_m_best = - { - { 24, 488, 88, 1, 2, 2, 8, 1, 8, 1, 1, 1, 1, 0 }, - { 49, 1024, 512, 1, 2, 4, 8, 1, 8, 1, 1, 1, 1, 0 }, - { 49, 1024, 1024, 1, 2, 4, 8, 1, 4, 1, 1, 1, 1, 0 } + const GeMMConfigsMatrix configs_mnkb_squared_best = { + {24, 88, 236, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0}, {24, 88, 488, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0}, + {72, 92, 136, 1, 2, 2, 8, 1, 32, 1, 1, 1, 1, 0}, {268, 824, 5076, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, + {180, 420, 952, 1, 4, 4, 8, 1, 16, 1, 1, 1, 0, 1}, {1000, 152, 304, 1, 4, 8, 4, 1, 32, 1, 1, 1, 0, 0}, + {272, 400, 2116, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, {196, 512, 512, 1, 5, 2, 8, 1, 4, 1, 1, 1, 1, 1}, }; - const GeMMConfigsMatrix configs_mnkb_n_gt_m_fallback = - { - { 24, 488, 88, 1, 2, 2, 8, 1, 8, 1, 1, 1, 1, 0 }, - { 49, 1024, 512, 1, 2, 4, 8, 1, 8, 1, 1, 1, 1, 0 }, - { 49, 1024, 1024, 1, 2, 4, 8, 1, 4, 1, 1, 1, 1, 0 } - }; - - const GeMMConfigsMatrix configs_mnkb_squared_best = - { - { 24, 88, 236, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0 }, - { 24, 88, 488, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0 }, - { 72, 92, 136, 1, 2, 2, 8, 1, 32, 1, 1, 1, 1, 0 }, - { 268, 824, 5076, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 180, 420, 952, 1, 4, 4, 8, 1, 16, 1, 1, 1, 0, 1 }, - { 1000, 152, 304, 1, 4, 8, 4, 1, 32, 1, 1, 1, 0, 0 }, - { 272, 400, 2116, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 196, 512, 512, 1, 5, 2, 8, 1, 4, 1, 1, 1, 1, 1 }, + const GeMMConfigsMatrix configs_mnkb_squared_fallback = { + {24, 88, 236, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0}, {24, 88, 488, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0}, + {72, 92, 136, 1, 2, 2, 8, 1, 32, 1, 1, 1, 1, 0}, {268, 824, 5076, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0}, + {180, 420, 952, 1, 5, 2, 8, 1, 8, 1, 1, 1, 1, 0}, {1000, 152, 304, 1, 4, 8, 4, 1, 32, 1, 1, 1, 0, 0}, + {272, 400, 2116, 1, 2, 8, 4, 1, 4, 1, 1, 1, 0, 0}, {196, 512, 512, 1, 5, 2, 8, 1, 8, 1, 1, 1, 1, 0}, }; - const GeMMConfigsMatrix configs_mnkb_squared_fallback = - { - { 24, 88, 236, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0 }, - { 24, 88, 488, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0 }, - { 72, 92, 136, 1, 2, 2, 8, 1, 32, 1, 1, 1, 1, 0 }, - { 268, 824, 5076, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0 }, - { 180, 420, 952, 1, 5, 2, 8, 1, 8, 1, 1, 1, 1, 0 }, - { 1000, 152, 304, 1, 4, 8, 4, 1, 32, 1, 1, 1, 0, 0 }, - { 272, 400, 2116, 1, 2, 8, 4, 1, 4, 1, 1, 1, 0, 0 }, - { 196, 512, 512, 1, 5, 2, 8, 1, 8, 1, 1, 1, 1, 0 }, - }; + const GeMMConfigsMatrix configs_mnkb_best_batched = { + {3136, 64, 64, 36, 4, 8, 4, 1, 16, 1, 1, 1, 0, 1}, {4096, 48, 32, 36, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, + {688, 92, 68, 32, 4, 8, 4, 1, 32, 1, 1, 1, 0, 1}, {24, 464, 412, 24, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, + {112, 184, 144, 28, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, {5776, 64, 32, 36, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, + {1568, 64, 40, 36, 4, 8, 4, 1, 8, 1, 1, 1, 0, 1}, {2920, 64, 64, 24, 4, 8, 4, 1, 8, 1, 1, 1, 0, 1}}; - const GeMMConfigsMatrix configs_mnkb_best_batched = - { - { 3136, 64, 64, 36, 4, 8, 4, 1, 16, 1, 1, 1, 0, 1 }, - { 4096, 48, 32, 36, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 688, 92, 68, 32, 4, 8, 4, 1, 32, 1, 1, 1, 0, 1 }, - { 24, 464, 412, 24, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 112, 184, 144, 28, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 5776, 64, 32, 36, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 1568, 64, 40, 36, 4, 8, 4, 1, 8, 1, 1, 1, 0, 1 }, - { 2920, 64, 64, 24, 4, 8, 4, 1, 8, 1, 1, 1, 0, 1 } - }; - - const GeMMConfigsMatrix configs_mnkb_fallback_batched = - { - { 3136, 64, 64, 36, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0 }, - { 4096, 48, 32, 36, 4, 4, 8, 1, 64, 1, 1, 1, 0, 0 }, - { 688, 92, 68, 32, 4, 8, 4, 1, 32, 1, 1, 1, 0, 0 }, - { 24, 464, 412, 24, 2, 8, 4, 1, 32, 1, 1, 1, 0, 0 }, - { 112, 184, 144, 28, 4, 4, 8, 1, 8, 1, 1, 1, 0, 0 }, - { 5776, 64, 32, 36, 2, 8, 8, 1, 32, 1, 1, 1, 0, 0 }, - { 1568, 64, 40, 36, 4, 8, 4, 1, 16, 1, 1, 1, 0, 0 }, - { 2920, 64, 64, 24, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0 } - }; + const GeMMConfigsMatrix configs_mnkb_fallback_batched = { + {3136, 64, 64, 36, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0}, {4096, 48, 32, 36, 4, 4, 8, 1, 64, 1, 1, 1, 0, 0}, + {688, 92, 68, 32, 4, 8, 4, 1, 32, 1, 1, 1, 0, 0}, {24, 464, 412, 24, 2, 8, 4, 1, 32, 1, 1, 1, 0, 0}, + {112, 184, 144, 28, 4, 4, 8, 1, 8, 1, 1, 1, 0, 0}, {5776, 64, 32, 36, 2, 8, 8, 1, 32, 1, 1, 1, 0, 0}, + {1568, 64, 40, 36, 4, 8, 4, 1, 16, 1, 1, 1, 0, 0}, {2920, 64, 64, 24, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0}}; const GeMMConfigsMatrix *configs_best_to_use = nullptr; const GeMMConfigsMatrix *configs_fallback_to_use = nullptr; - if(b == 1) + if (b == 1) { constexpr float ratio_m_gt_n = 10.f; constexpr float ratio_n_gt_m = 0.1f; constexpr unsigned int n_small_thr = 4; const float ratio = static_cast<float>(m) / static_cast<float>(n); - if(m == 1) + if (m == 1) { // We do not need fallback in this case, as we never use cl_image for the rhs tensor configs_best_to_use = &configs_1nkb_best; configs_fallback_to_use = &configs_1nkb_best; } - else if(n <= n_small_thr && ratio > ratio_m_gt_n) + else if (n <= n_small_thr && ratio > ratio_m_gt_n) { configs_best_to_use = &configs_mnkb_n_small_best; configs_fallback_to_use = &configs_mnkb_n_small_best; } - else if(ratio > ratio_m_gt_n) + else if (ratio > ratio_m_gt_n) { configs_best_to_use = &configs_mnkb_m_gt_n_best; configs_fallback_to_use = &configs_mnkb_m_gt_n_fallback; } - else if(ratio < ratio_n_gt_m) + else if (ratio < ratio_n_gt_m) { configs_best_to_use = &configs_mnkb_n_gt_m_best; configs_fallback_to_use = &configs_mnkb_n_gt_m_fallback; @@ -838,17 +730,17 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn std::tie(lhs_info0, rhs_info0) = find_lhs_rhs_info(*configs_best_to_use, m, n, k, b); std::tie(lhs_info1, rhs_info1) = find_lhs_rhs_info(*configs_fallback_to_use, m, n, k, b); - return select_lhs_rhs_info(std::make_pair(lhs_info0, rhs_info0), - std::make_pair(lhs_info1, rhs_info1), - n, k, b, DataType::F16); + return select_lhs_rhs_info(std::make_pair(lhs_info0, rhs_info0), std::make_pair(lhs_info1, rhs_info1), n, k, b, + DataType::F16); } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { unsigned int best_m0; unsigned int best_n0; - if(is_mmul_kernel_preferred(m, n, k, b, DataType::F16, best_m0, best_n0)) + if (is_mmul_kernel_preferred(m, n, k, b, DataType::F16, best_m0, best_n0)) { return configure_lhs_rhs_info(m, n, best_m0, best_n0, 1, 1, 4, false, true, false, false, true); } diff --git a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h index f2952a3d30..a0ea337eb1 100644 --- a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h +++ b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h @@ -45,17 +45,26 @@ public: ClGemmDefaultConfigReshapedRhsOnlyValhall(GPUTarget gpu); // Inherited overridden method - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; private: - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G715_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G715_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G715_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G715_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); }; } // namespace gemm } // namespace kernels diff --git a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h index 1503e74eb6..e07ad993ed 100644 --- a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h +++ b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h @@ -50,7 +50,7 @@ public: */ static std::unique_ptr<IClGemmKernelConfig> create(GPUTarget gpu) { - switch(get_arch_from_target(gpu)) + switch (get_arch_from_target(gpu)) { case GPUTarget::MIDGARD: case GPUTarget::BIFROST: |