aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.cpp35
1 files changed, 26 insertions, 9 deletions
diff --git a/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.cpp b/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.cpp
index 581c2d2199..f9b65dc931 100644
--- a/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.cpp
+++ b/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.cpp
@@ -149,34 +149,51 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedOnlyRHSKernelConfi
GEMMLHSMatrixInfo lhs_info_img;
GEMMRHSMatrixInfo rhs_info_img;
+ const bool is_workload_big = ((m * n * b) / 16) >= 2048;
// Get lhs_info/rhs_info in case of OpenCL buffer
if(m == 1)
{
- const unsigned int h0 = std::max(n / 2, 1U);
- std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 2, 8, 1, h0, false, true, false, true);
+ if((n / 4) >= 2048)
+ {
+ const unsigned int h0 = std::max(n / 4, 1U);
+ std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 4, 8, 1, h0, false, true, false, true);
+ }
+ else
+ {
+ const unsigned int h0 = std::max(n / 2, 1U);
+ std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 2, 8, 1, h0, false, true, false, true);
+ }
}
else
{
- std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, true, false, true);
+ const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(16)), static_cast<int>(1));
+ if(is_workload_big)
+ {
+ std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, true);
+ }
+ else
+ {
+ std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, false, true, false, true);
+ }
}
// Get lhs_info/rhs_info in case of OpenCL image
- if(m == 1)
+ const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(16)), static_cast<int>(1));
+ if(is_workload_big)
{
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 4, false, true, false, false, true);
+ std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, false, true);
}
else
{
- const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(16)), static_cast<int>(1));
- std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, false, true);
+ std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, false, true, false, true, true);
}
const TensorInfo tensor_rhs_info(TensorShape(n, k, b), 1, DataType::F32);
const TensorShape shape = compute_rhs_reshaped_shape(tensor_rhs_info, rhs_info_img);
const TensorInfo tensor_reshaped_info(shape, 1, DataType::F32);
- // In case of vector by matrix with few work-items, we use the OpenCL buffer rather than the OpenCL image2d
- const bool use_cl_image2d = (m == 1 && n <= 4096) ? false : true;
+ // In case of vector by matrix or small workloads, we use the OpenCL buffer rather than the OpenCL image2d
+ const bool use_cl_image2d = ((m == 1) || ((((m * n * b) / 16) < 2048) && n < 128)) ? false : true;
if(bool(validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info_img)) && use_cl_image2d)
{