1 files changed, 49 insertions, 10 deletions
diff --git a/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.cpp b/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.cpp
index 11cb90ba19..9f3461e912 100644
--- a/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.cpp
+++ b/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.cpp
@@ -27,6 +27,9 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/gemm/CLGEMMHelpers.h"
 #include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 
 #include <map>
 #include <utility>
@@ -35,6 +38,8 @@ namespace arm_compute
 {
 namespace cl_gemm
 {
+using namespace arm_compute::misc::shape_calculator;
+
 CLGEMMReshapedOnlyRHSKernelConfigurationValhall::CLGEMMReshapedOnlyRHSKernelConfigurationValhall(GPUTarget gpu)
     : ICLGEMMKernelConfiguration(gpu)
 {
@@ -74,32 +79,66 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedOnlyRHSKernelConfi
 std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedOnlyRHSKernelConfigurationValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
 {
     ARM_COMPUTE_UNUSED(k);
-    ARM_COMPUTE_UNUSED(b);
 
+    GEMMLHSMatrixInfo lhs_info_buf;
+    GEMMRHSMatrixInfo rhs_info_buf;
+    GEMMLHSMatrixInfo lhs_info_img;
+    GEMMRHSMatrixInfo rhs_info_img;
+
+    // Get lhs_info/rhs_info in case of OpenCL buffer
     if(m == 1)
     {
-        if(n > 2048)
+        const unsigned int h0 = std::max(n / 4, 1U);
+        std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 4, 4, 1, h0, false, true, false, true);
+    }
+    else
+    {
+        if(m > 256)
         {
-            return configure_lhs_rhs_info(m, n, 1, 8, 2, 1, 256, false, true, false, true);
+            const int v0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(8)), static_cast<int>(1));
+            std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, v0, false, true, false, true);
         }
         else
         {
-            return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 256, false, true, false, true);
+            const int v0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(8)), static_cast<int>(1));
+            std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 2, 4, 4, 1, v0, false, true, false, true);
         }
     }
+
+    // Get lhs_info/rhs_info in case of OpenCL image
+    if(m == 1)
+    {
+        std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 8, true, true, false, false, true);
+    }
     else
     {
-        if(m > 300)
+        if((m / 4) * (n / 4) > 4096)
         {
-            const int v0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(256)), static_cast<int>(1));
-            return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, v0, false, true, false, true);
+            const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(8)), static_cast<int>(1));
+            std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, false, true);
         }
         else
         {
-            const int v0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(256)), static_cast<int>(1));
-            return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, v0, false, true, false, true);
+            const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(8)), static_cast<int>(1));
+            std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 2, 4, 4, 1, h0, false, true, false, false, true);
         }
     }
+
+    const TensorInfo  tensor_rhs_info(TensorShape(n, k, b), 1, DataType::F32);
+    const TensorShape shape = compute_rhs_reshaped_shape(tensor_rhs_info, rhs_info_img);
+    const TensorInfo  tensor_reshaped_info(shape, 1, DataType::F32);
+
+    // In case of small workloads, we use the OpenCL buffer rather than the OpenCL image2d
+    const bool use_cl_image2d = ((m / lhs_info_img.m0) * (n / rhs_info_img.n0)) * b < 1024 ? false : true;
+
+    if(bool(validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info_img)) && use_cl_image2d)
+    {
+        return std::make_pair(lhs_info_img, rhs_info_img);
+    }
+    else
+    {
+        return std::make_pair(lhs_info_buf, rhs_info_buf);
+    }
 }
 
 std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedOnlyRHSKernelConfigurationValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
@@ -120,7 +159,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedOnlyRHSKernelConfi
             return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, h0, false, true, false, true);
         }
     }
-    else if (m < 128)
+    else if(m < 128)
     {
         const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(256)), static_cast<int>(1));
         if(k >= 512)