From 926afe1c8ad6ba6a7bada62a4027fcb79d727104 Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Tue, 19 Mar 2019 11:44:13 +0000 Subject: COMPMID-2097: Implement a heuristic to dispatch CLGEMMReshapedOnlyRHS kernel from CLGEMM Change-Id: I4170a80647b02501aa669e2c0347ddc39888ee76 Signed-off-by: Gian Marco Iodice Reviewed-on: https://review.mlplatform.org/c/928 Reviewed-by: Giuseppe Rossini Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins --- src/runtime/CL/functions/CLGEMM.cpp | 620 +++++++++++++++------ .../CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp | 18 +- .../CLGEMMReshapedConfigurationBifrost.cpp | 168 ------ 3 files changed, 452 insertions(+), 354 deletions(-) delete mode 100644 src/runtime/CL/gemm_reshaped/CLGEMMReshapedConfigurationBifrost.cpp diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp index 2ac6f815a4..60bfbf24e5 100644 --- a/src/runtime/CL/functions/CLGEMM.cpp +++ b/src/runtime/CL/functions/CLGEMM.cpp @@ -23,7 +23,10 @@ */ #include "arm_compute/runtime/CL/functions/CLGEMM.h" +#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h" #include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h" +#include "arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/GPUTarget.h" #include "arm_compute/core/Helpers.h" @@ -33,7 +36,6 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/CL/gemm_reshaped/CLGEMMReshapedConfiguration.h" #include "arm_compute/runtime/ITensorAllocator.h" namespace arm_compute @@ -41,104 +43,109 @@ namespace arm_compute using namespace arm_compute::misc::shape_calculator; using namespace arm_compute::cl_gemm; -namespace +CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager) + : _memory_group(std::move(memory_manager)), + _mm_kernel(), + _ma_kernel(), + _reshape_lhs_kernel(), + _reshape_rhs_kernel(), + _mm_reshaped_kernel(), + _mm_reshaped_only_rhs_kernel(), + _tmp_a(), + _tmp_b(), + _original_b(nullptr), + _run_addition(false), + _reshape_b_only_on_first_run(false), + _is_prepared(false), + _gemm_type(GEMMType::NATIVE) { -inline bool is_interleaved_transposed(unsigned int m, unsigned int n, unsigned int k, DataType data_type, bool reshape_b_only_on_first_run, GPUTarget gpu_target) +} + +CLGEMM::GEMMType CLGEMM::select_gemm_type(unsigned int m, unsigned int n, unsigned int k, DataType data_type, bool reshape_b_only_on_first_run, GPUTarget gpu_target) { - bool flag = true; + GEMMType gemm_type = GEMMType::RESHAPED_V1; if(gpu_target_is_in(gpu_target, GPUTarget::G52, GPUTarget::G52LIT, GPUTarget::G71, GPUTarget::G72, GPUTarget::G76)) { - if((m > 1) && n < 16) + if((m > 1) && (n < 16)) { - flag = true; + gemm_type = GEMMType::RESHAPED_V1; + } + else if((m == 1) && (data_type == DataType::F32)) + { + gemm_type = GEMMType::RESHAPED_ONLY_RHS; } else { // COMPMID-852 - if(k > 256 && m > 4 && is_data_type_float(data_type) && reshape_b_only_on_first_run) + if((k > 256) && (m > 4) && is_data_type_float(data_type) && reshape_b_only_on_first_run) { constexpr float alpha = 3.2f; constexpr float fact0 = 1.51f; constexpr float fact1 = 1.66f; constexpr float ops = 12.0f; const float scale = k > 1024 ? 1.07f : 1.0f; - flag = alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops); + gemm_type = (alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops)) ? GEMMType::RESHAPED_V1 : GEMMType::NATIVE; } else { - flag = false; + gemm_type = GEMMType::NATIVE; } } + + const auto workload = static_cast<float>((m * n) / 20.0f); + + gemm_type = ((workload > 1600.0f) && (gemm_type == GEMMType::RESHAPED_V1) && (data_type == DataType::F32)) ? GEMMType::RESHAPED_V2 : gemm_type; } else { // We reshape the matrices only if we do not have the vector-by-matrix case and we reshape the matrix B only once - flag = m != 1 && reshape_b_only_on_first_run; + gemm_type = ((m != 1) && reshape_b_only_on_first_run) ? GEMMType::RESHAPED_V1 : GEMMType::NATIVE; } - return flag; + return gemm_type; } -} // namespace -CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager) - : _memory_group(std::move(memory_manager)), - _mm_kernel(), - _ma_kernel(), - _reshape_lhs_kernel(), - _reshape_rhs_kernel(), - _mm_reshaped_kernel(), - _tmp_a(), - _tmp_b(), - _original_b(nullptr), - _is_interleaved_transposed(false), - _run_addition(false), - _reshape_b_only_on_first_run(false), - _is_prepared(false), - _is_new_gemm_reshaped(false) -{ -} - -void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info) +void CLGEMM::configure_native(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info) { - ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); + const unsigned int m = gemm_info.reinterpret_input_as_3d() ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1); + const unsigned int n = b->info()->dimension(0); + const unsigned int k = a->info()->dimension(0); + const GPUTarget gpu_target = CLScheduler::get().target(); - // Perform validation step - ARM_COMPUTE_ERROR_THROW_ON(validate(a->info(), b->info(), c != nullptr ?
c->info() : nullptr, output->info(), alpha, beta, gemm_info)); - - // Check if we need to reshape the matrix B only on the first run - _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run(); - _is_prepared = gemm_info.retain_internal_weights(); - _original_b = b; + // Set the target for the kernels + _mm_kernel.set_target(gpu_target); - const ICLTensor *matrix_a = a; - const ICLTensor *matrix_b = b; + GEMMReshapeInfo reshape_info(m, n, k, 1, 1, gemm_info.depth_output_gemm3d(), gemm_info.reinterpret_input_as_3d()); - // Get the GPU target - const GPUTarget gpu_target = CLScheduler::get().target(); + // Configure and tune matrix multiply kernel + _mm_kernel.configure(a, b, c, output, alpha, beta, false, reshape_info, gemm_info.fp_mixed_precision()); - // Set the target for the kernels - _reshape_lhs_kernel.set_target(gpu_target); - _mm_kernel.set_target(gpu_target); + // Tune kernel statically + CLScheduler::get().tune_kernel_static(_mm_kernel); +} - // Arguments used by GEMMReshapeInfo - // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo - // in order to know how the matrices have been reshaped - DataType data_type = a->info()->data_type(); +void CLGEMM::configure_reshaped_v1(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info) +{ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); const unsigned int m = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1); const unsigned int n = b->info()->dimension(0); const unsigned int k = a->info()->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? 
a->info()->dimension(3) : a->info()->dimension(2); const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const GPUTarget gpu_target = CLScheduler::get().target(); int mult_transpose1xW_width = 1; int mult_interleave4x4_height = 1; + // Set the target for the kernels + _reshape_lhs_kernel.set_target(gpu_target); + _mm_kernel.set_target(gpu_target); + if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST) { mult_transpose1xW_width = 4; mult_interleave4x4_height = 2; } + GEMMRHSMatrixInfo rhs_info; rhs_info.n0 = 16 / b->info()->element_size(); rhs_info.k0 = 1; @@ -153,112 +160,183 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor * lhs_info.interleave = true; lhs_info.transpose = true; - // Check if we need to reshape the matrix A and matrix B - _is_interleaved_transposed = is_interleaved_transposed(m, n, k, a->info()->data_type(), _reshape_b_only_on_first_run, gpu_target); + GEMMReshapeInfo reshape_info(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, false); - // Check if we can run the new reshaped GEMM - const auto workload = static_cast<float>((m * n) / 20.0f); - _is_new_gemm_reshaped = (workload > 1600.0f) && (get_arch_from_target(gpu_target) == GPUTarget::BIFROST) && _is_interleaved_transposed && (data_type == DataType::F32); + _memory_group.manage(&_tmp_a); + if(!_reshape_b_only_on_first_run) + { + _memory_group.manage(&_tmp_b); + } - const bool add_matrix_c = (beta != 0.f && c != nullptr); - const bool is_beta_one = std::abs(1.0f - beta) < 0.00001f; - const bool use_fused_add = is_beta_one && (c != nullptr && c->info()->num_dimensions() == 1) && !_is_new_gemm_reshaped; + // Configure interleave kernel + _reshape_lhs_kernel.configure(a, &_tmp_a, lhs_info, reinterpret_input_as_3d); - // if _is_interleaved_transposed is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D - if(_is_interleaved_transposed) - { - reinterpret_input_as_3d = false; + // Configure transpose kernel + _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info); - matrix_a = &_tmp_a; - matrix_b = &_tmp_b; + // Configure and tune matrix multiply kernel + _mm_kernel.configure(&_tmp_a, &_tmp_b, c, output, alpha, beta, true, reshape_info, gemm_info.fp_mixed_precision()); - // Manage intermediate buffers - _memory_group.manage(&_tmp_a); - if(!_reshape_b_only_on_first_run) - { - _memory_group.manage(&_tmp_b); - } - // _tmp_a and _tmp_b will be auto configured in _interleave_kernel and in _transpose_kernel + CLScheduler::get().tune_kernel_static(_mm_kernel); - if(_is_new_gemm_reshaped) - { - GEMMLHSMatrixInfo lhs_info; + // Allocate intermediate tensors + _tmp_a.allocator()->allocate(); + if(!_reshape_b_only_on_first_run) + { + _tmp_b.allocator()->allocate(); + } +} - // Pick up the GEMM configuration - std::tie(lhs_info, rhs_info) = CLGEMMReshapedConfigurationFactory::create()->configure(m, n, k, batch_size, data_type); +void CLGEMM::configure_reshaped_v2(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info) +{ + ARM_COMPUTE_ERROR_ON(c != nullptr); + ARM_COMPUTE_UNUSED(beta); + ARM_COMPUTE_UNUSED(c); + + DataType data_type = a->info()->data_type(); + bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); + const unsigned int m = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1); + const unsigned int n = b->info()->dimension(0); + const unsigned int k = a->info()->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? a->info()->dimension(3) : a->info()->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const GPUTarget gpu_target = CLScheduler::get().target(); - _reshape_lhs_kernel.configure(a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d()); - _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info); + // Set the target for the kernels + _reshape_lhs_kernel.set_target(gpu_target); + _mm_kernel.set_target(gpu_target); - // Configure and tune matrix multiply kernel - _mm_reshaped_kernel.configure(matrix_a, matrix_b, output, alpha, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, - depth_output_gemm3d, reinterpret_input_as_3d)); - } - else - { - // Configure interleave kernel - _reshape_lhs_kernel.configure(a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d()); - // Configure transpose kernel - _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info); - } + GEMMReshapeInfo reshape_info(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d); + + // Manage intermediate buffers + _memory_group.manage(&_tmp_a); + if(!_reshape_b_only_on_first_run) + { + _memory_group.manage(&_tmp_b); } + // _tmp_a and _tmp_b will be auto configured in _interleave_kernel and in _transpose_kernel + + GEMMLHSMatrixInfo lhs_info{}; + GEMMRHSMatrixInfo rhs_info{}; + + // Pick up the GEMM configuration + std::unique_ptr<ICLGEMMKernelConfiguration> gemm_config = CLGEMMReshapedKernelConfigurationFactory::create(gpu_target); + ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get()); + + // Configure lhs_info and rhs_info + std::tie(lhs_info, rhs_info) = gemm_config->configure(m, n, k, batch_size, data_type); + + _reshape_lhs_kernel.configure(a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d()); + _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info); + + // Configure and tune matrix multiply kernel + _mm_reshaped_kernel.configure(&_tmp_a, &_tmp_b, output, alpha, lhs_info, rhs_info, reshape_info); - if(!_is_new_gemm_reshaped) + // Allocate intermediate tensors + _tmp_a.allocator()->allocate(); + if(!_reshape_b_only_on_first_run) { - // Configure and tune matrix multiply kernel - _mm_kernel.configure(matrix_a, matrix_b, (add_matrix_c && !use_fused_add) ? nullptr : c, output, alpha, beta, _is_interleaved_transposed, - GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, reinterpret_input_as_3d), - gemm_info.fp_mixed_precision()); - CLScheduler::get().tune_kernel_static(_mm_kernel); + _tmp_b.allocator()->allocate(); } +} + +void CLGEMM::configure_reshaped_only_rhs(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info) +{ + ARM_COMPUTE_ERROR_ON(c != nullptr); + ARM_COMPUTE_UNUSED(beta); + ARM_COMPUTE_UNUSED(c); + + DataType data_type = a->info()->data_type(); + bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); + const unsigned int m = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1); + const unsigned int n = b->info()->dimension(0); + const unsigned int k = a->info()->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? a->info()->dimension(3) : a->info()->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const GPUTarget gpu_target = CLScheduler::get().target(); + + // Set the target for the kernels + _mm_kernel.set_target(gpu_target); + + GEMMReshapeInfo reshape_info(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d); - if(_is_interleaved_transposed) + // Manage intermediate buffers + if(!_reshape_b_only_on_first_run) { - // Allocate intermediate tensors - _tmp_a.allocator()->allocate(); - if(!_reshape_b_only_on_first_run) - { - _tmp_b.allocator()->allocate(); - } + _memory_group.manage(&_tmp_b); } - // Configure matrix addition kernel - if(add_matrix_c && !use_fused_add) + GEMMLHSMatrixInfo lhs_info{}; + GEMMRHSMatrixInfo rhs_info{}; + + // Pick up the GEMM configuration + std::unique_ptr<ICLGEMMKernelConfiguration> gemm_config = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target); + ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get()); + + // Configure lhs_info and rhs_info + std::tie(lhs_info, rhs_info) = gemm_config->configure(m, n, k, batch_size, data_type); + + _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info); + + // Configure and tune matrix multiply kernel + _mm_reshaped_only_rhs_kernel.configure(a, &_tmp_b, output, alpha, lhs_info, rhs_info, reshape_info); + + if(!_reshape_b_only_on_first_run) { - _ma_kernel.configure(c, output, beta); - _run_addition = true; + _tmp_b.allocator()->allocate(); } } -Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +Status CLGEMM::validate_native(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) { ARM_COMPUTE_UNUSED(alpha); ARM_COMPUTE_UNUSED(output); - // Check if we need to reshape the matrix B only on the first run - const bool reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run(); + // Get the GPU target + const GPUTarget gpu_target = CLScheduler::get().target(); + bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const bool add_c = (beta != 0.f && c != nullptr); + const bool is_beta_one = std::abs(1.0f - beta) < 0.00001f; + const bool fuse_add = is_beta_one && (c != nullptr && c->num_dimensions() == 1); + + const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d); + + // Validate matrix multiply + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(a, b, (add_c && fuse_add) ? c : nullptr, output, alpha, beta, + false, reshape_info, gpu_target, gemm_info.fp_mixed_precision())); + + if(add_c && !fuse_add) + { + // Validate matrix addition kernel + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta)); + } - const ITensorInfo *matrix_a_info = a; - const ITensorInfo *matrix_b_info = b; + return Status{}; +} + +Status CLGEMM::validate_reshaped_v1(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +{ + ARM_COMPUTE_UNUSED(alpha); + ARM_COMPUTE_UNUSED(output); TensorInfo tmp_a_info{}; TensorInfo tmp_b_info{}; // Get the GPU target - const GPUTarget gpu_target = CLScheduler::get().target(); - - // Arguments used by GEMMReshapeInfo - // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo - // in order to know how the matrices have been reshaped - DataType data_type = a->data_type(); - bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const GPUTarget gpu_target = CLScheduler::get().target(); + const unsigned int m = gemm_info.reinterpret_input_as_3d() ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); const unsigned int n = b->dimension(0); const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); int mult_transpose1xW_width = 1; int mult_interleave4x4_height = 1; const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const bool add_c = (beta != 0.f && c != nullptr); + const bool is_beta_one = std::abs(1.0f - beta) < 0.00001f; + const bool fuse_add = is_beta_one && (c != nullptr && c->num_dimensions() == 1); if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST) { @@ -280,69 +358,224 @@ Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITenso lhs_info.interleave = true; lhs_info.transpose = true; - // Check if we need to reshape the matrix A and matrix B - const bool run_interleave_transpose = is_interleaved_transposed(m, n, k, a->data_type(), reshape_b_only_on_first_run, gpu_target); + const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, false); - // Check if we can run the new reshaped GEMM - const auto workload = static_cast<float>((m * n) / 20.0f); - const bool is_new_gemm_reshaped = (workload > 1600.f) && (get_arch_from_target(gpu_target) == GPUTarget::BIFROST) && run_interleave_transpose && (data_type == DataType::F32); + // Validate interleave kernel + auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d()))); + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d())); - const bool add_matrix_c = (beta != 0.f && c != nullptr); - const bool is_beta_one = std::abs(1.0f - beta) < 0.00001f; - const bool use_fused_add = is_beta_one && (c != nullptr && c->num_dimensions() == 1) && !is_new_gemm_reshaped; + // Validate transpose kernel + auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info)); - // if _is_interleaved_transposed is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D - if(run_interleave_transpose) + // Validate matrix multiply + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(&tmp_a_info, &tmp_b_info, (add_c && fuse_add) ? c : nullptr, output, alpha, beta, + true, reshape_info, gpu_target, gemm_info.fp_mixed_precision())); + + if(add_c && !fuse_add) { - reinterpret_input_as_3d = false; + // Validate matrix addition kernel + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta)); } - const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, reinterpret_input_as_3d); + return Status{}; +} + +Status CLGEMM::validate_reshaped_v2(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +{ + ARM_COMPUTE_UNUSED(alpha); + ARM_COMPUTE_UNUSED(output); + + TensorInfo tmp_a_info{}; + TensorInfo tmp_b_info{}; + + // Get the GPU target + const GPUTarget gpu_target = CLScheduler::get().target(); + DataType data_type = a->data_type(); + bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const bool add_c = (beta != 0.f && c != nullptr); + + const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, false); + + GEMMLHSMatrixInfo lhs_info; + GEMMRHSMatrixInfo rhs_info; + + // Pick up the GEMM configuration + std::unique_ptr<ICLGEMMKernelConfiguration> gemm_config = CLGEMMReshapedKernelConfigurationFactory::create(gpu_target); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(gemm_config.get()); - if(run_interleave_transpose) + // Configure lhs_info and rhs_info + std::tie(lhs_info, rhs_info) = gemm_config->configure(m, n, k, batch_size, data_type); + + auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d()))); + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d())); + + auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info)); + + // Validate matrix multiply + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, output, alpha, lhs_info, rhs_info, reshape_info)); + + if(add_c) { - matrix_a_info = &tmp_a_info; - matrix_b_info = &tmp_b_info; + // Validate matrix addition kernel + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta)); + } - if(is_new_gemm_reshaped) - { - GEMMLHSMatrixInfo lhs_info; + return Status{}; +} + +Status CLGEMM::validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +{ + ARM_COMPUTE_UNUSED(alpha); + ARM_COMPUTE_UNUSED(output); + + TensorInfo tmp_b_info{}; + + // Get the GPU target + const GPUTarget gpu_target = CLScheduler::get().target(); + const DataType data_type = a->data_type(); + bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const bool add_c = (beta != 0.f && c != nullptr); + + const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d); + + GEMMLHSMatrixInfo lhs_info; + GEMMRHSMatrixInfo rhs_info; + + // Pick up the GEMM configuration + std::unique_ptr<ICLGEMMKernelConfiguration> gemm_config = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(gemm_config.get()); + + // Configure lhs_info and rhs_info + std::tie(lhs_info, rhs_info) = gemm_config->configure(m, n, k, batch_size, data_type); + + auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info)); + + // Validate matrix multiply + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::validate(a, &tmp_b_info, output, alpha, lhs_info, rhs_info, reshape_info)); + + if(add_c) + { + // Validate matrix addition kernel + ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta)); + } + + return Status{}; +} + +void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), alpha, beta, gemm_info)); + + // Check if we need to reshape the matrix B only on the first run + _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run(); + _is_prepared = gemm_info.retain_internal_weights(); + _original_b = b; - // Pick up the GEMM configuration - std::tie(lhs_info, rhs_info) = CLGEMMReshapedConfigurationFactory::create()->configure(m, n, k, batch_size, data_type); + // Get the GPU target + const GPUTarget gpu_target = CLScheduler::get().target(); + bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); + const unsigned int m = reinterpret_input_as_3d ?
(a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1); + const unsigned int n = b->info()->dimension(0); + const unsigned int k = a->info()->dimension(0); - auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d()))); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d())); + // Select GEMMType + _gemm_type = select_gemm_type(m, n, k, a->info()->data_type(), _reshape_b_only_on_first_run, gpu_target); - auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info)); + const bool is_gemm_v2 = (_gemm_type == GEMMType::RESHAPED_V2) || (_gemm_type == GEMMType::RESHAPED_ONLY_RHS); + const bool add_c = (beta != 0.f && c != nullptr); + const bool is_beta_one = std::abs(1.0f - beta) < 0.00001f; + const bool fuse_add = is_beta_one && (c != nullptr && c->info()->num_dimensions() == 1) && !is_gemm_v2; - // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyReshapedKernel::validate(matrix_a_info, matrix_b_info, output, alpha, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, - depth_output_gemm3d, reinterpret_input_as_3d))); + switch(_gemm_type) + { + case GEMMType::NATIVE: + { + configure_native(a, b, (add_c && fuse_add) ? c : nullptr, output, alpha, beta, gemm_info); + break; } - else + case GEMMType::RESHAPED_V1: + { + configure_reshaped_v1(a, b, (add_c && fuse_add) ? c : nullptr, output, alpha, beta, gemm_info); + break; + } + case GEMMType::RESHAPED_V2: { - // Validate interleave kernel - auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d()))); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d())); - // Validate transpose kernel - auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info)); + configure_reshaped_v2(a, b, (add_c && fuse_add) ? c : nullptr, output, alpha, beta, gemm_info); + break; + } + case GEMMType::RESHAPED_ONLY_RHS: + { + configure_reshaped_only_rhs(a, b, (add_c && fuse_add) ? c : nullptr, output, alpha, beta, gemm_info); + break; + } + default: + { + ARM_COMPUTE_ERROR("GEMMType not supported"); } } - if(!is_new_gemm_reshaped) + // Configure matrix addition kernel + if(add_c && !fuse_add) { - // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, (add_matrix_c && !use_fused_add) ? nullptr : c, output, alpha, beta, - run_interleave_transpose, reshape_info, gpu_target, gemm_info.fp_mixed_precision())); + _ma_kernel.configure(c, output, beta); + _run_addition = true; } +} - if(add_matrix_c && !use_fused_add) +Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +{ + // Get the GPU target + const GPUTarget gpu_target = CLScheduler::get().target(); + bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); + const unsigned int m = reinterpret_input_as_3d ? 
(a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + + // Select GEMMType + GEMMType gemm_type = select_gemm_type(m, n, k, a->data_type(), gemm_info.reshape_b_only_on_first_run(), gpu_target); + + switch(gemm_type) { - // Validate matrix addition kernel - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta)); + case GEMMType::NATIVE: + { + ARM_COMPUTE_RETURN_ON_ERROR(validate_native(a, b, c, output, alpha, beta, gemm_info)); + break; + } + case GEMMType::RESHAPED_V1: + { + ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_v1(a, b, c, output, alpha, beta, gemm_info)); + break; + } + case GEMMType::RESHAPED_V2: + { + ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_v2(a, b, c, output, alpha, beta, gemm_info)); + break; + } + case GEMMType::RESHAPED_ONLY_RHS: + { + ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_only_rhs(a, b, c, output, alpha, beta, gemm_info)); + break; + } + default: + { + ARM_COMPUTE_RETURN_ERROR_MSG("GEMMType not supported"); + } } return Status{}; @@ -354,26 +587,57 @@ void CLGEMM::run() MemoryGroupResourceScope scope_mg(_memory_group); - if(_is_interleaved_transposed) + // Run matrix multiply kernel + switch(_gemm_type) { - // Run interleave kernel - CLScheduler::get().enqueue(_reshape_lhs_kernel, false); + case GEMMType::NATIVE: + { + CLScheduler::get().enqueue(_mm_kernel, !_run_addition); + break; + } + case GEMMType::RESHAPED_V1: + { + // Run interleave kernel + CLScheduler::get().enqueue(_reshape_lhs_kernel, false); - if(!_reshape_b_only_on_first_run) + if(!_reshape_b_only_on_first_run) + { + // Run transpose kernel + CLScheduler::get().enqueue(_reshape_rhs_kernel, false); + } + + CLScheduler::get().enqueue(_mm_kernel, !_run_addition); + break; + } + case GEMMType::RESHAPED_V2: { - // Run transpose kernel - CLScheduler::get().enqueue(_reshape_rhs_kernel, false); + // Run interleave kernel + CLScheduler::get().enqueue(_reshape_lhs_kernel, false); + + if(!_reshape_b_only_on_first_run) + { + // Run transpose kernel + CLScheduler::get().enqueue(_reshape_rhs_kernel, false); + } + + CLScheduler::get().enqueue(_mm_reshaped_kernel, !_run_addition); + break; } - } + case GEMMType::RESHAPED_ONLY_RHS: + { + if(!_reshape_b_only_on_first_run) + { + // Run transpose kernel + CLScheduler::get().enqueue(_reshape_rhs_kernel, false); + } - // Run matrix multiply kernel - if(_is_new_gemm_reshaped) - { - CLScheduler::get().enqueue(_mm_reshaped_kernel, !_run_addition); - } - else - { - CLScheduler::get().enqueue(_mm_kernel, !_run_addition); + CLScheduler::get().enqueue(_mm_reshaped_only_rhs_kernel, !_run_addition); + break; + } + default: + { + ARM_COMPUTE_ERROR("GEMMType not supported"); + } } // Run matrix addition kernel @@ -387,7 +651,7 @@ void CLGEMM::prepare() { if(!_is_prepared) { - if(_is_interleaved_transposed && _reshape_b_only_on_first_run) + if(_gemm_type != GEMMType::NATIVE && _reshape_b_only_on_first_run) { // Run transpose kernel and mark original weights tensor as unused _tmp_b.allocator()->allocate(); diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp index c0bd85dcb5..c447cb8778 100644 --- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp +++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp @@ -24,6 +24,7 @@ #include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h" #include "arm_compute/core/CL/ICLTensor.h" +#include 
"arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" @@ -31,7 +32,6 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "arm_compute/runtime/CL/gemm_reshaped/CLGEMMReshapedConfiguration.h" namespace arm_compute { @@ -122,12 +122,12 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor } // Pick up the GEMM configuration - std::tie(lhs_info, rhs_info) = CLGEMMReshapedConfigurationFactory::create()->configure(m, n, k, batch_size, DataType::QASYMM8); + std::tie(lhs_info, rhs_info) = CLGEMMReshapedKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8); - // Configure interleave kernel + // Configure reshape LHS kernel _mtx_a_reshape_kernel.configure(a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d()); - // Configure transpose kernel + // Configure reshape RHS kernel _mtx_b_reshape_kernel.configure(b, &_tmp_b, rhs_info); } @@ -236,6 +236,9 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso GEMMRHSMatrixInfo rhs_info; GEMMLHSMatrixInfo lhs_info; + // Get the GPU target + const GPUTarget gpu_target = CLScheduler::get().target(); + bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); const unsigned int n = b->dimension(0); @@ -259,14 +262,13 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso matrix_b_info = &tmp_b_info; // Pick up the GEMM configuration - std::tie(lhs_info, rhs_info) = CLGEMMReshapedConfigurationFactory::create()->configure(m, n, k, batch_size, DataType::QASYMM8); + std::tie(lhs_info, rhs_info) = CLGEMMReshapedKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8); - // Validate interleave kernel + // Validate reshape LHS kernel auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d()))); ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d())); - // Validate transpose kernel - + // Validate reshape RHS kernel auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info)); } diff --git a/src/runtime/CL/gemm_reshaped/CLGEMMReshapedConfigurationBifrost.cpp b/src/runtime/CL/gemm_reshaped/CLGEMMReshapedConfigurationBifrost.cpp deleted file mode 100644 index cd97849712..0000000000 --- a/src/runtime/CL/gemm_reshaped/CLGEMMReshapedConfigurationBifrost.cpp +++ /dev/null @@ -1,168 +0,0 @@ -/* - * Copyright (c) 2019 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/runtime/CL/gemm_reshaped/CLGEMMReshapedConfigurationBifrost.h" - -#include "arm_compute/core/GPUTarget.h" -#include "arm_compute/runtime/CL/CLScheduler.h" - -#include <map> - -namespace arm_compute -{ -namespace cl_gemm -{ -namespace -{ -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_gemm_reshaped(unsigned int m, unsigned int n, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0, - bool lhs_interleave, bool rhs_interleave) -{ - GEMMLHSMatrixInfo lhs_info; - GEMMRHSMatrixInfo rhs_info; - - // Configure GEMMLHSMatrixInfo - lhs_info.m0 = m0; - lhs_info.k0 = k0; - lhs_info.v0 = ((m / (lhs_info.m0 * v0)) == 0) ? 1 : v0; - lhs_info.interleave = lhs_interleave; - lhs_info.transpose = false; - - // Configure GEMMRHSMatrixInfo - rhs_info.n0 = n0; - rhs_info.k0 = lhs_info.k0; - rhs_info.h0 = ((n / (rhs_info.n0 * h0)) == 0) ? 1 : h0; - rhs_info.interleave = rhs_interleave; - rhs_info.transpose = true; - - return std::make_pair(lhs_info, rhs_info); -} - -} // namespace - -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedConfigurationBifrost::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) -{ - ARM_COMPUTE_ERROR_ON(data_type != DataType::F32 && data_type != DataType::QASYMM8); - ARM_COMPUTE_UNUSED(data_type); - - const GPUTarget gpu_target = CLScheduler::get().target(); - - using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (CLGEMMReshapedConfigurationBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - - // Configurations for Mali-G76 - static std::map<DataType, ConfigurationFunctionExecutorPtr> gemm_reshaped_configs_G76 = - { - { DataType::F32, &CLGEMMReshapedConfigurationBifrost::configure_G76_f32 }, - { DataType::QASYMM8, &CLGEMMReshapedConfigurationBifrost::configure_G76_u8 } - }; - - // Configurations for Mali-G7x - static std::map<DataType, ConfigurationFunctionExecutorPtr> gemm_reshaped_configs_G7x = - { - { DataType::F32, &CLGEMMReshapedConfigurationBifrost::configure_G7x_f32 }, - { DataType::QASYMM8, &CLGEMMReshapedConfigurationBifrost::configure_G7x_u8 } - }; - - switch(gpu_target) - { - case GPUTarget::G76: - return (this->*gemm_reshaped_configs_G76[data_type])(m, n, k, b); - default: - return (this->*gemm_reshaped_configs_G7x[data_type])(m, n, k, b); - } -} - -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedConfigurationBifrost::configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - if(n <= 4) - { - return configure_gemm_reshaped(m, n, 4, 2, 8, 16, 16, true, false); - } - else - { - return configure_gemm_reshaped(m, n, 5, 4, 4, 2, 16, false, true); - } -} - -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedConfigurationBifrost::configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - if(dot8_supported(CLKernelLibrary::get().get_device())) - { - if(n <= 4) - { - return configure_gemm_reshaped(m, n, 4, 2, 16, 2, 2, true, false); - } - else - { - return configure_gemm_reshaped(m, n, 4, 4, 16, 2, 2, true, false); - } - } - else - { - if(n <= 4) - { - return configure_gemm_reshaped(m, n, 4, 2, 8, 2, 2, true, false); - } - else - { - return configure_gemm_reshaped(m, n, 6, 4, 4, 2, 2, true, true); - } - } -} - -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedConfigurationBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - if(n <= 4) - { - return configure_gemm_reshaped(m, n, 4, 2, 8, 16, 16, true, false); - } - else - { - return configure_gemm_reshaped(m, n, 4, 4, 2, 8, 16, false, false); - } -} - -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedConfigurationBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) -{ - ARM_COMPUTE_UNUSED(k); - ARM_COMPUTE_UNUSED(b); - - if(n <= 4) - { - return configure_gemm_reshaped(m, n, 4, 2, 16, 4, 1, false, false); - } - else - { - return configure_gemm_reshaped(m, n, 4, 4, 16, 2, 2, false, true); - } -} -} // namespace cl_gemm -} // namespace arm_compute \ No newline at end of file -- cgit v1.2.1
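
For reference, the dispatch heuristic this patch adds can be read in isolation. The sketch below is a minimal standalone restatement of CLGEMM::select_gemm_type() — the GEMMType enum is re-declared locally and the GPU-target and data-type checks are collapsed into booleans so the snippet compiles outside the library; it is an illustration of the logic above, not the library code itself.

// Standalone sketch of the kernel-dispatch heuristic added by this patch.
// Parameters that the real member function derives from ITensorInfo and
// GPUTarget are passed here as plain flags for clarity.
enum class GEMMType { NATIVE, RESHAPED_V1, RESHAPED_V2, RESHAPED_ONLY_RHS };

GEMMType select_gemm_type(unsigned int m, unsigned int n, unsigned int k,
                          bool is_float, bool is_f32,
                          bool reshape_b_only_on_first_run, bool is_bifrost_g7x)
{
    GEMMType gemm_type = GEMMType::RESHAPED_V1;

    if(is_bifrost_g7x) // G52, G52LIT, G71, G72, G76
    {
        if((m > 1) && (n < 16))
        {
            gemm_type = GEMMType::RESHAPED_V1;
        }
        else if((m == 1) && is_f32)
        {
            // New path: vector-by-matrix F32 reshapes the RHS matrix only
            gemm_type = GEMMType::RESHAPED_ONLY_RHS;
        }
        else if((k > 256) && (m > 4) && is_float && reshape_b_only_on_first_run)
        {
            // COMPMID-852 cost model: reshaping both operands pays off only
            // when n is large enough to amortise the reshape cost
            constexpr float alpha = 3.2f, fact0 = 1.51f, fact1 = 1.66f, ops = 12.0f;
            const float     scale = (k > 1024) ? 1.07f : 1.0f;
            gemm_type = (alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops)) ? GEMMType::RESHAPED_V1 : GEMMType::NATIVE;
        }
        else
        {
            gemm_type = GEMMType::NATIVE;
        }

        // Large F32 workloads are upgraded from RESHAPED_V1 to the newer
        // RESHAPED_V2 kernel
        const float workload = (m * n) / 20.0f;
        if((workload > 1600.0f) && (gemm_type == GEMMType::RESHAPED_V1) && is_f32)
        {
            gemm_type = GEMMType::RESHAPED_V2;
        }
    }
    else
    {
        // Other targets: reshape unless it is a vector-by-matrix case, and
        // only if B is reshaped just once
        gemm_type = ((m != 1) && reshape_b_only_on_first_run) ? GEMMType::RESHAPED_V1 : GEMMType::NATIVE;
    }
    return gemm_type;
}

Plugging in the COMPMID-852 constants shows where the crossover sits: with scale = 1.0 the condition 3.2 + 1.51*n/12 < 1.66*n/12 first holds for n > 256, and with scale = 1.07 (k > 1024) for n above roughly 144, so the fully reshaped kernel is chosen only once the output is wide enough to amortise reshaping both operands.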
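
At the API level nothing changes for callers; the new path is picked automatically inside configure(). Below is a minimal usage sketch, assuming the arm_compute public API of this release — the shapes are illustrative: with m == 1, F32 data and a Mali-G7x target, configure() now selects GEMMType::RESHAPED_ONLY_RHS, and with reshape_b_only_on_first_run set the RHS matrix is reshaped once in prepare() and reused on every subsequent run().

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLGEMM.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // dst = alpha * A * B with A: 1x256 (m = 1, k = 256), B: 256x4096 (n = 4096).
    // dimension(0) is the inner-most dimension, so A is TensorShape(k, m).
    CLTensor a{}, b{}, dst{};
    a.allocator()->init(TensorInfo(TensorShape(256U, 1U), 1, DataType::F32));
    b.allocator()->init(TensorInfo(TensorShape(4096U, 256U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(4096U, 1U), 1, DataType::F32));

    // GEMMInfo(is_a_reshaped, is_b_reshaped, reshape_b_only_on_first_run):
    // B is constant (e.g. weights), so reshape it once and keep the copy.
    CLGEMM gemm{};
    gemm.configure(&a, &b, nullptr, &dst, 1.0f, 0.0f, GEMMInfo(false, false, true));

    a.allocator()->allocate();
    b.allocator()->allocate();
    dst.allocator()->allocate();

    // The first run reshapes B (prepare step); later runs enqueue only the
    // CLGEMMMatrixMultiplyReshapedOnlyRHSKernel on the unreshaped A.
    gemm.run();
    gemm.run();
    CLScheduler::get().sync();
    return 0;
}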