COMPMID-2097: Implement a heuristic to dispatch CLGEMMReshapedOnlyRHS kernel from CLGEMM

Change-Id: I4170a80647b02501aa669e2c0347ddc39888ee76 Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com> Reviewed-on: https://review.mlplatform.org/c/928 Reviewed-by: Giuseppe Rossini <giuseppe.rossini@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com>
author: Gian Marco Iodice <gianmarco.iodice@arm.com> 2019-03-19 11:44:13 +0000
committer: Gian Marco Iodice <gianmarco.iodice@arm.com> 2019-04-08 14:12:59 +0000
commit: 926afe1c8ad6ba6a7bada62a4027fcb79d727104 (patch)
tree: 8dcc908a6145de6b02bcea24e3ccd830ba3f5939
parent: 8c571692a8236be8605a753e231d240094428be5 (diff)
download: ComputeLibrary-926afe1c8ad6ba6a7bada62a4027fcb79d727104.tar.gz
16 files changed, 1134 insertions, 417 deletions
diff --git a/SConscript b/SConscript
index 9045e9d1af..d63206b765 100644
--- a/SConscript
+++ b/SConscript
@@ -186,11 +186,13 @@ if env['openmp']:
 if env['opencl']:
     core_files += Glob('src/core/CL/*.cpp')
     core_files += Glob('src/core/CL/kernels/*.cpp')
+    core_files += Glob('src/core/CL/gemm/*.cpp')
+    core_files += Glob('src/core/CL/gemm/reshaped/*.cpp')
+    core_files += Glob('src/core/CL/gemm/reshaped_only_rhs/*.cpp')
 
     runtime_files += Glob('src/runtime/CL/*.cpp')
     runtime_files += Glob('src/runtime/CL/functions/*.cpp')
     runtime_files += Glob('src/runtime/CL/tuners/*.cpp')
-    runtime_files += Glob('src/runtime/CL/gemm_reshaped/*.cpp')
 
     graph_files += Glob('src/graph/backends/CL/*.cpp')
 
diff --git a/arm_compute/runtime/CL/ICLGEMMReshapedConfiguration.h b/arm_compute/core/CL/ICLGEMMKernelConfiguration.h
index 500d9cd492..2e6d49566c 100644
--- a/arm_compute/runtime/CL/ICLGEMMReshapedConfiguration.h
+++ b/arm_compute/core/CL/ICLGEMMKernelConfiguration.h
@@ -21,20 +21,37 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef __ARM_COMPUTE_ICLGEMMRESHAPEDCONFIGURATION_H__
-#define __ARM_COMPUTE_ICLGEMMRESHAPEDCONFIGURATION_H__
+#ifndef __ARM_COMPUTE_ICLGEMMKERNELCONFIGURATION_H__
+#define __ARM_COMPUTE_ICLGEMMKERNELCONFIGURATION_H__
 
+#include "arm_compute/core/GPUTarget.h"
 #include "arm_compute/core/Types.h"
 
 namespace arm_compute
 {
-/** Basic interface for the GEMM selection */
-class ICLGEMMReshapedConfiguration
+/** Basic interface for the GEMM kernel configuration */
+class ICLGEMMKernelConfiguration
 {
 public:
+    /** Constructor
+     *
+     * @param[in] arch GPU target
+     */
+    ICLGEMMKernelConfiguration(GPUTarget arch)
+        : _target(arch)
+    {
+    }
+    /** Prevent instances of this class from being copied */
+    ICLGEMMKernelConfiguration(const ICLGEMMKernelConfiguration &) = delete;
+    /** Prevent instances of this class from being copy-assigned */
+    ICLGEMMKernelConfiguration &operator=(const ICLGEMMKernelConfiguration &) = delete;
+    /** Default Move Constructor. */
+    ICLGEMMKernelConfiguration(ICLGEMMKernelConfiguration &&) = default;
+    /** Default move assignment operator */
+    ICLGEMMKernelConfiguration &operator=(ICLGEMMKernelConfiguration &&) = default;
     /** Virtual destructor */
-    virtual ~ICLGEMMReshapedConfiguration() = default;
-    /** Given M, N, K and B, this method returns the @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo to be used with @ref CLGEMMMatrixMultiplyReshapedKernel
+    virtual ~ICLGEMMKernelConfiguration() = default;
+    /** Given M, N, K and B, this method returns the @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo to be used
      *
      * @param[in] m         Number of rows LHS matrix
      * @param[in] n         Number of columns RHS matrix
@@ -43,6 +60,9 @@ public:
      * @param[in] data_type Data type
      */
     virtual std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) = 0;
+
+protected:
+    GPUTarget _target;
 };
 } // namespace arm_compute
-#endif /*__ARM_COMPUTE_ICLGEMMRESHAPEDCONFIGURATION_H__ */
+#endif /*__ARM_COMPUTE_ICLGEMMKERNELCONFIGURATION_H__ */
diff --git a/arm_compute/core/CL/gemm/CLGEMMHelpers.h b/arm_compute/core/CL/gemm/CLGEMMHelpers.h
new file mode 100644
index 0000000000..d263712397
--- /dev/null
+++ b/arm_compute/core/CL/gemm/CLGEMMHelpers.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLGEMMHELPERS_H__
+#define __ARM_COMPUTE_CLGEMMHELPERS_H__
+
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+namespace cl_gemm
+{
+/** Configure @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo
+ *
+ * @param[in] m              Number of rows (M) in the LHS matrix not reshaped
+ * @param[in] n              Number of columns (N) in the RHS matrix not reshaped
+ * @param[in] m0             Number of rows processed by each thread/work-item
+ * @param[in] n0             Number of columns processed by each thread/work-item
+ * @param[in] k0             Number of inner accumulations performed by each thread/work-item
+ * @param[in] v0             Number of vertical blocks of size (m0xk0) stored on the same output row
+ * @param[in] h0             Number of horizontal blocks of size (k0xn0) stored on the same output row
+ * @param[in] lhs_interleave True if the v0 (m0xk0) blocks have to be interleaved in the output row
+ * @param[in] rhs_interleave True if the h0 (k0xn0) blocks have to be interleaved in the output row
+ * @param[in] lhs_transpose  True if the (m0xk0) block has to be transposed before being stored
+ * @param[in] rhs_transpose  True if the (k0xn0) block has to be transposed before being stored
+ *
+ * @return @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo
+ */
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_lhs_rhs_info(unsigned int m, unsigned int n, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0,
+                                                                       bool lhs_interleave, bool rhs_interleave, bool lhs_transpose, bool rhs_transpose);
+} // namespace cl_gemm
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLGEMMHELPERS_H__ */
diff --git a/arm_compute/runtime/CL/gemm_reshaped/CLGEMMReshapedConfiguration.h b/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h
index 3458911a97..105a58a6f8 100644
--- a/arm_compute/runtime/CL/gemm_reshaped/CLGEMMReshapedConfiguration.h
+++ b/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h
@@ -21,12 +21,11 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef __ARM_COMPUTE_CLGEMMRESHAPEDCONFIGURATION_H__
-#define __ARM_COMPUTE_CLGEMMRESHAPEDCONFIGURATION_H__
+#ifndef __ARM_COMPUTE_CLGEMMRESHAPEDKERNELCONFIGURATION_H__
+#define __ARM_COMPUTE_CLGEMMRESHAPEDKERNELCONFIGURATION_H__
 
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CL/ICLGEMMReshapedConfiguration.h"
-#include "arm_compute/runtime/CL/gemm_reshaped/CLGEMMReshapedConfigurationBifrost.h"
+#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h"
+#include "arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h"
 
 #include <memory>
 
@@ -34,23 +33,27 @@ namespace arm_compute
 {
 namespace cl_gemm
 {
-/** Tuner factory class */
-class CLGEMMReshapedConfigurationFactory final
+/** CLGEMMReshaped factory class */
+class CLGEMMReshapedKernelConfigurationFactory final
 {
 public:
-    static std::unique_ptr<ICLGEMMReshapedConfiguration> create()
+    /** Static method to create the CLGEMMReshaped kernel configuration class according to the GPU architecture
+     *
+     * @param[in] arch GPU target
+     *
+     * @return CLGEMMReshaped kernel configuration class
+     */
+    static std::unique_ptr<ICLGEMMKernelConfiguration> create(GPUTarget arch)
     {
-        GPUTarget arch = get_arch_from_target(CLScheduler::get().target());
-
-        switch(arch)
+        switch(get_arch_from_target(arch))
         {
             case GPUTarget::BIFROST:
-                return support::cpp14::make_unique<CLGEMMReshapedConfigurationBifrost>();
+                return support::cpp14::make_unique<CLGEMMReshapedKernelConfigurationBifrost>(arch);
             default:
                 return nullptr;
         }
     }
 };
-} // namespace tuners
+} // namespace cl_gemm
 } // namespace arm_compute
-#endif /*__ARM_COMPUTE_CLGEMMRESHAPEDCONFIGURATION_H__ */
+#endif /*__ARM_COMPUTE_CLGEMMRESHAPEDKERNELCONFIGURATION_H__ */
diff --git a/arm_compute/runtime/CL/gemm_reshaped/CLGEMMReshapedConfigurationBifrost.h b/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h
index c452e159cf..a0aae190e8 100644
--- a/arm_compute/runtime/CL/gemm_reshaped/CLGEMMReshapedConfigurationBifrost.h
+++ b/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h
@@ -21,19 +21,33 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef __ARM_COMPUTE_CLGEMMRESHAPEDCONFIGURATIONBIFROST_H__
-#define __ARM_COMPUTE_CLGEMMRESHAPEDCONFIGURATIONBIFROST_H__
+#ifndef __ARM_COMPUTE_CLGEMMRESHAPEDKERNELCONFIGURATIONBIFROST_H__
+#define __ARM_COMPUTE_CLGEMMRESHAPEDKERNELCONFIGURATIONBIFROST_H__
 
-#include "arm_compute/runtime/CL/ICLGEMMReshapedConfiguration.h"
+#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h"
 
 namespace arm_compute
 {
 namespace cl_gemm
 {
-/** Bifrost based OpenCL GEMM reshaped configuration */
-class CLGEMMReshapedConfigurationBifrost final : public ICLGEMMReshapedConfiguration
+/** Bifrost based OpenCL GEMMReshaped configuration */
+class CLGEMMReshapedKernelConfigurationBifrost final : public ICLGEMMKernelConfiguration
 {
 public:
+    /** Constructor
+     *
+     * @param[in] arch GPU target
+     */
+    CLGEMMReshapedKernelConfigurationBifrost(GPUTarget arch);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLGEMMReshapedKernelConfigurationBifrost(const CLGEMMReshapedKernelConfigurationBifrost &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLGEMMReshapedKernelConfigurationBifrost &operator=(const CLGEMMReshapedKernelConfigurationBifrost &) = delete;
+    /** Default Move Constructor. */
+    CLGEMMReshapedKernelConfigurationBifrost(CLGEMMReshapedKernelConfigurationBifrost &&) = default;
+    /** Default move assignment operator */
+    CLGEMMReshapedKernelConfigurationBifrost &operator=(CLGEMMReshapedKernelConfigurationBifrost &&) = default;
+
     // Inherited overridden method
     std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
 
@@ -45,4 +59,4 @@ private:
 };
 } // namespace cl_gemm
 } // namespace arm_compute
-#endif /*__ARM_COMPUTE_CLGEMMRESHAPEDCONFIGURATIONBIFROST_H__ */
+#endif /*__ARM_COMPUTE_CLGEMMRESHAPEDKERNELCONFIGURATIONBIFROST_H__ */
diff --git a/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h b/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h
new file mode 100644
index 0000000000..b9bf150c4f
--- /dev/null
+++ b/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLGEMMRESHAPEDONLYRHSKERNELCONFIGURATION_H__
+#define __ARM_COMPUTE_CLGEMMRESHAPEDONLYRHSKERNELCONFIGURATION_H__
+
+#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h"
+#include "arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cl_gemm
+{
+/** CLGEMMReshapedOnlyRHS factory class */
+class CLGEMMReshapedOnlyRHSKernelConfigurationFactory final
+{
+public:
+    /** Static method to create the CLGEMMReshapedOnlyRHS kernel configuration class according to the GPU architecture
+     *
+     * @param[in] arch GPU target
+     *
+     * @return CLGEMMReshapedOnlyRHS kernel configuration class
+     */
+    static std::unique_ptr<ICLGEMMKernelConfiguration> create(GPUTarget arch)
+    {
+        switch(get_arch_from_target(arch))
+        {
+            case GPUTarget::BIFROST:
+                return support::cpp14::make_unique<CLGEMMReshapedOnlyRHSKernelConfigurationBifrost>(arch);
+            default:
+                return nullptr;
+        }
+    }
+};
+} // namespace cl_gemm
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLGEMMRESHAPEDONLYRHSKERNELCONFIGURATION_H__ */
diff --git a/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h b/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h
new file mode 100644
index 0000000000..3bed118f21
--- /dev/null
+++ b/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLGEMMRESHAPEDONLYRHSKERNELCONFIGURATIONBIFROST_H__
+#define __ARM_COMPUTE_CLGEMMRESHAPEDONLYRHSKERNELCONFIGURATIONBIFROST_H__
+
+#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h"
+
+namespace arm_compute
+{
+namespace cl_gemm
+{
+/** Bifrost based OpenCL GEMMReshapedOnlyRHS configuration */
+class CLGEMMReshapedOnlyRHSKernelConfigurationBifrost final : public ICLGEMMKernelConfiguration
+{
+public:
+    /** Constructor
+     *
+     * @param[in] arch GPU target
+     */
+    CLGEMMReshapedOnlyRHSKernelConfigurationBifrost(GPUTarget arch);
+    /** Prevent instances of this class from being copied */
+    CLGEMMReshapedOnlyRHSKernelConfigurationBifrost(const CLGEMMReshapedOnlyRHSKernelConfigurationBifrost &) = delete;
+    /** Prevent instances of this class from being copy-assigned */
+    CLGEMMReshapedOnlyRHSKernelConfigurationBifrost &operator=(const CLGEMMReshapedOnlyRHSKernelConfigurationBifrost &) = delete;
+    /** Default Move Constructor. */
+    CLGEMMReshapedOnlyRHSKernelConfigurationBifrost(CLGEMMReshapedOnlyRHSKernelConfigurationBifrost &&) = default;
+    /** Default move assignment operator */
+    CLGEMMReshapedOnlyRHSKernelConfigurationBifrost &operator=(CLGEMMReshapedOnlyRHSKernelConfigurationBifrost &&) = default;
+
+    // Inherited overridden method
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
+
+private:
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+};
+} // namespace cl_gemm
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLGEMMRESHAPEDONLYRHSKERNELCONFIGURATIONBIFROST_H__ */
diff --git a/arm_compute/core/utils/misc/ShapeCalculator.h b/arm_compute/core/utils/misc/ShapeCalculator.h
index 0d07266403..384bd460a0 100644
--- a/arm_compute/core/utils/misc/ShapeCalculator.h
+++ b/arm_compute/core/utils/misc/ShapeCalculator.h
@@ -887,23 +887,20 @@ inline TensorShape compute_mm_shape(const ITensorInfo &input0, const ITensorInfo
 {
     ARM_COMPUTE_ERROR_ON_MSG(input0.num_dimensions() > 4, "The number of dimensions for the matrix A must be <= 4");
 
+    const bool reinterpret_input_as_3d  = gemm_info.reinterpret_input_as_3d();
     const bool reinterpret_output_as_3d = gemm_info.depth_output_gemm3d() != 0;
     const int  depth_output_gemm3d      = reinterpret_output_as_3d ? gemm_info.depth_output_gemm3d() : 1;
 
     // If the output of GEMM has to be reinterpreted as 3D, the number of input0 rows (M) is obtained collapsing the second and third
     // dimension of the output tensor
-    const int dim0 = gemm_info.n();
-    const int dim1 = gemm_info.m() / depth_output_gemm3d;
-    const int dim2 = input0.tensor_shape()[2];
-    const int dim3 = input0.tensor_shape()[3];
+    const int batch_size = reinterpret_input_as_3d ? input0.tensor_shape()[3] : input0.tensor_shape()[2];
 
     TensorShape output_shape{ input0.tensor_shape() };
 
-    output_shape.set(0, dim0);
-    output_shape.set(1, dim1);
-    output_shape.set(2, reinterpret_output_as_3d ? depth_output_gemm3d : dim2);
-    output_shape.set(3, reinterpret_output_as_3d ? dim2 : dim3);
-    output_shape.set(4, reinterpret_output_as_3d ? dim3 : 1);
+    output_shape.set(0, gemm_info.n());
+    output_shape.set(1, gemm_info.m() / depth_output_gemm3d);
+    output_shape.set(2, reinterpret_output_as_3d ? depth_output_gemm3d : batch_size);
+    output_shape.set(3, reinterpret_output_as_3d ? batch_size : 1);
 
     return output_shape;
 }
diff --git a/arm_compute/runtime/CL/functions/CLGEMM.h b/arm_compute/runtime/CL/functions/CLGEMM.h
index 0bad446551..8c462fa4cb 100644
--- a/arm_compute/runtime/CL/functions/CLGEMM.h
+++ b/arm_compute/runtime/CL/functions/CLGEMM.h
@@ -27,6 +27,7 @@
 #include "arm_compute/core/CL/kernels/CLGEMMMatrixAdditionKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h"
 #include "arm_compute/runtime/CL/CLMemoryGroup.h"
@@ -40,10 +41,11 @@ class ICLTensor;
 
 /** Basic function to execute GEMM on OpenCL. This function calls the following OpenCL kernels:
  *
- *  -# @ref CLGEMMReshapeLHSMatrixKernel (only if the reshaped GEMM is selected by the heuristic model)
- *  -# @ref CLGEMMReshapeRHSMatrixKernel (only if the reshaped GEMM is selected by the heuristic model)
- *  -# @ref CLGEMMMatrixMultiplyKernel (if GPU target is NOT G76 or if the reshaped GEMM is NOT selected)
- *  -# @ref CLGEMMMatrixMultiplyReshapedKernel (only if the reshaped GEMM is selected by the heuristic model and the GPU target IS Mali-G76)
+ *  -# @ref CLGEMMReshapeLHSMatrixKernel (only if the RESHAPED_V1 is selected by the heuristic model)
+ *  -# @ref CLGEMMReshapeRHSMatrixKernel (only if either the RESHAPED_V1 or RESHAPED_ONLY_RHS is selected by the select_gemm_type() method)
+ *  -# @ref CLGEMMMatrixMultiplyKernel (only if either the NATIVE or RESHAPED_V1 is selected by the select_gemm_type() method)
+ *  -# @ref CLGEMMMatrixMultiplyReshapedKernel (only if RESHAPED_V1 is selected by the select_gemm_type() method)
+ *  -# @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel (only if RESHAPED_ONLY_RHS is selected by the select_gemm_type() method)
  *  -# @ref CLGEMMMatrixAdditionKernel (if c != nullptr and beta != 0.0)
  *
  */
@@ -102,20 +104,41 @@ public:
     void prepare() override;
 
 private:
-    CLMemoryGroup                      _memory_group;
-    CLGEMMMatrixMultiplyKernel         _mm_kernel;
-    CLGEMMMatrixAdditionKernel         _ma_kernel;
-    CLGEMMReshapeLHSMatrixKernel       _reshape_lhs_kernel;
-    CLGEMMReshapeRHSMatrixKernel       _reshape_rhs_kernel;
-    CLGEMMMatrixMultiplyReshapedKernel _mm_reshaped_kernel;
-    CLTensor                           _tmp_a;
-    CLTensor                           _tmp_b;
-    const ICLTensor                   *_original_b;
-    bool                               _is_interleaved_transposed;
-    bool                               _run_addition;
-    bool                               _reshape_b_only_on_first_run;
-    bool                               _is_prepared;
-    bool                               _is_new_gemm_reshaped; // Remove when COMPMID-1892 is completed
+    enum class GEMMType
+    {
+        NATIVE,
+        RESHAPED_V1,
+        RESHAPED_V2,
+        RESHAPED_ONLY_RHS
+    };
+
+    // TODO (COMPMID-2095)
+    static GEMMType select_gemm_type(unsigned int m, unsigned int n, unsigned int k, DataType data_type, bool reshape_b_only_on_first_run, GPUTarget gpu_target);
+
+    void configure_native(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info);
+    void configure_reshaped_v1(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info);
+    void configure_reshaped_v2(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info);
+    void configure_reshaped_only_rhs(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info);
+
+    static Status validate_native(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
+    static Status validate_reshaped_v1(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
+    static Status validate_reshaped_v2(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
+    static Status validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
+
+    CLMemoryGroup                             _memory_group;
+    CLGEMMMatrixMultiplyKernel                _mm_kernel;
+    CLGEMMMatrixAdditionKernel                _ma_kernel;
+    CLGEMMReshapeLHSMatrixKernel              _reshape_lhs_kernel;
+    CLGEMMReshapeRHSMatrixKernel              _reshape_rhs_kernel;
+    CLGEMMMatrixMultiplyReshapedKernel        _mm_reshaped_kernel;
+    CLGEMMMatrixMultiplyReshapedOnlyRHSKernel _mm_reshaped_only_rhs_kernel;
+    CLTensor                                  _tmp_a;
+    CLTensor                                  _tmp_b;
+    const ICLTensor                          *_original_b;
+    bool                                      _run_addition;
+    bool                                      _reshape_b_only_on_first_run;
+    bool                                      _is_prepared;
+    GEMMType                                  _gemm_type;
 };
 } // namespace arm_compute
 
diff --git a/src/core/CL/gemm/CLGEMMHelpers.cpp b/src/core/CL/gemm/CLGEMMHelpers.cpp
new file mode 100644
index 0000000000..4597d79d43
--- /dev/null
+++ b/src/core/CL/gemm/CLGEMMHelpers.cpp
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+namespace cl_gemm
+{
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_lhs_rhs_info(unsigned int m, unsigned int n, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0,
+                                                                       bool lhs_interleave, bool rhs_interleave, bool lhs_transpose, bool rhs_transpose)
+{
+    GEMMLHSMatrixInfo lhs_info;
+    GEMMRHSMatrixInfo rhs_info;
+
+    // Configure GEMMLHSMatrixInfo
+    lhs_info.m0         = m0;
+    lhs_info.k0         = k0;
+    lhs_info.v0         = ((m / (lhs_info.m0 * v0)) == 0) ? 1 : v0;
+    lhs_info.interleave = lhs_interleave;
+    lhs_info.transpose  = lhs_transpose;
+
+    // Configure GEMMRHSMatrixInfo
+    rhs_info.n0         = n0;
+    rhs_info.k0         = lhs_info.k0;
+    rhs_info.h0         = ((n / (rhs_info.n0 * h0)) == 0) ? 1 : h0;
+    rhs_info.interleave = rhs_interleave;
+    rhs_info.transpose  = rhs_transpose;
+
+    return std::make_pair(lhs_info, rhs_info);
+}
+} // namespace cl_gemm
+} // namespace arm_compute
+\ No newline at end of file
diff --git a/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.cpp b/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.cpp
new file mode 100644
index 0000000000..b791c1cda5
--- /dev/null
+++ b/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.cpp
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h"
+#include "arm_compute/core/GPUTarget.h"
+
+#include <map>
+#include <utility>
+
+namespace arm_compute
+{
+namespace cl_gemm
+{
+CLGEMMReshapedKernelConfigurationBifrost::CLGEMMReshapedKernelConfigurationBifrost(GPUTarget arch)
+    : ICLGEMMKernelConfiguration(arch) // The GPU target is handed to the base class; nothing else is initialised here
+{
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedKernelConfigurationBifrost::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
+{
+    // Forwards (m, n, k, b) to the heuristic registered for _target (G76, or the G7x table otherwise) and data_type
+    ARM_COMPUTE_ERROR_ON(data_type != DataType::F32 && data_type != DataType::QASYMM8);
+    using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (CLGEMMReshapedKernelConfigurationBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+
+    // Configurations for Mali-G76
+    static const std::map<DataType, ConfigurationFunctionExecutorPtr> gemm_configs_G76 =
+    {
+        { DataType::F32, &CLGEMMReshapedKernelConfigurationBifrost::configure_G76_f32 },
+        { DataType::QASYMM8, &CLGEMMReshapedKernelConfigurationBifrost::configure_G76_u8 }
+    };
+
+    // Configurations for Mali-G7x
+    static const std::map<DataType, ConfigurationFunctionExecutorPtr> gemm_configs_G7x =
+    {
+        { DataType::F32, &CLGEMMReshapedKernelConfigurationBifrost::configure_G7x_f32 },
+        { DataType::QASYMM8, &CLGEMMReshapedKernelConfigurationBifrost::configure_G7x_u8 }
+    };
+
+    // at() rather than operator[]: an unlisted data type must not insert a null entry that is then called
+    switch(_target)
+    {
+        case GPUTarget::G76:
+            return (this->*gemm_configs_G76.at(data_type))(m, n, k, b);
+        default:
+            return (this->*gemm_configs_G7x.at(data_type))(m, n, k, b);
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedKernelConfigurationBifrost::configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    // F32 heuristic for Mali-G7x: the branch is picked from n alone; k and b are not consulted
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    // n > 4 uses (m0, n0, k0, v0, h0) = (5, 4, 4, 2, 16)
+    if(n > 4)
+    {
+        return configure_lhs_rhs_info(m, n, 5, 4, 4, 2, 16, false, true, false, true);
+    }
+
+    return configure_lhs_rhs_info(m, n, 4, 2, 8, 16, 16, true, false, false, true);
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedKernelConfigurationBifrost::configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    // 8-bit heuristic for Mali-G7x; k and b are not consulted
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    // Two inputs drive the choice: the result of dot8_supported() for the device held by
+    // CLKernelLibrary, and whether n is at most 4
+    const bool has_dot8 = dot8_supported(CLKernelLibrary::get().get_device());
+    const bool narrow_n = (n <= 4);
+
+    if(has_dot8 && narrow_n)
+    {
+        return configure_lhs_rhs_info(m, n, 4, 2, 16, 2, 2, true, false, false, true);
+    }
+    if(has_dot8)
+    {
+        return configure_lhs_rhs_info(m, n, 4, 4, 16, 2, 2, true, false, false, true);
+    }
+    // From here on dot8_supported() returned false
+    if(narrow_n)
+    {
+        return configure_lhs_rhs_info(m, n, 4, 2, 8, 2, 2, true, false, false, true);
+    }
+
+    // Remaining case: no dot8 and n > 4
+    return configure_lhs_rhs_info(m, n, 6, 4, 4, 2, 2, true, true, false, true);
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedKernelConfigurationBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    // F32 heuristic for Mali-G76. Only n selects the branch; k and b exist to match the
+    // member-function signature used by configure() and are ignored.
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    const bool narrow_n = (n <= 4);
+
+    // Arguments after (m, n) are, in order: m0, n0, k0, v0, h0,
+    // lhs_interleave, rhs_interleave, lhs_transpose, rhs_transpose
+    return narrow_n ? configure_lhs_rhs_info(m, n, 4, 2, 8, 16, 16, true, false, false, true)
+                    : configure_lhs_rhs_info(m, n, 4, 4, 2, 8, 16, false, false, false, true);
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedKernelConfigurationBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    // 8-bit heuristic for Mali-G76: the branch is picked from n alone; k and b are not consulted
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    if(n <= 4)
+    {
+        return configure_lhs_rhs_info(m, n, 4, 2, 16, 4, 1, false, false, false, true);
+    }
+
+    // n > 4
+    return configure_lhs_rhs_info(m, n, 4, 4, 16, 2, 2, false, true, false, true);
+}
+} // namespace cl_gemm
+} // namespace arm_compute
+\ No newline at end of file
diff --git a/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.cpp b/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.cpp
new file mode 100644
index 0000000000..f696f0b253
--- /dev/null
+++ b/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.cpp
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h"
+#include "arm_compute/core/GPUTarget.h"
+
+#include <map>
+#include <utility>
+
+namespace arm_compute
+{
+namespace cl_gemm
+{
+CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::CLGEMMReshapedOnlyRHSKernelConfigurationBifrost(GPUTarget arch)
+    : ICLGEMMKernelConfiguration(arch) // The GPU target is handed to the base class; nothing else is initialised here
+{
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
+{
+    // Forwards (m, n, k, b) to the heuristic registered for _target (G76, or the G7x table otherwise) and data_type
+    ARM_COMPUTE_ERROR_ON(data_type != DataType::F32 && data_type != DataType::QASYMM8);
+    using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::*)(unsigned int m, unsigned int n, unsigned int k,
+                                             unsigned int b);
+
+    // Configurations for Mali-G76
+    static const std::map<DataType, ConfigurationFunctionExecutorPtr> gemm_configs_G76 =
+    {
+        { DataType::F32, &CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G76_f32 },
+        { DataType::QASYMM8, &CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G76_u8 }
+    };
+
+    // Configurations for Mali-G7x
+    static const std::map<DataType, ConfigurationFunctionExecutorPtr> gemm_configs_G7x =
+    {
+        { DataType::F32, &CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G7x_f32 },
+        { DataType::QASYMM8, &CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G7x_u8 }
+    };
+
+    // at() rather than operator[]: an unlisted data type must not insert a null entry that is then called
+    switch(_target)
+    {
+        case GPUTarget::G76:
+            return (this->*gemm_configs_G76.at(data_type))(m, n, k, b);
+        default:
+            return (this->*gemm_configs_G7x.at(data_type))(m, n, k, b);
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    // F32 heuristic for Mali-G7x; k and b are not consulted
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    // m != 1: fixed parameters
+    if(m != 1)
+    {
+        return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 4, false, true, false, true);
+    }
+
+    // m == 1: h0 is max(n / n0, 1), where n0 is 4 when n > 2048 and 2 otherwise;
+    // k0 changes with it (4 and 8 respectively)
+    if(n > 2048)
+    {
+        const unsigned int h0_large = std::max(n / 4, 1U);
+        return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, h0_large, false, true, false, true);
+    }
+
+    const unsigned int h0_small = std::max(n / 2, 1U);
+    return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, h0_small, false, true, false, true);
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    // F32 heuristic for Mali-G76; k and b are not consulted
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    if(m != 1)
+    {
+        return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, true, false, true);
+    }
+
+    // m == 1: h0 is half of n, but never less than 1
+    const unsigned int half_n = std::max(n / 2, 1U);
+    return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, half_n, false, true, false, true);
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    // 8-bit heuristic for Mali-G7x. The decision uses m, n and the result of dot8_supported()
+    // for the device held by CLKernelLibrary; k and b are not consulted.
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    // Both candidate h0 values depend on n only, so they are computed once up front
+    const unsigned int h0_half    = std::max(n / 2, 1U);
+    const unsigned int h0_quarter = std::max(n / 4, 1U);
+    const bool         is_vector  = (m == 1);
+
+    if(dot8_supported(CLKernelLibrary::get().get_device()))
+    {
+        // With dot8 the choice depends on m only
+        if(is_vector)
+        {
+            return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0_half, false, true, false, true);
+        }
+
+        return configure_lhs_rhs_info(m, n, 4, 4, 16, 1, h0_quarter, false, true, false, true);
+    }
+
+    // From here on dot8_supported() returned false
+    if(!is_vector)
+    {
+        // NOTE(review): n0 is 1 here although h0 is still max(n / 4, 1), as in the branches
+        // that use n0 = 4 - confirm that this value is intended
+        return configure_lhs_rhs_info(m, n, 4, 1, 16, 1, h0_quarter, false, true, false, true);
+    }
+
+    // m == 1 without dot8: the parameters switch on n
+    if(n > 2048)
+    {
+        return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, h0_quarter, false, true, false, true);
+    }
+
+    // m == 1, no dot8, n <= 2048
+    return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0_half, false, true, false, true);
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    // 8-bit heuristic for Mali-G76. Only m selects the branch; k and b exist to match the
+    // member-function signature used by configure() and are ignored.
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    // Half of n, never less than 1. It is only consumed by the m == 1 configuration;
+    // the other configuration uses a fixed h0 of 2.
+    const unsigned int h0_vector = std::max(n / 2, 1U);
+
+    return (m == 1) ? configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0_vector, false, true, false, true)
+                    : configure_lhs_rhs_info(m, n, 4, 4, 16, 1, 2, false, true, false, true);
+}
+} // namespace cl_gemm
+} // namespace cl_gemm
+} // namespace arm_compute
+\ No newline at end of file
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp
index af06fecd00..24372657f5 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp
@@ -68,20 +68,23 @@ Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1,
     const int n = gemm_info.n();
     const int k = gemm_info.k();
 
-    TensorShape tensor_shape0{ input0->tensor_shape() };
-    tensor_shape0.set(0, k);
-    tensor_shape0.set(1, m);
-
     TensorShape tensor_shape1{ input1->tensor_shape() };
     tensor_shape1.set(0, n);
     tensor_shape1.set(1, k);
 
-    const TensorInfo tensor_info0 = input0->clone()->set_tensor_shape(tensor_shape0);
     const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1);
 
     const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input0, &tensor_info0);
+    ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != static_cast<unsigned int>(k));
+    if(gemm_info.reinterpret_input_as_3d())
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) * input0->dimension(2) != static_cast<unsigned int>(m));
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != static_cast<unsigned int>(m));
+    }
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);
 
     if(output->total_size() != 0)
@@ -99,6 +102,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITe
 {
     unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
     unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
+    bool          reinterpret_input_as_3d             = gemm_info.reinterpret_input_as_3d();
     bool          reinterpret_output_as_3d            = (gemm_info.depth_output_gemm3d() != 0);
 
     Window win{};
@@ -107,6 +111,10 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITe
 
     // In case both input and output have to be reinterpreted as 3D tensors,
     // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
+    if(reinterpret_input_as_3d == reinterpret_output_as_3d)
+    {
+        reinterpret_output_as_3d = false;
+    }
 
     // Output tensor auto initialization if not yet initialized
     auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, gemm_info)));
@@ -147,7 +155,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITe
     window_changed = update_window_and_padding(win, input0_access, input1_access) || // window used by the execute_window_loop
                      update_window_and_padding(win_out, output_access);              // window used to update the padding requirements of output tensor
 
-    output_access.set_valid_region(win_out, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
+    output_access.set_valid_region(win_out, ValidRegion(Coordinates(), output->tensor_shape()));
 
     // Collapse along the Z direction
     // This collapse needs to be here in order to tune the Z dimension of LWS
@@ -181,6 +189,11 @@ void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::configure(const ICLTensor *input
 
     // In case both input and output have to be reinterpreted as 3D tensors,
     // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
+    if(_reinterpret_input_as_3d == _reinterpret_output_as_3d)
+    {
+        _reinterpret_input_as_3d  = false;
+        _reinterpret_output_as_3d = false;
+    }
 
     // Check if we need to slide the matrix B
     const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions();
@@ -204,7 +217,7 @@ void CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::configure(const ICLTensor *input
     build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
     build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
     build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
-    build_opts.add_option("-DM=" + support::cpp11::to_string(gemm_info.m()));
+    build_opts.add_option("-DM=" + support::cpp11::to_string(input0->info()->dimension(1)));
     build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n()));
     build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k()));
     build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index 2ac6f815a4..60bfbf24e5 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -23,7 +23,10 @@
  */
 #include "arm_compute/runtime/CL/functions/CLGEMM.h"
 
+#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h"
 #include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h"
+#include "arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/GPUTarget.h"
 #include "arm_compute/core/Helpers.h"
@@ -33,7 +36,6 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CL/gemm_reshaped/CLGEMMReshapedConfiguration.h"
 #include "arm_compute/runtime/ITensorAllocator.h"
 
 namespace arm_compute
@@ -41,104 +43,109 @@ namespace arm_compute
 using namespace arm_compute::misc::shape_calculator;
 using namespace arm_compute::cl_gemm;
 
-namespace
+CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), // the memory manager is moved into the memory group
+      _mm_kernel(),
+      _ma_kernel(),
+      _reshape_lhs_kernel(),
+      _reshape_rhs_kernel(),
+      _mm_reshaped_kernel(),
+      _mm_reshaped_only_rhs_kernel(),
+      _tmp_a(),
+      _tmp_b(),
+      _original_b(nullptr),
+      _run_addition(false),
+      _reshape_b_only_on_first_run(false),
+      _is_prepared(false),
+      _gemm_type(GEMMType::NATIVE) // NOTE(review): presumably replaced during configuration by the value from select_gemm_type() - confirm
 {
-inline bool is_interleaved_transposed(unsigned int m, unsigned int n, unsigned int k, DataType data_type, bool reshape_b_only_on_first_run, GPUTarget gpu_target)
+}
+
+CLGEMM::GEMMType CLGEMM::select_gemm_type(unsigned int m, unsigned int n, unsigned int k, DataType data_type, bool reshape_b_only_on_first_run, GPUTarget gpu_target)
 {
-    bool flag = true;
+    GEMMType gemm_type = GEMMType::RESHAPED_V1; // initial value only: every branch below assigns gemm_type again
 
     if(gpu_target_is_in(gpu_target, GPUTarget::G52, GPUTarget::G52LIT, GPUTarget::G71, GPUTarget::G72, GPUTarget::G76))
     {
-        if((m > 1) && n < 16)
+        if((m > 1) && (n < 16))
         {
-            flag = true;
+            gemm_type = GEMMType::RESHAPED_V1;
+        }
+        else if((m == 1) && (data_type == DataType::F32))
+        {
+            gemm_type = GEMMType::RESHAPED_ONLY_RHS;
         }
         else
         {
             // COMPMID-852
-            if(k > 256 && m > 4 && is_data_type_float(data_type) && reshape_b_only_on_first_run)
+            if((k > 256) && (m > 4) && is_data_type_float(data_type) && reshape_b_only_on_first_run)
             {
                 constexpr float alpha = 3.2f;
                 constexpr float fact0 = 1.51f;
                 constexpr float fact1 = 1.66f;
                 constexpr float ops   = 12.0f;
                 const float     scale = k > 1024 ? 1.07f : 1.0f;
-                flag                  = alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops);
+                gemm_type             = (alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops)) ? GEMMType::RESHAPED_V1 : GEMMType::NATIVE;
             }
             else
             {
-                flag = false;
+                gemm_type = GEMMType::NATIVE;
             }
         }
+        // An F32 RESHAPED_V1 choice is promoted to RESHAPED_V2 when (m * n) / 20 exceeds 1600
+        const auto workload = static_cast<float>((m * n) / 20.0f);
+
+        gemm_type = ((workload > 1600.0f) && (gemm_type == GEMMType::RESHAPED_V1) && (data_type == DataType::F32)) ? GEMMType::RESHAPED_V2 : gemm_type;
     }
     else
     {
         // We reshape the matrices only if we do not have the vector-by-matrix case and we reshape the matrix B only once
-        flag = m != 1 && reshape_b_only_on_first_run;
+        gemm_type = ((m != 1) && reshape_b_only_on_first_run) ? GEMMType::RESHAPED_V1 : GEMMType::NATIVE;
     }
 
-    return flag;
+    return gemm_type;
 }
-} // namespace
 
-CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)),
-      _mm_kernel(),
-      _ma_kernel(),
-      _reshape_lhs_kernel(),
-      _reshape_rhs_kernel(),
-      _mm_reshaped_kernel(),
-      _tmp_a(),
-      _tmp_b(),
-      _original_b(nullptr),
-      _is_interleaved_transposed(false),
-      _run_addition(false),
-      _reshape_b_only_on_first_run(false),
-      _is_prepared(false),
-      _is_new_gemm_reshaped(false)
-{
-}
-
-void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
+void CLGEMM::configure_native(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
+    const unsigned int m          = gemm_info.reinterpret_input_as_3d() ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1); // dimensions 1 and 2 of a are folded into m when the input is reinterpreted as 3D
+    const unsigned int n          = b->info()->dimension(0);
+    const unsigned int k          = a->info()->dimension(0);
+    const GPUTarget    gpu_target = CLScheduler::get().target();
 
-    // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), alpha, beta, gemm_info));
-
-    // Check if we need to reshape the matrix B only on the first run
-    _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
-    _is_prepared                 = gemm_info.retain_internal_weights();
-    _original_b                  = b;
+    // Native path: _mm_kernel is the only kernel configured here, so it is the only one whose target is set
+    _mm_kernel.set_target(gpu_target);
 
-    const ICLTensor *matrix_a = a;
-    const ICLTensor *matrix_b = b;
+    GEMMReshapeInfo reshape_info(m, n, k, 1, 1, gemm_info.depth_output_gemm3d(), gemm_info.reinterpret_input_as_3d());
 
-    // Get the GPU target
-    const GPUTarget gpu_target = CLScheduler::get().target();
+    // Configure the matrix multiply kernel on a and b as given (no intermediate reshaped tensors on this path)
+    _mm_kernel.configure(a, b, c, output, alpha, beta, false, reshape_info, gemm_info.fp_mixed_precision());
 
-    // Set the target for the kernels
-    _reshape_lhs_kernel.set_target(gpu_target);
-    _mm_kernel.set_target(gpu_target);
+    // Tune kernel statically through the scheduler
+    CLScheduler::get().tune_kernel_static(_mm_kernel);
+}
 
-    // Arguments used by GEMMReshapeInfo
-    // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo
-    // in order to know how the matrices have been reshaped
-    DataType           data_type                 = a->info()->data_type();
+void CLGEMM::configure_reshaped_v1(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
     bool               reinterpret_input_as_3d   = gemm_info.reinterpret_input_as_3d();
     const unsigned int m                         = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
     const unsigned int n                         = b->info()->dimension(0);
     const unsigned int k                         = a->info()->dimension(0);
-    const unsigned int batch_size                = reinterpret_input_as_3d ? a->info()->dimension(3) : a->info()->dimension(2);
     const int          depth_output_gemm3d       = gemm_info.depth_output_gemm3d();
+    const GPUTarget    gpu_target                = CLScheduler::get().target();
     int                mult_transpose1xW_width   = 1;
     int                mult_interleave4x4_height = 1;
 
+    // Set the target for the kernels
+    _reshape_lhs_kernel.set_target(gpu_target);
+    _mm_kernel.set_target(gpu_target);
+
     if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST)
     {
         mult_transpose1xW_width   = 4;
         mult_interleave4x4_height = 2;
     }
+
     GEMMRHSMatrixInfo rhs_info;
     rhs_info.n0         = 16 / b->info()->element_size();
     rhs_info.k0         = 1;
@@ -153,112 +160,183 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *
     lhs_info.interleave = true;
     lhs_info.transpose  = true;
 
-    // Check if we need to reshape the matrix A and matrix B
-    _is_interleaved_transposed = is_interleaved_transposed(m, n, k, a->info()->data_type(), _reshape_b_only_on_first_run, gpu_target);
+    GEMMReshapeInfo reshape_info(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, false);
 
-    // Check if we can run the new reshaped GEMM
-    const auto workload   = static_cast<float>((m * n) / 20.0f);
-    _is_new_gemm_reshaped = (workload > 1600.0f) && (get_arch_from_target(gpu_target) == GPUTarget::BIFROST) && _is_interleaved_transposed && (data_type == DataType::F32);
+    _memory_group.manage(&_tmp_a);
+    if(!_reshape_b_only_on_first_run)
+    {
+        _memory_group.manage(&_tmp_b);
+    }
 
-    const bool add_matrix_c  = (beta != 0.f && c != nullptr);
-    const bool is_beta_one   = std::abs(1.0f - beta) < 0.00001f;
-    const bool use_fused_add = is_beta_one && (c != nullptr && c->info()->num_dimensions() == 1) && !_is_new_gemm_reshaped;
+    // Configure interleave kernel
+    _reshape_lhs_kernel.configure(a, &_tmp_a, lhs_info, reinterpret_input_as_3d);
 
-    // if _is_interleaved_transposed is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D
-    if(_is_interleaved_transposed)
-    {
-        reinterpret_input_as_3d = false;
+    // Configure transpose kernel
+    _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
 
-        matrix_a = &_tmp_a;
-        matrix_b = &_tmp_b;
+    // Configure and tune matrix multiply kernel
+    _mm_kernel.configure(&_tmp_a, &_tmp_b, c, output, alpha, beta, true, reshape_info, gemm_info.fp_mixed_precision());
 
-        // Manage intermediate buffers
-        _memory_group.manage(&_tmp_a);
-        if(!_reshape_b_only_on_first_run)
-        {
-            _memory_group.manage(&_tmp_b);
-        }
-        // _tmp_a and _tmp_b will be auto configured in _interleave_kernel and in _transpose_kernel
+    CLScheduler::get().tune_kernel_static(_mm_kernel);
 
-        if(_is_new_gemm_reshaped)
-        {
-            GEMMLHSMatrixInfo lhs_info;
+    // Allocate intermediate tensors
+    _tmp_a.allocator()->allocate();
+    if(!_reshape_b_only_on_first_run)
+    {
+        _tmp_b.allocator()->allocate();
+    }
+}
 
-            // Pick up the GEMM configuration
-            std::tie(lhs_info, rhs_info) = CLGEMMReshapedConfigurationFactory::create()->configure(m, n, k, batch_size, data_type);
+void CLGEMM::configure_reshaped_v2(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
+    ARM_COMPUTE_ERROR_ON(c != nullptr);
+    ARM_COMPUTE_UNUSED(beta);
+    ARM_COMPUTE_UNUSED(c);
+    // RESHAPED_V2 path: a and b are reshaped into _tmp_a and _tmp_b, which are then multiplied by _mm_reshaped_kernel
+    DataType           data_type               = a->info()->data_type();
+    bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+    const unsigned int m                       = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
+    const unsigned int n                       = b->info()->dimension(0);
+    const unsigned int k                       = a->info()->dimension(0);
+    const unsigned int batch_size              = reinterpret_input_as_3d ? a->info()->dimension(3) : a->info()->dimension(2);
+    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();
+    const GPUTarget    gpu_target              = CLScheduler::get().target();
 
-            _reshape_lhs_kernel.configure(a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
-            _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
+    // Set the target for the kernels configured on this path (the multiply kernel here is _mm_reshaped_kernel, not _mm_kernel)
+    _reshape_lhs_kernel.set_target(gpu_target);
+    _mm_reshaped_kernel.set_target(gpu_target);
 
-            // Configure and tune matrix multiply kernel
-            _mm_reshaped_kernel.configure(matrix_a, matrix_b, output, alpha, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1,
-                                                                                                                 depth_output_gemm3d, reinterpret_input_as_3d));
-        }
-        else
-        {
-            // Configure interleave kernel
-            _reshape_lhs_kernel.configure(a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
-            // Configure transpose kernel
-            _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
-        }
+    GEMMReshapeInfo reshape_info(m, n, k, 1, 1, depth_output_gemm3d, false); // the multiply kernel reads the LHS through the reshaped _tmp_a, so the input is not reinterpreted as 3D (same as configure_reshaped_v1)
+
+    // Manage intermediate buffers
+    _memory_group.manage(&_tmp_a);
+    if(!_reshape_b_only_on_first_run)
+    {
+        _memory_group.manage(&_tmp_b);
     }
+    // _tmp_a and _tmp_b will be auto configured in _reshape_lhs_kernel and in _reshape_rhs_kernel
+
+    GEMMLHSMatrixInfo lhs_info{};
+    GEMMRHSMatrixInfo rhs_info{};
+
+    // Pick up the GEMM configuration
+    std::unique_ptr<ICLGEMMKernelConfiguration> gemm_config = CLGEMMReshapedKernelConfigurationFactory::create(gpu_target);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get());
+
+    // Configure lhs_info and rhs_info
+    std::tie(lhs_info, rhs_info) = gemm_config->configure(m, n, k, batch_size, data_type);
+
+    _reshape_lhs_kernel.configure(a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
+    _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
+
+    // Configure and tune matrix multiply kernel
+    _mm_reshaped_kernel.configure(&_tmp_a, &_tmp_b, output, alpha, lhs_info, rhs_info, reshape_info);
 
-    if(!_is_new_gemm_reshaped)
+    // Allocate intermediate tensors
+    _tmp_a.allocator()->allocate();
+    if(!_reshape_b_only_on_first_run)
     {
-        // Configure and tune matrix multiply kernel
-        _mm_kernel.configure(matrix_a, matrix_b, (add_matrix_c && !use_fused_add) ? nullptr : c, output, alpha, beta, _is_interleaved_transposed,
-                             GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, reinterpret_input_as_3d),
-                             gemm_info.fp_mixed_precision());
-        CLScheduler::get().tune_kernel_static(_mm_kernel);
+        _tmp_b.allocator()->allocate();
     }
+}
+
+void CLGEMM::configure_reshaped_only_rhs(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
+    ARM_COMPUTE_ERROR_ON(c != nullptr);
+    ARM_COMPUTE_UNUSED(beta);
+    ARM_COMPUTE_UNUSED(c);
+
+    DataType           data_type               = a->info()->data_type();
+    bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+    const unsigned int m                       = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
+    const unsigned int n                       = b->info()->dimension(0);
+    const unsigned int k                       = a->info()->dimension(0);
+    const unsigned int batch_size              = reinterpret_input_as_3d ? a->info()->dimension(3) : a->info()->dimension(2);
+    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();
+    const GPUTarget    gpu_target              = CLScheduler::get().target();
+
+    // Set the target for the kernels
+    _mm_kernel.set_target(gpu_target);
+
+    GEMMReshapeInfo reshape_info(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
 
-    if(_is_interleaved_transposed)
+    // Manage intermediate buffers
+    if(!_reshape_b_only_on_first_run)
     {
-        // Allocate intermediate tensors
-        _tmp_a.allocator()->allocate();
-        if(!_reshape_b_only_on_first_run)
-        {
-            _tmp_b.allocator()->allocate();
-        }
+        _memory_group.manage(&_tmp_b);
     }
 
-    // Configure matrix addition kernel
-    if(add_matrix_c && !use_fused_add)
+    GEMMLHSMatrixInfo lhs_info{};
+    GEMMRHSMatrixInfo rhs_info{};
+
+    // Pick up the GEMM configuration
+    std::unique_ptr<ICLGEMMKernelConfiguration> gemm_config = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get());
+
+    // Configure lhs_info and rhs_info
+    std::tie(lhs_info, rhs_info) = gemm_config->configure(m, n, k, batch_size, data_type);
+
+    _reshape_rhs_kernel.configure(b, &_tmp_b, rhs_info);
+
+    // Configure and tune matrix multiply kernel
+    _mm_reshaped_only_rhs_kernel.configure(a, &_tmp_b, output, alpha, lhs_info, rhs_info, reshape_info);
+
+    if(!_reshape_b_only_on_first_run)
     {
-        _ma_kernel.configure(c, output, beta);
-        _run_addition = true;
+        _tmp_b.allocator()->allocate();
     }
 }
 
-Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+Status CLGEMM::validate_native(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
 {
     ARM_COMPUTE_UNUSED(alpha);
     ARM_COMPUTE_UNUSED(output);
 
-    // Check if we need to reshape the matrix B only on the first run
-    const bool reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
+    // Get the GPU target
+    const GPUTarget    gpu_target              = CLScheduler::get().target();
+    bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+    const unsigned int m                       = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+    const unsigned int n                       = b->dimension(0);
+    const unsigned int k                       = a->dimension(0);
+    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();
+    const bool         add_c                   = (beta != 0.f && c != nullptr);
+    const bool         is_beta_one             = std::abs(1.0f - beta) < 0.00001f;
+    const bool         fuse_add                = is_beta_one && (c != nullptr && c->num_dimensions() == 1);
+
+    const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
+
+    // Validate matrix multiply
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(a, b, (add_c && fuse_add) ? c : nullptr, output, alpha, beta,
+                                                                     false, reshape_info, gpu_target, gemm_info.fp_mixed_precision()));
+
+    if(add_c && !fuse_add)
+    {
+        // Validate matrix addition kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta));
+    }
 
-    const ITensorInfo *matrix_a_info = a;
-    const ITensorInfo *matrix_b_info = b;
+    return Status{};
+}
+
+Status CLGEMM::validate_reshaped_v1(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
+    ARM_COMPUTE_UNUSED(alpha);
+    ARM_COMPUTE_UNUSED(output);
 
     TensorInfo tmp_a_info{};
     TensorInfo tmp_b_info{};
 
     // Get the GPU target
-    const GPUTarget gpu_target = CLScheduler::get().target();
-
-    // Arguments used by GEMMReshapeInfo
-    // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo
-    // in order to know how the matrices have been reshaped
-    DataType           data_type                 = a->data_type();
-    bool               reinterpret_input_as_3d   = gemm_info.reinterpret_input_as_3d();
-    const unsigned int m                         = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+    const GPUTarget    gpu_target                = CLScheduler::get().target();
+    const unsigned int m                         = gemm_info.reinterpret_input_as_3d() ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
     const unsigned int n                         = b->dimension(0);
     const unsigned int k                         = a->dimension(0);
-    const unsigned int batch_size                = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
     int                mult_transpose1xW_width   = 1;
     int                mult_interleave4x4_height = 1;
     const int          depth_output_gemm3d       = gemm_info.depth_output_gemm3d();
+    const bool         add_c                     = (beta != 0.f && c != nullptr);
+    const bool         is_beta_one               = std::abs(1.0f - beta) < 0.00001f;
+    const bool         fuse_add                  = is_beta_one && (c != nullptr && c->num_dimensions() == 1);
 
     if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST)
     {
@@ -280,69 +358,224 @@ Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITenso
     lhs_info.interleave = true;
     lhs_info.transpose  = true;
 
-    // Check if we need to reshape the matrix A and matrix B
-    const bool run_interleave_transpose = is_interleaved_transposed(m, n, k, a->data_type(), reshape_b_only_on_first_run, gpu_target);
+    const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, false);
 
-    // Check if we can run the new reshaped GEMM
-    const auto workload             = static_cast<float>((m * n) / 20.0f);
-    const bool is_new_gemm_reshaped = (workload > 1600.f) && (get_arch_from_target(gpu_target) == GPUTarget::BIFROST) && run_interleave_transpose && (data_type == DataType::F32);
+    // Validate interleave kernel
+    auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
 
-    const bool add_matrix_c  = (beta != 0.f && c != nullptr);
-    const bool is_beta_one   = std::abs(1.0f - beta) < 0.00001f;
-    const bool use_fused_add = is_beta_one && (c != nullptr && c->num_dimensions() == 1) && !is_new_gemm_reshaped;
+    // Validate transpose kernel
+    auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
 
-    // if _is_interleaved_transposed is set, force reinterpret_input_as_3d to be false as the output of CLGEMMInterleaveKernel will be 2D
-    if(run_interleave_transpose)
+    // Validate matrix multiply
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(&tmp_a_info, &tmp_b_info, (add_c && fuse_add) ? c : nullptr, output, alpha, beta,
+                                                                     true, reshape_info, gpu_target, gemm_info.fp_mixed_precision()));
+
+    if(add_c && !fuse_add)
     {
-        reinterpret_input_as_3d = false;
+        // Validate matrix addition kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta));
     }
 
-    const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, reinterpret_input_as_3d);
+    return Status{};
+}
+
+Status CLGEMM::validate_reshaped_v2(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
+    ARM_COMPUTE_UNUSED(alpha);
+    ARM_COMPUTE_UNUSED(output);
+
+    TensorInfo tmp_a_info{};
+    TensorInfo tmp_b_info{};
+
+    // Get the GPU target
+    const GPUTarget    gpu_target              = CLScheduler::get().target();
+    DataType           data_type               = a->data_type();
+    bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+    const unsigned int m                       = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+    const unsigned int n                       = b->dimension(0);
+    const unsigned int k                       = a->dimension(0);
+    const unsigned int batch_size              = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();
+    const bool         add_c                   = (beta != 0.f && c != nullptr);
+
+    const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, false);
+
+    GEMMLHSMatrixInfo lhs_info;
+    GEMMRHSMatrixInfo rhs_info;
+
+    // Pick up the GEMM configuration
+    std::unique_ptr<ICLGEMMKernelConfiguration> gemm_config = CLGEMMReshapedKernelConfigurationFactory::create(gpu_target);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(gemm_config.get());
 
-    if(run_interleave_transpose)
+    // Configure lhs_info and rhs_info
+    std::tie(lhs_info, rhs_info) = gemm_config->configure(m, n, k, batch_size, data_type);
+
+    auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
+
+    auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
+
+    // Validate matrix multiply
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, output, alpha, lhs_info, rhs_info, reshape_info));
+
+    if(add_c)
     {
-        matrix_a_info = &tmp_a_info;
-        matrix_b_info = &tmp_b_info;
+        // Validate matrix addition kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta));
+    }
 
-        if(is_new_gemm_reshaped)
-        {
-            GEMMLHSMatrixInfo lhs_info;
+    return Status{};
+}
+
+Status CLGEMM::validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
+    ARM_COMPUTE_UNUSED(alpha);
+    ARM_COMPUTE_UNUSED(output);
+
+    TensorInfo tmp_b_info{};
+
+    // Get the GPU target
+    const GPUTarget    gpu_target              = CLScheduler::get().target();
+    const DataType     data_type               = a->data_type();
+    bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+    const unsigned int m                       = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+    const unsigned int n                       = b->dimension(0);
+    const unsigned int k                       = a->dimension(0);
+    const unsigned int batch_size              = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();
+    const bool         add_c                   = (beta != 0.f && c != nullptr);
+
+    const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
+
+    GEMMLHSMatrixInfo lhs_info;
+    GEMMRHSMatrixInfo rhs_info;
+
+    // Pick up the GEMM configuration
+    std::unique_ptr<ICLGEMMKernelConfiguration> gemm_config = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(gemm_config.get());
+
+    // Configure lhs_info and rhs_info
+    std::tie(lhs_info, rhs_info) = gemm_config->configure(m, n, k, batch_size, data_type);
+
+    auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
+
+    // Validate matrix multiply
+    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::validate(a, &tmp_b_info, output, alpha, lhs_info, rhs_info, reshape_info));
+
+    if(add_c)
+    {
+        // Validate matrix addition kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta));
+    }
+
+    return Status{};
+}
+
+void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
+
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), alpha, beta, gemm_info));
+
+    // Check if we need to reshape the matrix B only on the first run
+    _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
+    _is_prepared                 = gemm_info.retain_internal_weights();
+    _original_b                  = b;
 
-            // Pick up the GEMM configuration
-            std::tie(lhs_info, rhs_info) = CLGEMMReshapedConfigurationFactory::create()->configure(m, n, k, batch_size, data_type);
+    // Get the GPU target
+    const GPUTarget    gpu_target              = CLScheduler::get().target();
+    bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+    const unsigned int m                       = reinterpret_input_as_3d ? (a->info()->dimension(1) * a->info()->dimension(2)) : a->info()->dimension(1);
+    const unsigned int n                       = b->info()->dimension(0);
+    const unsigned int k                       = a->info()->dimension(0);
 
-            auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
-            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
+    // Select GEMMType
+    _gemm_type = select_gemm_type(m, n, k, a->info()->data_type(), _reshape_b_only_on_first_run, gpu_target);
 
-            auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
-            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
+    const bool is_gemm_v2  = (_gemm_type == GEMMType::RESHAPED_V2) || (_gemm_type == GEMMType::RESHAPED_ONLY_RHS);
+    const bool add_c       = (beta != 0.f && c != nullptr);
+    const bool is_beta_one = std::abs(1.0f - beta) < 0.00001f;
+    const bool fuse_add    = is_beta_one && (c != nullptr && c->info()->num_dimensions() == 1) && !is_gemm_v2;
 
-            // Validate matrix multiply
-            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyReshapedKernel::validate(matrix_a_info, matrix_b_info, output, alpha, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1,
-                                                                                     depth_output_gemm3d, reinterpret_input_as_3d)));
+    switch(_gemm_type)
+    {
+        case GEMMType::NATIVE:
+        {
+            configure_native(a, b, (add_c && fuse_add) ? c : nullptr, output, alpha, beta, gemm_info);
+            break;
         }
-        else
+        case GEMMType::RESHAPED_V1:
+        {
+            configure_reshaped_v1(a, b, (add_c && fuse_add) ? c : nullptr, output, alpha, beta, gemm_info);
+            break;
+        }
+        case GEMMType::RESHAPED_V2:
         {
-            // Validate interleave kernel
-            auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
-            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
-            // Validate transpose kernel
-            auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
-            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
+            configure_reshaped_v2(a, b, (add_c && fuse_add) ? c : nullptr, output, alpha, beta, gemm_info);
+            break;
+        }
+        case GEMMType::RESHAPED_ONLY_RHS:
+        {
+            configure_reshaped_only_rhs(a, b, (add_c && fuse_add) ? c : nullptr, output, alpha, beta, gemm_info);
+            break;
+        }
+        default:
+        {
+            ARM_COMPUTE_ERROR("GEMMType not supported");
         }
     }
 
-    if(!is_new_gemm_reshaped)
+    // Configure matrix addition kernel
+    if(add_c && !fuse_add)
     {
-        // Validate matrix multiply
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, (add_matrix_c && !use_fused_add) ? nullptr : c, output, alpha, beta,
-                                                                         run_interleave_transpose, reshape_info, gpu_target, gemm_info.fp_mixed_precision()));
+        _ma_kernel.configure(c, output, beta);
+        _run_addition = true;
     }
+}
 
-    if(add_matrix_c && !use_fused_add)
+Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+{
+    // Get the GPU target
+    const GPUTarget    gpu_target              = CLScheduler::get().target();
+    bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+    const unsigned int m                       = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+    const unsigned int n                       = b->dimension(0);
+    const unsigned int k                       = a->dimension(0);
+
+    // Select GEMMType
+    GEMMType gemm_type = select_gemm_type(m, n, k, a->data_type(), gemm_info.reshape_b_only_on_first_run(), gpu_target);
+
+    switch(gemm_type)
     {
-        // Validate matrix addition kernel
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixAdditionKernel::validate(c, output, beta));
+        case GEMMType::NATIVE:
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(validate_native(a, b, c, output, alpha, beta, gemm_info));
+            break;
+        }
+        case GEMMType::RESHAPED_V1:
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_v1(a, b, c, output, alpha, beta, gemm_info));
+            break;
+        }
+        case GEMMType::RESHAPED_V2:
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_v2(a, b, c, output, alpha, beta, gemm_info));
+            break;
+        }
+        case GEMMType::RESHAPED_ONLY_RHS:
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_only_rhs(a, b, c, output, alpha, beta, gemm_info));
+            break;
+        }
+        default:
+        {
+            ARM_COMPUTE_RETURN_ERROR_MSG("GEMMType not supported");
+        }
     }
 
     return Status{};
@@ -354,26 +587,57 @@ void CLGEMM::run()
 
     MemoryGroupResourceScope scope_mg(_memory_group);
 
-    if(_is_interleaved_transposed)
+    // Run matrix multiply kernel
+    switch(_gemm_type)
     {
-        // Run interleave kernel
-        CLScheduler::get().enqueue(_reshape_lhs_kernel, false);
+        case GEMMType::NATIVE:
+        {
+            CLScheduler::get().enqueue(_mm_kernel, !_run_addition);
+            break;
+        }
+        case GEMMType::RESHAPED_V1:
+        {
+            // Run interleave kernel
+            CLScheduler::get().enqueue(_reshape_lhs_kernel, false);
 
-        if(!_reshape_b_only_on_first_run)
+            if(!_reshape_b_only_on_first_run)
+            {
+                // Run transpose kernel
+                CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
+            }
+
+            CLScheduler::get().enqueue(_mm_kernel, !_run_addition);
+            break;
+        }
+        case GEMMType::RESHAPED_V2:
         {
-            // Run transpose kernel
-            CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
+            // Run interleave kernel
+            CLScheduler::get().enqueue(_reshape_lhs_kernel, false);
+
+            if(!_reshape_b_only_on_first_run)
+            {
+                // Run transpose kernel
+                CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
+            }
+
+            CLScheduler::get().enqueue(_mm_reshaped_kernel, !_run_addition);
+            break;
         }
-    }
+        case GEMMType::RESHAPED_ONLY_RHS:
+        {
+            if(!_reshape_b_only_on_first_run)
+            {
+                // Run transpose kernel
+                CLScheduler::get().enqueue(_reshape_rhs_kernel, false);
+            }
 
-    // Run matrix multiply kernel
-    if(_is_new_gemm_reshaped)
-    {
-        CLScheduler::get().enqueue(_mm_reshaped_kernel, !_run_addition);
-    }
-    else
-    {
-        CLScheduler::get().enqueue(_mm_kernel, !_run_addition);
+            CLScheduler::get().enqueue(_mm_reshaped_only_rhs_kernel, !_run_addition);
+            break;
+        }
+        default:
+        {
+            ARM_COMPUTE_ERROR("GEMMType not supported");
+        }
     }
 
     // Run matrix addition kernel
@@ -387,7 +651,7 @@ void CLGEMM::prepare()
 {
     if(!_is_prepared)
     {
-        if(_is_interleaved_transposed && _reshape_b_only_on_first_run)
+        if(_gemm_type != GEMMType::NATIVE && _reshape_b_only_on_first_run)
         {
             // Run transpose kernel and mark original weights tensor as unused
             _tmp_b.allocator()->allocate();
diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
index c0bd85dcb5..c447cb8778 100644
--- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -24,6 +24,7 @@
 #include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
 
 #include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
@@ -31,7 +32,6 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CL/gemm_reshaped/CLGEMMReshapedConfiguration.h"
 
 namespace arm_compute
 {
@@ -122,12 +122,12 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor
         }
 
         // Pick up the GEMM configuration
-        std::tie(lhs_info, rhs_info) = CLGEMMReshapedConfigurationFactory::create()->configure(m, n, k, batch_size, DataType::QASYMM8);
+        std::tie(lhs_info, rhs_info) = CLGEMMReshapedKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
 
-        // Configure interleave kernel
+        // Configure reshape LHS kernel
         _mtx_a_reshape_kernel.configure(a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
 
-        // Configure transpose kernel
+        // Configure reshape RHS kernel
         _mtx_b_reshape_kernel.configure(b, &_tmp_b, rhs_info);
     }
 
@@ -236,6 +236,9 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
     GEMMRHSMatrixInfo rhs_info;
     GEMMLHSMatrixInfo lhs_info;
 
+    // Get the GPU target
+    const GPUTarget gpu_target = CLScheduler::get().target();
+
     bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
     const unsigned int m                       = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
     const unsigned int n                       = b->dimension(0);
@@ -259,14 +262,13 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
         matrix_b_info = &tmp_b_info;
 
         // Pick up the GEMM configuration
-        std::tie(lhs_info, rhs_info) = CLGEMMReshapedConfigurationFactory::create()->configure(m, n, k, batch_size, DataType::QASYMM8);
+        std::tie(lhs_info, rhs_info) = CLGEMMReshapedKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
 
-        // Validate interleave kernel
+        // Validate reshape LHS kernel
         auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
         ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeLHSMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
 
-        // Validate transpose kernel
-
+        // Validate reshape RHS kernel
         auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
         ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMReshapeRHSMatrixKernel::validate(b, &tmp_b_info, rhs_info));
     }
diff --git a/src/runtime/CL/gemm_reshaped/CLGEMMReshapedConfigurationBifrost.cpp b/src/runtime/CL/gemm_reshaped/CLGEMMReshapedConfigurationBifrost.cpp
deleted file mode 100644
index cd97849712..0000000000
--- a/src/runtime/CL/gemm_reshaped/CLGEMMReshapedConfigurationBifrost.cpp
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/gemm_reshaped/CLGEMMReshapedConfigurationBifrost.h"
-
-#include "arm_compute/core/GPUTarget.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-
-#include <utility>
-
-namespace arm_compute
-{
-namespace cl_gemm
-{
-namespace
-{
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_gemm_reshaped(unsigned int m, unsigned int n, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0,
-                                                                        bool lhs_interleave, bool rhs_interleave)
-{
-    GEMMLHSMatrixInfo lhs_info;
-    GEMMRHSMatrixInfo rhs_info;
-
-    // Configure GEMMLHSMatrixInfo
-    lhs_info.m0         = m0;
-    lhs_info.k0         = k0;
-    lhs_info.v0         = ((m / (lhs_info.m0 * v0)) == 0) ? 1 : v0;
-    lhs_info.interleave = lhs_interleave;
-    lhs_info.transpose  = false;
-
-    // Configure GEMMRHSMatrixInfo
-    rhs_info.n0         = n0;
-    rhs_info.k0         = lhs_info.k0;
-    rhs_info.h0         = ((n / (rhs_info.n0 * h0)) == 0) ? 1 : h0;
-    rhs_info.interleave = rhs_interleave;
-    rhs_info.transpose  = true;
-
-    return std::make_pair(lhs_info, rhs_info);
-}
-
-} // namespace
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedConfigurationBifrost::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
-{
-    ARM_COMPUTE_ERROR_ON(data_type != DataType::F32 && data_type != DataType::QASYMM8);
-    ARM_COMPUTE_UNUSED(data_type);
-
-    const GPUTarget gpu_target = CLScheduler::get().target();
-
-    using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (CLGEMMReshapedConfigurationBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-
-    // Configurations for Mali-G76
-    static std::map<DataType, ConfigurationFunctionExecutorPtr> gemm_reshaped_configs_G76 =
-    {
-        { DataType::F32, &CLGEMMReshapedConfigurationBifrost::configure_G76_f32 },
-        { DataType::QASYMM8, &CLGEMMReshapedConfigurationBifrost::configure_G76_u8 }
-    };
-
-    // Configurations for Mali-G7x
-    static std::map<DataType, ConfigurationFunctionExecutorPtr> gemm_reshaped_configs_G7x =
-    {
-        { DataType::F32, &CLGEMMReshapedConfigurationBifrost::configure_G7x_f32 },
-        { DataType::QASYMM8, &CLGEMMReshapedConfigurationBifrost::configure_G7x_u8 }
-    };
-
-    switch(gpu_target)
-    {
-        case GPUTarget::G76:
-            return (this->*gemm_reshaped_configs_G76[data_type])(m, n, k, b);
-        default:
-            return (this->*gemm_reshaped_configs_G7x[data_type])(m, n, k, b);
-    }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedConfigurationBifrost::configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
-    ARM_COMPUTE_UNUSED(k);
-    ARM_COMPUTE_UNUSED(b);
-
-    if(n <= 4)
-    {
-        return configure_gemm_reshaped(m, n, 4, 2, 8, 16, 16, true, false);
-    }
-    else
-    {
-        return configure_gemm_reshaped(m, n, 5, 4, 4, 2, 16, false, true);
-    }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedConfigurationBifrost::configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
-    ARM_COMPUTE_UNUSED(k);
-    ARM_COMPUTE_UNUSED(b);
-
-    if(dot8_supported(CLKernelLibrary::get().get_device()))
-    {
-        if(n <= 4)
-        {
-            return configure_gemm_reshaped(m, n, 4, 2, 16, 2, 2, true, false);
-        }
-        else
-        {
-            return configure_gemm_reshaped(m, n, 4, 4, 16, 2, 2, true, false);
-        }
-    }
-    else
-    {
-        if(n <= 4)
-        {
-            return configure_gemm_reshaped(m, n, 4, 2, 8, 2, 2, true, false);
-        }
-        else
-        {
-            return configure_gemm_reshaped(m, n, 6, 4, 4, 2, 2, true, true);
-        }
-    }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedConfigurationBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
-    ARM_COMPUTE_UNUSED(k);
-    ARM_COMPUTE_UNUSED(b);
-
-    if(n <= 4)
-    {
-        return configure_gemm_reshaped(m, n, 4, 2, 8, 16, 16, true, false);
-    }
-    else
-    {
-        return configure_gemm_reshaped(m, n, 4, 4, 2, 8, 16, false, false);
-    }
-}
-
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedConfigurationBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
-{
-    ARM_COMPUTE_UNUSED(k);
-    ARM_COMPUTE_UNUSED(b);
-
-    if(n <= 4)
-    {
-        return configure_gemm_reshaped(m, n, 4, 2, 16, 4, 1, false, false);
-    }
-    else
-    {
-        return configure_gemm_reshaped(m, n, 4, 4, 16, 2, 2, false, true);
-    }
-}
-} // namespace cl_gemm
-} // namespace arm_compute
-\ No newline at end of file
author	Gian Marco Iodice <gianmarco.iodice@arm.com>	2019-03-19 11:44:13 +0000
committer	Gian Marco Iodice <gianmarco.iodice@arm.com>	2019-04-08 14:12:59 +0000
commit	926afe1c8ad6ba6a7bada62a4027fcb79d727104 (patch)
tree	8dcc908a6145de6b02bcea24e3ccd830ba3f5939
parent	8c571692a8236be8605a753e231d240094428be5 (diff)
download	ComputeLibrary-926afe1c8ad6ba6a7bada62a4027fcb79d727104.tar.gz