aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h18
-rw-r--r--arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h10
-rw-r--r--arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h18
-rw-r--r--arm_compute/core/Types.h108
-rw-r--r--arm_compute/core/utils/misc/ShapeCalculator.h22
-rw-r--r--arm_compute/graph/Graph.h8
-rw-r--r--arm_compute/runtime/CL/functions/CLGEMM.h5
-rw-r--r--examples/graph_alexnet.cpp11
-rw-r--r--examples/graph_googlenet.cpp4
-rw-r--r--examples/graph_mobilenet.cpp2
-rw-r--r--examples/graph_squeezenet.cpp6
-rw-r--r--examples/graph_vgg16.cpp3
-rw-r--r--src/core/CL/CLKernelLibrary.cpp8
-rw-r--r--src/core/CL/cl_kernels/gemm.cl420
-rw-r--r--src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp59
-rw-r--r--src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp75
-rw-r--r--src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp37
-rw-r--r--src/graph/Graph.cpp16
-rw-r--r--src/runtime/CL/functions/CLFullyConnectedLayer.cpp4
-rw-r--r--src/runtime/CL/functions/CLGEMM.cpp60
-rw-r--r--src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp4
21 files changed, 521 insertions, 377 deletions
diff --git a/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h b/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h
index 2520eff5de..c0fef45afe 100644
--- a/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -47,7 +47,7 @@ class ICLTensor;
* \end{array} \right)
* @f]
*
- * After this operation, the output matrix will have the following shape: [ height * 4, ceil(width / 4.0f) ]
+ * After this operation, the output matrix will have the following shape: [ height * W, ceil(width / W) ] where W = 4 * mult_interleave4x4_height
*/
class CLGEMMInterleave4x4Kernel : public ICLKernel
{
@@ -64,18 +64,20 @@ public:
CLGEMMInterleave4x4Kernel &operator=(CLGEMMInterleave4x4Kernel &&) = default;
/** Initialise the kernel's input and output.
*
- * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
- * @param[out] output Output tensor. Data type supported: same as @p input
+ * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[out] output Output tensor. Data type supported: same as @p input
+ * @param[in] mult_interleave4x4_height (Optional) Multiplication factor for the height of the 4x4 interleave block
*/
- void configure(const ICLTensor *input, ICLTensor *output);
+ void configure(const ICLTensor *input, ICLTensor *output, int mult_interleave4x4_height = 1);
/** Static function to check if given info will lead to a valid configuration of @ref CLGEMMInterleave4x4Kernel
*
- * @param[in] input Input tensor info. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
- * @param[in] output Output tensor info which stores the interleaved matrix. Data type supported: same as @p input.
+ * @param[in] input Input tensor info. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] output Output tensor info which stores the interleaved matrix. Data type supported: same as @p input.
+ * @param[in] mult_interleave4x4_height Multiplication factor for the height of the 4x4 interleave block
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, int mult_interleave4x4_height);
// Inherited methods overridden
void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h
index 4e73d7eb13..7260c4a4f6 100644
--- a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -58,8 +58,10 @@ public:
* @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
* @param[in] alpha Weight of the matrix product
* @param[in] is_interleaved_transposed (Optional) True if input0 and input1 have been reshaped respectively using @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel
+ * @param[in] reshape_info (Optional) GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how the matrix A and matrix B have been reshaped
+ *
*/
- void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha, bool is_interleaved_transposed = true);
+ void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha, bool is_interleaved_transposed = true, const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CLGEMMMatrixMultiplyKernel
*
* @param[in] input0 Input tensor containing the Matrix A. Data types supported: QS8/QS16/F16/F32
@@ -67,11 +69,13 @@ public:
* @param[in] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
* @param[in] alpha Weight of the matrix product
* @param[in] is_interleaved_transposed True if input0 and input1 have been reshaped respectively using @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel
+ * @param[in] reshape_info GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how the matrix A and matrix B have been reshaped
* @param[in] gpu_target GPU Target
*
* @return a status
*/
- static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved_transposed, GPUTarget gpu_target);
+ static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info,
+ GPUTarget gpu_target);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h b/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h
index 8721643c1e..9a3069eab6 100644
--- a/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -62,7 +62,7 @@ class ICLTensor;
* \end{array} \right)
* @f]
*
- * @note The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor)
+ * @note The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor) * mult_transpose1xW_width
*
*/
class CLGEMMTranspose1xWKernel : public ICLSimple2DKernel
@@ -70,18 +70,20 @@ class CLGEMMTranspose1xWKernel : public ICLSimple2DKernel
public:
/** Initialise the kernel's input and output.
*
- * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
- * @param[out] output Output tensor. Data type supported: same as @p input
+ * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[out] output Output tensor. Data type supported: same as @p input
+ * @param[in] mult_transpose1xW_width (Optional) Multiplication factor for the width of the 1xW transposed block
*/
- void configure(const ICLTensor *input, ICLTensor *output);
+ void configure(const ICLTensor *input, ICLTensor *output, int mult_transpose1xW_width = 1);
/** Static function to check if given info will lead to a valid configuration of @ref CLGEMMTranspose1xWKernel
*
- * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
- * @param[in] output Output tensor. Data type supported: same as @p input.
+ * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in] output Output tensor. Data type supported: same as @p input.
+ * @param[in] mult_transpose1xW_width Multiplication factor for the width of the 1xW transposed block
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, int mult_transpose1xW_width);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index 5402e358b5..5197000bf9 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2018 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -824,13 +824,95 @@ private:
const unsigned int _num_kernels;
};
-/** GEMM Information class. This class stores the necessary information to compute GEMM functions */
+/** GEMM reshape information class. This class stores the necessary information about matrix A and matrix B reshape.
+ *
+ * The matrix A can only be reshaped through @ref CLGEMMInterleave4x4Kernel or @ref NEGEMMInterleave4x4Kernel or @ref GCGEMMInterleave4x4Kernel
+ * Note: Optionally just for @ref CLGEMMInterleave4x4Kernel is it possible to set mult_interleave4x4_height, the multiplication factor for the height of the 4x4 interleaved block
+ *
+ * The matrix B can only be reshaped through @ref CLGEMMTranspose1xWKernel or @ref NEGEMMTranspose1xWKernel or @ref GCGEMMTranspose1xWKernel
+ * Note: Optionally just for @ref CLGEMMTranspose1xWKernel is it possible to set mult_transpose1xW_width, the multiplication factor for the width of the 1xW transposed block
+ *
+ */
+class GEMMReshapeInfo final
+{
+public:
+ /** Default constructor */
+ GEMMReshapeInfo()
+ : _m(1), _n(1), _k(1), _mult_transpose1xW_width(1), _mult_interleave4x4_height(1)
+ {
+ }
+ /** Constructor
+ *
+ * @param[in] m Number of matrix A rows
+ * @param[in] n Number of matrix B columns
+ * @param[in] k Number of matrix A columns or matrix B rows
+ * @param[in] mult_transpose1xW_width (Optional) Multiplication factor for the width of the 1xW transposed block
+ * @param[in] mult_interleave4x4_height (Optional) Multiplication factor for the height of the 4x4 interleaved block
+ */
+ GEMMReshapeInfo(int m, int n, int k, int mult_transpose1xW_width = 1, int mult_interleave4x4_height = 1)
+ : _m(m), _n(n), _k(k), _mult_transpose1xW_width(mult_transpose1xW_width), _mult_interleave4x4_height(mult_interleave4x4_height)
+ {
+ }
+ /** Number of matrix A rows
+ *
+ * @return the number of matrix A rows
+ */
+ int m() const
+ {
+ return _m;
+ }
+ /** Number of matrix B columns
+ *
+ * @return the number of matrix B columns
+ */
+ int n() const
+ {
+ return _n;
+ }
+ /** Number of matrix A columns or matrix B rows
+ *
+ * @return the number of matrix A columns or matrix B rows
+ */
+ int k() const
+ {
+ return _k;
+ }
+ /** Multiplication factor for the width of the 1xW transposed block
+ *
+ * @return the multiplication factor for the width of the 1xW transposed block
+ */
+ int mult_transpose1xW_width() const
+ {
+ return _mult_transpose1xW_width;
+ }
+ /** Multiplication factor for the height of the 4x4 interleaved block
+ *
+ * @return the multiplication factor for the height of the 4x4 interleaved block
+ */
+ int mult_interleave4x4_height() const
+ {
+ return _mult_interleave4x4_height;
+ }
+
+private:
+ const int _m;
+ const int _n;
+ const int _k;
+ const int _mult_transpose1xW_width;
+ const int _mult_interleave4x4_height;
+};
+
+/** GEMM information class. This class stores the necessary information to compute GEMM functions
+ *
+ * This object also contains the information about how matrix A and matrix B have been reshaped
+ *
+ */
class GEMMInfo
{
public:
/** Default constructor */
GEMMInfo()
- : _is_a_reshaped(false), _is_b_reshaped(false), _reshape_b_only_on_first_run(false)
+ : _is_a_reshaped(false), _is_b_reshaped(false), _reshape_b_only_on_first_run(false), _reshape_info()
{
}
/** Constructor
@@ -838,9 +920,10 @@ public:
* @param[in] is_a_reshaped True if the matrix A has been reshaped
* @param[in] is_b_reshaped True if the matrix B has been reshaped
* @param[in] reshape_b_only_on_first_run Reshape matrix B only for the first run
+ * @param[in] reshape_info (Optional) GEMM reshape information object
*/
- GEMMInfo(bool is_a_reshaped, bool is_b_reshaped, bool reshape_b_only_on_first_run)
- : _is_a_reshaped(is_a_reshaped), _is_b_reshaped(is_b_reshaped), _reshape_b_only_on_first_run(reshape_b_only_on_first_run)
+ GEMMInfo(bool is_a_reshaped, bool is_b_reshaped, bool reshape_b_only_on_first_run, const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo())
+ : _is_a_reshaped(is_a_reshaped), _is_b_reshaped(is_b_reshaped), _reshape_b_only_on_first_run(reshape_b_only_on_first_run), _reshape_info(reshape_info)
{
}
/** Flag which specifies if the matrix A has been reshaped
@@ -869,11 +952,20 @@ public:
{
return _reshape_b_only_on_first_run;
};
+ /** GEMMReshapeInfo object which stores the necessary information to understand how the matrix A and matrix B have been reshaped
+ *
+ * @return the GEMMReshapeInfo object
+ */
+ const GEMMReshapeInfo &reshape_info() const
+ {
+ return _reshape_info;
+ }
private:
- const bool _is_a_reshaped;
- const bool _is_b_reshaped;
- const bool _reshape_b_only_on_first_run;
+ const bool _is_a_reshaped;
+ const bool _is_b_reshaped;
+ const bool _reshape_b_only_on_first_run;
+ GEMMReshapeInfo _reshape_info;
};
/** IO formatting information class*/
diff --git a/arm_compute/core/utils/misc/ShapeCalculator.h b/arm_compute/core/utils/misc/ShapeCalculator.h
index 61834b88a9..6ecfdf0323 100644
--- a/arm_compute/core/utils/misc/ShapeCalculator.h
+++ b/arm_compute/core/utils/misc/ShapeCalculator.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017, 2018 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -39,12 +39,14 @@ inline TensorShape compute_permutation_output_shape(const ITensorInfo &input, co
permute(output_shape, perm);
return output_shape;
}
-inline TensorShape compute_interleaved_shape(const ITensorInfo &a)
+inline TensorShape compute_interleaved_shape(const ITensorInfo &a, int mult_interleave4x4_height = 1)
{
- // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
+ // The interleaved output matrix will have the following shape: [ a_height * W, ceil(a_width / W) ] where W = 4 * mult_interleave4x4_height
+ ARM_COMPUTE_ERROR_ON(mult_interleave4x4_height < 1);
+ const int interleave_width = 4 * mult_interleave4x4_height;
TensorShape shape_interleaved_a{ a.tensor_shape() };
- shape_interleaved_a.set(0, a.dimension(0) * 4);
- shape_interleaved_a.set(1, std::ceil(a.dimension(1) / 4.f));
+ shape_interleaved_a.set(0, a.dimension(0) * interleave_width);
+ shape_interleaved_a.set(1, std::ceil(a.dimension(1) / static_cast<float>(interleave_width)));
return shape_interleaved_a;
}
@@ -57,12 +59,14 @@ inline TensorShape compute_transpose1xW_shape(const ITensorInfo &b)
return shape_transposed1xW_b;
}
-inline TensorShape compute_transpose1xW_with_element_size_shape(const ITensorInfo &b)
+inline TensorShape compute_transpose1xW_with_element_size_shape(const ITensorInfo &b, int mult_transpose1xW_width = 1)
{
- // The transpose1xW output matrix will have the following shape:
- // [ b_height * (16 / element_size), ceil(b_width / (16.0f / element_size) ]
+ // Note: mult_transpose1xW_width expresses the number of chunks with size 1x(W) we want to store on the same row
+ // The transpose1xW output matrix will have the following shape:
+ // [ b_height * W, ceil(b_width / W) ] where W = (16 / element size of the tensor) * mult_transpose1xW_width
+ ARM_COMPUTE_ERROR_ON(mult_transpose1xW_width < 1);
TensorShape shape_transposed1xW_b{ b.tensor_shape() };
- const size_t transpose_width = 16 / b.element_size();
+ const size_t transpose_width = (16 / b.element_size()) * mult_transpose1xW_width;
shape_transposed1xW_b.set(0, b.dimension(1) * transpose_width);
shape_transposed1xW_b.set(1, static_cast<size_t>(std::ceil(b.dimension(0) / static_cast<float>(transpose_width))));
diff --git a/arm_compute/graph/Graph.h b/arm_compute/graph/Graph.h
index ab1d8b8866..853b90df82 100644
--- a/arm_compute/graph/Graph.h
+++ b/arm_compute/graph/Graph.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,7 @@
#ifndef __ARM_COMPUTE_GRAPH_GRAPH_H__
#define __ARM_COMPUTE_GRAPH_GRAPH_H__
+#include "arm_compute/core/CL/CLTypes.h"
#include "arm_compute/graph/INode.h"
#include "arm_compute/graph/ITensorObject.h"
#include "arm_compute/graph/SubTensor.h"
@@ -67,9 +68,12 @@ public:
* @param[in] tensor Tensor to add
*/
void add_tensor_object(std::unique_ptr<ITensorObject> tensor);
- /** Finalizes the current node's configuration
+ /** Check if the OpenCL target is available
*/
static bool opencl_is_available();
+ /** Returns the GPU target
+ */
+ static GPUTarget gpu_target();
/** Manually sets the output of the current node
*
* @param[in] tmp Output info to set
diff --git a/arm_compute/runtime/CL/functions/CLGEMM.h b/arm_compute/runtime/CL/functions/CLGEMM.h
index bf41226bda..0f144915d7 100644
--- a/arm_compute/runtime/CL/functions/CLGEMM.h
+++ b/arm_compute/runtime/CL/functions/CLGEMM.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -68,7 +68,8 @@ public:
* @param[in] alpha Weight of the matrix product
* @param[in] beta Weight of matrix C
* @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
- * if the reshape of matrix B should happen only for the first run
+ * if the reshape of matrix B should happen only for the first run. GEMMInfo also contains information about the reshaping
+ * in case matrix A and matrix B have been already transformed.
*/
void configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo());
diff --git a/examples/graph_alexnet.cpp b/examples/graph_alexnet.cpp
index 8705c8ed1e..2f2c8bd182 100644
--- a/examples/graph_alexnet.cpp
+++ b/examples/graph_alexnet.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017, 2018 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -54,8 +54,10 @@ public:
constexpr float mean_b = 104.01f; /* Mean value to subtract from blue channel */
// Set target. 0 (NEON), 1 (OpenCL). By default it is NEON
- TargetHint target_hint = set_target_hint(argc > 1 ? std::strtol(argv[1], nullptr, 10) : 0);
- ConvolutionMethodHint convolution_hint = target_hint == TargetHint::NEON ? ConvolutionMethodHint::GEMM : ConvolutionMethodHint::DIRECT;
+ TargetHint target_hint = set_target_hint(argc > 1 ? std::strtol(argv[1], nullptr, 10) : 0);
+
+ const bool is_gemm_convolution5x5 = Graph::gpu_target() == arm_compute::GPUTarget::MIDGARD || target_hint == TargetHint::NEON;
+ ConvolutionMethodHint convolution_5x5_hint = is_gemm_convolution5x5 ? ConvolutionMethodHint::GEMM : ConvolutionMethodHint::DIRECT;
// Parse arguments
if(argc < 2)
@@ -102,7 +104,7 @@ public:
<< NormalizationLayer(NormalizationLayerInfo(NormType::CROSS_MAP, 5, 0.0001f, 0.75f))
<< PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0)))
// Layer 2
- << convolution_hint
+ << convolution_5x5_hint
<< ConvolutionLayer(
5U, 5U, 256U,
get_weights_accessor(data_path, "/cnn_data/alexnet_model/conv2_w.npy"),
@@ -111,6 +113,7 @@ public:
<< ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
<< NormalizationLayer(NormalizationLayerInfo(NormType::CROSS_MAP, 5, 0.0001f, 0.75f))
<< PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0)))
+ << ConvolutionMethodHint::GEMM
// Layer 3
<< ConvolutionLayer(
3U, 3U, 384U,
diff --git a/examples/graph_googlenet.cpp b/examples/graph_googlenet.cpp
index 1e9601b492..b2e2f1bf8f 100644
--- a/examples/graph_googlenet.cpp
+++ b/examples/graph_googlenet.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017, 2018 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -55,7 +55,7 @@ public:
// Set target. 0 (NEON), 1 (OpenCL). By default it is NEON
TargetHint target_hint = set_target_hint(argc > 1 ? std::strtol(argv[1], nullptr, 10) : 0);
- ConvolutionMethodHint convolution_hint = target_hint == TargetHint::NEON ? ConvolutionMethodHint::GEMM : ConvolutionMethodHint::DIRECT;
+ ConvolutionMethodHint convolution_hint = ConvolutionMethodHint::GEMM;
// Parse arguments
if(argc < 2)
diff --git a/examples/graph_mobilenet.cpp b/examples/graph_mobilenet.cpp
index 8c3f9b6fbc..83d1db9f0f 100644
--- a/examples/graph_mobilenet.cpp
+++ b/examples/graph_mobilenet.cpp
@@ -52,7 +52,7 @@ public:
// Set target. 0 (NEON), 1 (OpenCL). By default it is NEON
TargetHint target_hint = set_target_hint(argc > 1 ? std::strtol(argv[1], nullptr, 10) : 0);
- ConvolutionMethodHint convolution_hint = target_hint == TargetHint::NEON ? ConvolutionMethodHint::GEMM : ConvolutionMethodHint::DIRECT;
+ ConvolutionMethodHint convolution_hint = ConvolutionMethodHint::GEMM;
// Set model to execute. 0 (MobileNetV1_1.0_224), 1 (MobileNetV1_0.75_160)
int model_id = (argc > 2) ? std::strtol(argv[2], nullptr, 10) : 0;
diff --git a/examples/graph_squeezenet.cpp b/examples/graph_squeezenet.cpp
index b21f2fe5c4..e85108702d 100644
--- a/examples/graph_squeezenet.cpp
+++ b/examples/graph_squeezenet.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017, 2018 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -59,8 +59,7 @@ public:
constexpr float mean_b = 104.01f; /* Mean value to subtract from blue channel */
// Set target. 0 (NEON), 1 (OpenCL). By default it is NEON
- TargetHint target_hint = set_target_hint(argc > 1 ? std::strtol(argv[1], nullptr, 10) : 0);
- ConvolutionMethodHint convolution_hint = target_hint == TargetHint::NEON ? ConvolutionMethodHint::GEMM : ConvolutionMethodHint::DIRECT;
+ TargetHint target_hint = set_target_hint(argc > 1 ? std::strtol(argv[1], nullptr, 10) : 0);
// Parse arguments
if(argc < 2)
@@ -102,7 +101,6 @@ public:
get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/conv1_w.npy"),
get_weights_accessor(data_path, "/cnn_data/squeezenet_v1.0_model/conv1_b.npy"),
PadStrideInfo(2, 2, 0, 0))
- << convolution_hint
<< ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
<< PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::CEIL)))
<< ConvolutionLayer(
diff --git a/examples/graph_vgg16.cpp b/examples/graph_vgg16.cpp
index 1a804a4882..d97c5b5d02 100644
--- a/examples/graph_vgg16.cpp
+++ b/examples/graph_vgg16.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017, 2018 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -91,7 +91,6 @@ public:
<< convolution_hint
<< Tensor(TensorInfo(TensorShape(224U, 224U, 3U, 1U), 1, DataType::F32),
get_input_accessor(image, mean_r, mean_g, mean_b))
- << ConvolutionMethodHint::DIRECT
// Layer 1
<< ConvolutionLayer(
3U, 3U, 64U,
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index 6695881d09..ae3553860a 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -211,9 +211,7 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
{ "gaussian1x5_sub_x", "gaussian_pyramid.cl" },
{ "gaussian5x1_sub_y", "gaussian_pyramid.cl" },
{ "gemm_accumulate_biases", "gemm.cl" },
- { "gemm_interleave4x4_8bit", "gemm.cl" },
- { "gemm_interleave4x4_16bit", "gemm.cl" },
- { "gemm_interleave4x4_32bit", "gemm.cl" },
+ { "gemm_interleave4x4", "gemm.cl" },
{ "gemm_ma_f16", "gemm.cl" },
{ "gemm_ma_f32", "gemm.cl" },
{ "gemm_ma_qs8", "gemm.cl" },
@@ -230,9 +228,7 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
{ "gemm_mm_qs8", "gemm.cl" },
{ "gemm_mm_qs16", "gemm.cl" },
{ "gemm_lc_vm_f32", "gemm.cl" },
- { "gemm_transpose1x16", "gemm.cl" },
- { "gemm_transpose1x8", "gemm.cl" },
- { "gemm_transpose1x4", "gemm.cl" },
+ { "gemm_transpose1xW", "gemm.cl" },
{ "gemmlowp_matrix_a_reduction", "gemmlowp.cl" },
{ "gemmlowp_matrix_b_reduction", "gemmlowp.cl" },
{ "gemmlowp_mm_bifrost", "gemmlowp.cl" },
diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl
index c763cb355b..bad09f3c42 100644
--- a/src/core/CL/cl_kernels/gemm.cl
+++ b/src/core/CL/cl_kernels/gemm.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,41 +27,23 @@
#include "fixed_point.h"
#endif // FIXED_POINT_POSITION
-/** This OpenCL kernel computes the "vector" 1x4 transposition of input matrix
- *
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: U32/S32/F32
- * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- */
-__kernel void gemm_transpose1x4(IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
-{
- uint x = get_global_id(0);
- uint y = get_global_id(1);
+#if defined(TRANSPOSE_W) && defined(MULT_TRANSPOSE1XW_WIDTH)
- // Compute address for Matrix B - source
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
-
- // Compute address for Matrix B transposed - destination. X and Y are swapped
- uint dst_addr_in_bytes = y * 16 + ((x * dst_stride_y + dst_offset_first_element_in_bytes));
+#if TRANSPOSE_W == 4
+#define DATA_TYPE uint
+#elif TRANSPOSE_W == 8
+#define DATA_TYPE ushort
+#elif TRANSPOSE_W == 16
+#define DATA_TYPE uchar
+#else // TRANSPOSE_W == 16
+#error "Transpose width not supported"
+#endif // TRANSPOSE_W
- uint4 b0 = vload4(0, (__global uint *)src.ptr);
-
- vstore4(b0, 0, (__global uint *)(dst_ptr + dst_addr_in_bytes));
-}
-
-/** This OpenCL kernel computes the "vector" 1x8 transposition of input matrix
+/** This OpenCL kernel computes the "vector" 1xW transposition of input matrix
+ *
+ * @attention The multiplication factor (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)
*
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: U16/S16/QS16/F16
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
@@ -69,12 +51,12 @@ __kernel void gemm_transpose1x4(IMAGE_DECLARATION(src),
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
*/
-__kernel void gemm_transpose1x8(IMAGE_DECLARATION(src),
+__kernel void gemm_transpose1xW(IMAGE_DECLARATION(src),
IMAGE_DECLARATION(dst))
{
uint x = get_global_id(0);
@@ -84,16 +66,22 @@ __kernel void gemm_transpose1x8(IMAGE_DECLARATION(src),
Image src = CONVERT_TO_IMAGE_STRUCT(src);
// Compute address for Matrix B transposed - destination. X and Y are swapped
- uint dst_addr_in_bytes = y * 16 + ((x * dst_stride_y + dst_offset_first_element_in_bytes));
+ uint dst_addr_in_bytes = dst_offset_first_element_in_bytes + y * TRANSPOSE_W * sizeof(DATA_TYPE) * MULT_TRANSPOSE1XW_WIDTH + (x / MULT_TRANSPOSE1XW_WIDTH) * dst_stride_y +
+ (x % MULT_TRANSPOSE1XW_WIDTH) * TRANSPOSE_W * sizeof(DATA_TYPE);
- ushort8 b0 = vload8(0, (__global ushort *)src.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, TRANSPOSE_W)
+ b0 = VLOAD(TRANSPOSE_W)(0, (__global DATA_TYPE *)src.ptr);
- vstore8(b0, 0, (__global ushort *)(dst_ptr + dst_addr_in_bytes));
+ VSTORE(TRANSPOSE_W)
+ (b0, 0, (__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes));
}
+#endif // defined(TRANSPOSE_W) && defined(MULT_TRANSPOSE1XW_WIDTH)
-/** This OpenCL kernel computes the "vector" 1x16 transposition of input matrix
+#if defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(DATA_TYPE)
+
+/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values
*
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/QS8
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
@@ -106,9 +94,10 @@ __kernel void gemm_transpose1x8(IMAGE_DECLARATION(src),
* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
*/
-__kernel void gemm_transpose1x16(IMAGE_DECLARATION(src),
+__kernel void gemm_interleave4x4(IMAGE_DECLARATION(src),
IMAGE_DECLARATION(dst))
{
+ // Compute source and destination addresses
uint x = get_global_id(0);
uint y = get_global_id(1);
@@ -116,141 +105,35 @@ __kernel void gemm_transpose1x16(IMAGE_DECLARATION(src),
Image src = CONVERT_TO_IMAGE_STRUCT(src);
// Compute address for Matrix B transposed - destination. X and Y are swapped
- uint dst_addr_in_bytes = y * 16 + ((x * dst_stride_y + dst_offset_first_element_in_bytes));
-
- uchar16 b0 = vload16(0, (__global uchar *)src.ptr);
-
- vstore16(b0, 0, (__global uchar *)(dst_ptr + dst_addr_in_bytes));
-}
-
-/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values
- *
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: U32/S32/F32
- * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- */
-__kernel void gemm_interleave4x4_32bit(IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
-{
- // Compute source and destination addresses
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- // Load values from Matrix A
- uint4 a0 = vload4(0, (__global uint *)(offset(&src, 0, 0)));
- uint4 a1 = vload4(0, (__global uint *)(offset(&src, 0, 1)));
- uint4 a2 = vload4(0, (__global uint *)(offset(&src, 0, 2)));
- uint4 a3 = vload4(0, (__global uint *)(offset(&src, 0, 3)));
-
- uint4 val0 = (uint4)(a0.s0, a1.s0, a2.s0, a3.s0);
- vstore4(val0, 0, ((__global uint *)dst.ptr) + 0);
-
- val0 = (uint4)(a0.s1, a1.s1, a2.s1, a3.s1);
- vstore4(val0, 0, ((__global uint *)dst.ptr) + 4);
-
- val0 = (uint4)(a0.s2, a1.s2, a2.s2, a3.s2);
- vstore4(val0, 0, ((__global uint *)dst.ptr) + 8);
-
- val0 = (uint4)(a0.s3, a1.s3, a2.s3, a3.s3);
- vstore4(val0, 0, ((__global uint *)dst.ptr) + 12);
-}
-
-/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values
- *
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: U16/S16/QS16/F16
- * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- */
-__kernel void gemm_interleave4x4_16bit(IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
-{
- // Compute source and destination addresses
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
- // Load values from Matrix A
- ushort8 a0 = vload8(0, (__global ushort *)(offset(&src, 0, 0)));
- ushort8 a1 = vload8(0, (__global ushort *)(offset(&src, 0, 1)));
- ushort8 a2 = vload8(0, (__global ushort *)(offset(&src, 0, 2)));
- ushort8 a3 = vload8(0, (__global ushort *)(offset(&src, 0, 3)));
-
- ushort8 val0 = (ushort8)((ushort4)(a0.s0, a1.s0, a2.s0, a3.s0), (ushort4)(a0.s1, a1.s1, a2.s1, a3.s1));
- vstore8(val0, 0, ((__global ushort *)dst.ptr) + 0);
-
- val0 = (ushort8)((ushort4)(a0.s2, a1.s2, a2.s2, a3.s2), (ushort4)(a0.s3, a1.s3, a2.s3, a3.s3));
- vstore8(val0, 0, ((__global ushort *)dst.ptr) + 8);
-
- val0 = (ushort8)((ushort4)(a0.s4, a1.s4, a2.s4, a3.s4), (ushort4)(a0.s5, a1.s5, a2.s5, a3.s5));
- vstore8(val0, 0, ((__global ushort *)dst.ptr) + 16);
-
- val0 = (ushort8)((ushort4)(a0.s6, a1.s6, a2.s6, a3.s6), (ushort4)(a0.s7, a1.s7, a2.s7, a3.s7));
- vstore8(val0, 0, ((__global ushort *)dst.ptr) + 24);
-}
-
-/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block and interleaving the values
- *
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: U8/S8/QS8
- * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- */
-__kernel void gemm_interleave4x4_8bit(IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
-{
- // Compute source and destination addresses
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+ uint dst_addr_in_bytes = dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * 16 * MULT_INTERLEAVE4X4_HEIGHT + (y / MULT_INTERLEAVE4X4_HEIGHT) * dst_stride_y +
+ (y % MULT_INTERLEAVE4X4_HEIGHT) * 4 * sizeof(DATA_TYPE);
// Load values from Matrix A
- uchar16 a0 = vload16(0, (__global uchar *)(offset(&src, 0, 0)));
- uchar16 a1 = vload16(0, (__global uchar *)(offset(&src, 0, 1)));
- uchar16 a2 = vload16(0, (__global uchar *)(offset(&src, 0, 2)));
- uchar16 a3 = vload16(0, (__global uchar *)(offset(&src, 0, 3)));
-
- uchar16 val0 = (uchar16)((uchar4)(a0.s0, a1.s0, a2.s0, a3.s0), (uchar4)(a0.s1, a1.s1, a2.s1, a3.s1),
- (uchar4)(a0.s2, a1.s2, a2.s2, a3.s2), (uchar4)(a0.s3, a1.s3, a2.s3, a3.s3));
- vstore16(val0, 0, ((__global uchar *)dst.ptr) + 0);
-
- val0 = (uchar16)((uchar4)(a0.s4, a1.s4, a2.s4, a3.s4), (uchar4)(a0.s5, a1.s5, a2.s5, a3.s5),
- (uchar4)(a0.s6, a1.s6, a2.s6, a3.s6), (uchar4)(a0.s7, a1.s7, a2.s7, a3.s7));
- vstore16(val0, 0, ((__global uchar *)dst.ptr) + 16);
-
- val0 = (uchar16)((uchar4)(a0.s8, a1.s8, a2.s8, a3.s8), (uchar4)(a0.s9, a1.s9, a2.s9, a3.s9),
- (uchar4)(a0.sA, a1.sA, a2.sA, a3.sA), (uchar4)(a0.sB, a1.sB, a2.sB, a3.sB));
- vstore16(val0, 0, ((__global uchar *)dst.ptr) + 32);
-
- val0 = (uchar16)((uchar4)(a0.sC, a1.sC, a2.sC, a3.sC), (uchar4)(a0.sD, a1.sD, a2.sD, a3.sD),
- (uchar4)(a0.sE, a1.sE, a2.sE, a3.sE), (uchar4)(a0.sF, a1.sF, a2.sF, a3.sF));
- vstore16(val0, 0, ((__global uchar *)dst.ptr) + 48);
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ a0 = vload4(0, (__global DATA_TYPE *)(offset(&src, 0, 0)));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ a1 = vload4(0, (__global DATA_TYPE *)(offset(&src, 0, 1)));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ a2 = vload4(0, (__global DATA_TYPE *)(offset(&src, 0, 2)));
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ a3 = vload4(0, (__global DATA_TYPE *)(offset(&src, 0, 3)));
+
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ val0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s0, a1.s0, a2.s0, a3.s0);
+ vstore4(val0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 0 * MULT_INTERLEAVE4X4_HEIGHT));
+
+ val0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s1, a1.s1, a2.s1, a3.s1);
+ vstore4(val0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 4 * MULT_INTERLEAVE4X4_HEIGHT));
+
+ val0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s2, a1.s2, a2.s2, a3.s2);
+ vstore4(val0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 8 * MULT_INTERLEAVE4X4_HEIGHT));
+
+ val0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s3, a1.s3, a2.s3, a3.s3);
+ vstore4(val0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 12 * MULT_INTERLEAVE4X4_HEIGHT));
}
+#endif // defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(DATA_TYPE)
-#if defined(COLS_B)
+#if defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT)
/** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication
*
@@ -270,30 +153,32 @@ __kernel void gemm_interleave4x4_8bit(IMAGE_DECLARATION(src),
* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
*/
__kernel void gemm_mm_interleaved_transposed_f32_midgard(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
IMAGE_DECLARATION(dst))
{
- // src_addr.s0 = address of matrix A
- // src_addr.s1 = address of matrix B
-
- // Compute address for matrix A and B
- int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
- (src1_stride_y));
+ int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
+ int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
- // Add offset_first_element_in_bytes
- src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+ // Offset
+ const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
+ const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4;
- // Divide by 4 in order to get the src_addr in unit of float
- src_addr = src_addr >> 2;
+ // src_addr_a = address of matrix A
+ // src_addr_b = address of matrix B
+ __global float *src_addr_a = (__global float *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes);
+ __global float *src_addr_b = (__global float *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);
// Compute end row address for matrix B
- int end_row_mtx_b = src_addr.s1 + COLS_B;
+ __global float *src_end_addr_b = src_addr_b + COLS_B;
+
+ src_addr_a += offset_row_a;
+ src_addr_b += offset_row_b;
// Reset accumulators
float4 c00 = 0.0f;
@@ -301,11 +186,11 @@ __kernel void gemm_mm_interleaved_transposed_f32_midgard(IMAGE_DECLARATION(src0)
float4 c20 = 0.0f;
float4 c30 = 0.0f;
- for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(8, 8))
+ for(; src_addr_b <= (src_end_addr_b - (int)(8 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)
{
// Load values from matrix A (interleaved) and matrix B (transposed)
- float4 a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0);
- float4 b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1);
+ float4 a0 = vload4(0, src_addr_a);
+ float4 b0 = vload4(0, src_addr_b);
c00 += (float4)a0.s0 * b0;
c10 += (float4)a0.s1 * b0;
@@ -313,8 +198,8 @@ __kernel void gemm_mm_interleaved_transposed_f32_midgard(IMAGE_DECLARATION(src0)
c30 += (float4)a0.s3 * b0;
// Load values from matrix A (interleaved) and matrix B (transposed)
- a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0 + 4);
- b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1 + 4);
+ a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT);
+ b0 = vload4(0, src_addr_b + 4 * MULT_TRANSPOSE1XW_WIDTH);
c00 += (float4)a0.s0 * b0;
c10 += (float4)a0.s1 * b0;
@@ -322,11 +207,11 @@ __kernel void gemm_mm_interleaved_transposed_f32_midgard(IMAGE_DECLARATION(src0)
c30 += (float4)a0.s3 * b0;
}
- for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 4))
+ for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH)
{
// Load values from matrix A (interleaved) and matrix B (transposed)
- float4 a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0);
- float4 b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1);
+ float4 a0 = vload4(0, src_addr_a);
+ float4 b0 = vload4(0, src_addr_b);
c00 += (float4)a0.s0 * b0;
c10 += (float4)a0.s1 * b0;
@@ -371,23 +256,33 @@ __kernel void gemm_mm_interleaved_transposed_f32_midgard(IMAGE_DECLARATION(src0)
* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
*/
__kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
IMAGE_DECLARATION(dst))
{
+ int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
+ int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
+
+ // Offset
+ const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
+ const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4;
+
// src_addr_a = address of matrix A
// src_addr_b = address of matrix B
- __global float *src_addr_a = (__global float *)(src0_ptr + get_global_id(1) * src0_stride_y + src0_offset_first_element_in_bytes);
- __global float *src_addr_b = (__global float *)(src1_ptr + get_global_id(0) * src1_stride_y + src1_offset_first_element_in_bytes);
+ __global float *src_addr_a = (__global float *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes);
+ __global float *src_addr_b = (__global float *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);
// Compute end row address for matrix B
__global float *src_end_addr_b = src_addr_b + COLS_B;
+ src_addr_a += offset_row_a;
+ src_addr_b += offset_row_b;
+
// Reset accumulators
float c00 = 0.0f;
float c01 = 0.0f;
@@ -406,7 +301,7 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0)
float c32 = 0.0f;
float c33 = 0.0f;
- for(; src_addr_b <= (src_end_addr_b - 16); src_addr_a += 16, src_addr_b += 16)
+ for(; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += (16 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (16 * MULT_TRANSPOSE1XW_WIDTH))
{
// Load values from matrix A (interleaved) and matrix B (transposed)
float4 a0 = vload4(0, src_addr_a);
@@ -433,8 +328,8 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0)
c33 = fma(a0.s3, b0.s3, c33);
// Load values from matrix A (interleaved) and matrix B (transposed)
- a0 = vload4(0, src_addr_a + 4);
- b0 = vload4(0, src_addr_b + 4);
+ a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT);
+ b0 = vload4(0, src_addr_b + 4 * MULT_TRANSPOSE1XW_WIDTH);
c00 = fma(a0.s0, b0.s0, c00);
c01 = fma(a0.s0, b0.s1, c01);
@@ -457,8 +352,8 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0)
c33 = fma(a0.s3, b0.s3, c33);
// Load values from matrix A (interleaved) and matrix B (transposed)
- a0 = vload4(0, src_addr_a + 8);
- b0 = vload4(0, src_addr_b + 8);
+ a0 = vload4(0, src_addr_a + 8 * MULT_INTERLEAVE4X4_HEIGHT);
+ b0 = vload4(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH);
c00 = fma(a0.s0, b0.s0, c00);
c01 = fma(a0.s0, b0.s1, c01);
@@ -481,8 +376,8 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0)
c33 = fma(a0.s3, b0.s3, c33);
// Load values from matrix A (interleaved) and matrix B (transposed)
- a0 = vload4(0, src_addr_a + 12);
- b0 = vload4(0, src_addr_b + 12);
+ a0 = vload4(0, src_addr_a + 12 * MULT_INTERLEAVE4X4_HEIGHT);
+ b0 = vload4(0, src_addr_b + 12 * MULT_TRANSPOSE1XW_WIDTH);
c00 = fma(a0.s0, b0.s0, c00);
c01 = fma(a0.s0, b0.s1, c01);
@@ -505,7 +400,7 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0)
c33 = fma(a0.s3, b0.s3, c33);
}
- for(; src_addr_b < src_end_addr_b; src_addr_a += 4, src_addr_b += 4)
+ for(; src_addr_b < src_end_addr_b; src_addr_a += (4 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (4 * MULT_TRANSPOSE1XW_WIDTH))
{
// Load values from matrix A (interleaved) and matrix B (transposed)
float4 a0 = vload4(0, src_addr_a);
@@ -555,8 +450,6 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0)
c33 = c33 * ALPHA;
#endif // defined(ALPHA)
- barrier(CLK_GLOBAL_MEM_FENCE);
-
// Store 4x4 block
vstore4((float4)(c00, c01, c02, c03), 0, (__global float *)(offset(&dst, 0, 0)));
vstore4((float4)(c10, c11, c12, c13), 0, (__global float *)(offset(&dst, 0, 1)));
@@ -584,30 +477,32 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0)
* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
*/
__kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
IMAGE_DECLARATION(dst))
{
- // src_addr.s0 = address of matrix A
- // src_addr.s1 = address of matrix B
-
- // Compute address for matrix A and B
- int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
- (src1_stride_y));
+ int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
+ int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
- // Add offset_first_element_in_bytes
- src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+ // Offset
+ const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
+ const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;
- // Divide by 2 in order to get the src_addr in unit of half
- src_addr = src_addr >> 1;
+ // src_addr_a = address of matrix A
+ // src_addr_b = address of matrix B
+ __global half *src_addr_a = (__global half *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes);
+ __global half *src_addr_b = (__global half *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);
// Compute end row address for matrix B
- int end_row_mtx_b = src_addr.s1 + COLS_B;
+ __global half *src_end_addr_b = src_addr_b + COLS_B;
+
+ src_addr_a += offset_row_a;
+ src_addr_b += offset_row_b;
// Reset accumulators
half8 c00 = 0.0f;
@@ -615,11 +510,11 @@ __kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
half8 c20 = 0.0f;
half8 c30 = 0.0f;
- for(; src_addr.s1 <= (end_row_mtx_b - 16); src_addr += (int2)(8, 16))
+ for(; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH)
{
// Load values from matrix A (interleaved) and matrix B (transposed)
- half4 a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0);
- half8 b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1);
+ half4 a0 = vload4(0, src_addr_a);
+ half8 b0 = vload8(0, src_addr_b);
c00 += (half8)a0.s0 * b0;
c10 += (half8)a0.s1 * b0;
@@ -627,8 +522,8 @@ __kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
c30 += (half8)a0.s3 * b0;
// Load values from matrix A (interleaved) and matrix B (transposed)
- a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0 + 4);
- b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1 + 8);
+ a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT);
+ b0 = vload8(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH);
c00 += (half8)a0.s0 * b0;
c10 += (half8)a0.s1 * b0;
@@ -636,11 +531,11 @@ __kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
c30 += (half8)a0.s3 * b0;
}
- for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 8))
+ for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)
{
// Load values from matrix A (interleaved) and matrix B (transposed)
- half4 a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0);
- half8 b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1);
+ half4 a0 = vload4(0, src_addr_a);
+ half8 b0 = vload8(0, src_addr_b);
c00 += (half8)a0.s0 * b0;
c10 += (half8)a0.s1 * b0;
@@ -689,27 +584,32 @@ __kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
*/
__kernel void gemm_mm_interleaved_transposed_qs8(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
IMAGE_DECLARATION(dst))
{
- // src_addr.s0 = address of matrix A
- // src_addr.s1 = address of matrix B
+ int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
+ int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
- // Compute address for matrix A and B
- int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
- (src1_stride_y));
+ // Offset
+ const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
+ const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 16;
- // Add offset_first_element_in_bytes
- src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+ // src_addr_a = address of matrix A
+ // src_addr_b = address of matrix B
+ __global char *src_addr_a = src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes;
+ __global char *src_addr_b = src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes;
// Compute end row address for matrix B
- int end_row_mtx_b = src_addr.s1 + COLS_B;
+ __global char *src_end_addr_b = src_addr_b + COLS_B;
+
+ src_addr_a += offset_row_a;
+ src_addr_b += offset_row_b;
// Reset accumulators
short8 c00 = 0.0f;
@@ -722,11 +622,11 @@ __kernel void gemm_mm_interleaved_transposed_qs8(IMAGE_DECLARATION(src0),
short8 c31 = 0.0f;
// This for loop performs 1 accumulation for each iteration
- for(; src_addr.s1 <= (end_row_mtx_b - 16); src_addr += (int2)(4, 16))
+ for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH)
{
// Load values from matrix A (interleaved) and matrix B (transposed)
- char4 a0 = vload4(0, ((__global char *)src0_ptr) + src_addr.s0);
- char16 b0 = vload16(0, ((__global char *)src1_ptr) + src_addr.s1);
+ char4 a0 = vload4(0, src_addr_a);
+ char16 b0 = vload16(0, src_addr_b);
c00 = mlal_sat_qs8x8(c00, (char8)a0.s0, b0.s01234567, FIXED_POINT_POSITION);
c10 = mlal_sat_qs8x8(c10, (char8)a0.s1, b0.s01234567, FIXED_POINT_POSITION);
@@ -783,30 +683,32 @@ __kernel void gemm_mm_interleaved_transposed_qs8(IMAGE_DECLARATION(src0),
* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
*/
__kernel void gemm_mm_interleaved_transposed_qs16(IMAGE_DECLARATION(src0),
IMAGE_DECLARATION(src1),
IMAGE_DECLARATION(dst))
{
- // src_addr.s0 = address of matrix A
- // src_addr.s1 = address of matrix B
+ int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
+ int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
- // Compute address for matrix A and B
- int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
- (src1_stride_y));
+ // Offset
+ const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
+ const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;
- // Add offset_first_element_in_bytes
- src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
-
- // Divide by 2 in order to get the src_addr in unit of short
- src_addr = src_addr >> 1;
+ // src_addr_a = address of matrix A
+ // src_addr_b = address of matrix B
+ __global short *src_addr_a = (__global short *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes);
+ __global short *src_addr_b = (__global short *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);
// Compute end row address for matrix B
- int end_row_mtx_b = src_addr.s1 + COLS_B;
+ __global short *src_end_addr_b = src_addr_b + COLS_B;
+
+ src_addr_a += offset_row_a;
+ src_addr_b += offset_row_b;
// Reset accumulators
int8 c00 = 0.0f;
@@ -815,11 +717,11 @@ __kernel void gemm_mm_interleaved_transposed_qs16(IMAGE_DECLARATION(src0),
int8 c30 = 0.0f;
// This for loop performs 1 accumulation for each iteration
- for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(4, 8))
+ for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)
{
/* Load values from matrix A (interleaved) and matrix B (transposed) */
- short4 a0 = vload4(0, ((__global short *)src0_ptr) + src_addr.s0);
- short8 b0 = vload8(0, ((__global short *)src1_ptr) + src_addr.s1);
+ short4 a0 = vload4(0, src_addr_a);
+ short8 b0 = vload8(0, src_addr_b);
c00 = mlal_sat_qs16x8(c00, (short8)a0.s0, b0, FIXED_POINT_POSITION);
c10 = mlal_sat_qs16x8(c10, (short8)a0.s1, b0, FIXED_POINT_POSITION);
@@ -850,7 +752,7 @@ __kernel void gemm_mm_interleaved_transposed_qs16(IMAGE_DECLARATION(src0),
vstore8(c30_qs16, 0, (__global short *)(offset(&dst, 0, 3)));
}
#endif // defined(FIXED_POINT_POSITION)
-#endif // defined(COLS_B)
+#endif // defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT)
#if defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && (NUM_ELEMS_PROCESSED_PER_THREAD_Y)
#if defined(DATA_TYPE)
diff --git a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
index 6886f54602..241dd8549d 100644
--- a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
+++ b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -40,17 +40,16 @@ using namespace arm_compute::misc::shape_calculator;
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, int mult_interleave4x4_height)
{
+ ARM_COMPUTE_RETURN_ERROR_ON(mult_interleave4x4_height < 1);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::U8, DataType::S8,
DataType::QS16, DataType::U16, DataType::S16, DataType::U32, DataType::S32,
DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
if(output->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_interleaved_shape(*input));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_interleaved_shape(*input, mult_interleave4x4_height));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
}
@@ -58,11 +57,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, int mult_interleave4x4_height)
{
- unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input->data_type());
+ constexpr unsigned int num_elems_processed_per_iteration_x = 4;
constexpr unsigned int num_elems_processed_per_iteration_y = 4;
- const unsigned int num_elems_written_per_iteration = num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y;
+ const unsigned int num_elems_written_per_iteration = num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y * mult_interleave4x4_height;
bool window_changed = false;
// Configure kernel window
@@ -73,7 +72,10 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
// Configure window in case of configured output
if(output->total_size() != 0)
{
- AccessWindowRectangle output_access(output, 0, 0, num_elems_written_per_iteration, 1, 4.f, 0.25f);
+ const float scale_x = 4.0f * static_cast<float>(mult_interleave4x4_height);
+ const float scale_y = 1.0f / (scale_x);
+
+ AccessWindowRectangle output_access(output, 0, 0, num_elems_written_per_iteration, 1, scale_x, scale_y);
window_changed = window_changed || update_window_and_padding(win, output_access);
output_access.set_valid_region(win, input->valid_region());
}
@@ -88,25 +90,42 @@ CLGEMMInterleave4x4Kernel::CLGEMMInterleave4x4Kernel()
{
}
-void CLGEMMInterleave4x4Kernel::configure(const ICLTensor *input, ICLTensor *output)
+void CLGEMMInterleave4x4Kernel::configure(const ICLTensor *input, ICLTensor *output, int mult_interleave4x4_height)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
// Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_interleaved_shape(*input->info())));
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_interleaved_shape(*input->info(), mult_interleave4x4_height)));
// Perform validate step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), mult_interleave4x4_height));
_input = input;
_output = output;
+ // Create build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DMULT_INTERLEAVE4X4_HEIGHT=" + support::cpp11::to_string(mult_interleave4x4_height));
+ switch(input->info()->element_size())
+ {
+ case 1:
+ build_opts.add_option("-DDATA_TYPE=uchar");
+ break;
+ case 2:
+ build_opts.add_option("-DDATA_TYPE=ushort");
+ break;
+ case 4:
+ build_opts.add_option("-DDATA_TYPE=uint");
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Data type not supported");
+ }
+
// Create kernel
- std::string kernel_name = "gemm_interleave4x4_" + support::cpp11::to_string(input->info()->element_size() * 8) + "bit";
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_interleave4x4", build_opts.options()));
// Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info());
+ auto win_config = validate_and_configure_window(input->info(), output->info(), mult_interleave4x4_height);
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
ICLKernel::configure(win_config.second);
@@ -119,10 +138,10 @@ void CLGEMMInterleave4x4Kernel::configure(const ICLTensor *input, ICLTensor *out
_config_id += support::cpp11::to_string(output->info()->dimension(1));
}
-Status CLGEMMInterleave4x4Kernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+Status CLGEMMInterleave4x4Kernel::validate(const ITensorInfo *input, const ITensorInfo *output, int mult_interleave4x4_height)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mult_interleave4x4_height));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), mult_interleave4x4_height).first);
return Status{};
}
@@ -144,10 +163,6 @@ void CLGEMMInterleave4x4Kernel::run(const Window &window, cl::CommandQueue &queu
Window in_slice = window.first_slice_window_2D();
Window out_slice = window.first_slice_window_2D();
- // Change x and y steps for the slide of output tensor
- out_slice.scale(Window::DimX, 4.f);
- out_slice.scale(Window::DimY, 0.25f);
-
do
{
unsigned int idx = 0;
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
index 19f38bf5a5..e23feb269a 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -36,24 +36,68 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include <set>
#include <string>
using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
namespace
{
using ElementsProcessed = Steps;
-inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, bool is_interleaved_transposed)
+inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1);
+
if(!is_interleaved_transposed)
{
ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != input1->dimension(1));
+
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) != output->dimension(0));
+ ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != output->dimension(1));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, output);
+ }
+ }
+ else
+ {
+ const int m = reshape_info.m();
+ const int n = reshape_info.n();
+ const int k = reshape_info.k();
+ const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width();
+ const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();
+
+ TensorShape tensor_shape0{ input0->tensor_shape() };
+ tensor_shape0.set(0, k);
+ tensor_shape0.set(1, m);
+
+ TensorShape tensor_shape1{ input1->tensor_shape() };
+ tensor_shape1.set(0, n);
+ tensor_shape1.set(1, k);
+
+ const TensorInfo tensor_info0 = input0->clone()->set_tensor_shape(tensor_shape0);
+ const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1);
+
+ const TensorInfo tensor_info_reshaped0 = input0->clone()->set_tensor_shape(compute_interleaved_shape(tensor_info0, mult_interleave4x4_height));
+ const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(tensor_info1, mult_transpose1xW_width));
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input0, &tensor_info_reshaped0);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);
+
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) != static_cast<size_t>(n));
+ ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != static_cast<size_t>(m));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, output);
+ }
}
return Status{};
@@ -122,12 +166,19 @@ CLGEMMMatrixMultiplyKernel::CLGEMMMatrixMultiplyKernel()
{
}
-void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha, bool is_interleaved_transposed)
+void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
+    // Output tensor auto initialization if not yet initialized
+ TensorShape tensor_shape{ input0->info()->tensor_shape() };
+ tensor_shape.set(0, is_interleaved_transposed ? reshape_info.n() : input1->info()->dimension(0));
+ tensor_shape.set(1, is_interleaved_transposed ? reshape_info.m() : input0->info()->dimension(1));
+
+ auto_init_if_empty(*output->info(), input0->info()->clone()->set_tensor_shape(tensor_shape));
+
// Perform validate step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), is_interleaved_transposed));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), is_interleaved_transposed, reshape_info));
_input0 = input0;
_input1 = input1;
@@ -176,7 +227,13 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen
std::string kernel_name;
if(is_interleaved_transposed)
{
+ const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width();
+ const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();
+
build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(input1->info()->dimension(0)));
+ build_opts.add_option("-DMULT_TRANSPOSE1XW_WIDTH=" + support::cpp11::to_string(mult_transpose1xW_width));
+ build_opts.add_option("-DMULT_INTERLEAVE4X4_HEIGHT=" + support::cpp11::to_string(mult_interleave4x4_height));
+
if(data_type == DataType::F32)
{
kernel_name = "gemm_mm_interleaved_transposed_f32_" + string_from_target(arch_target);
@@ -230,11 +287,13 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen
_config_id += (is_interleaved_transposed ? support::cpp11::to_string(input1->info()->dimension(0)) : support::cpp11::to_string(input1->info()->dimension(1)));
}
-Status CLGEMMMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved_transposed, GPUTarget gpu_target)
+Status CLGEMMMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved_transposed,
+ const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target)
{
+ // Note: num_elements_processed will be set in validate_and_configure_window()
ElementsProcessed num_elements_processed{};
ARM_COMPUTE_UNUSED(alpha);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, is_interleaved_transposed));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, is_interleaved_transposed, reshape_info));
ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
input1->clone().get(),
output->clone().get(),
diff --git a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
index 69a545b76b..63aed6df32 100644
--- a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -42,8 +42,9 @@ using namespace arm_compute::misc::shape_calculator;
namespace
{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, int mult_transpose1xW_width)
{
+ ARM_COMPUTE_RETURN_ERROR_ON(mult_transpose1xW_width < 1);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::U8, DataType::S8,
DataType::QS16, DataType::U16, DataType::S16, DataType::U32, DataType::S32,
DataType::F16, DataType::F32);
@@ -51,7 +52,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
if(output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(),
- compute_transpose1xW_with_element_size_shape(*input));
+ compute_transpose1xW_with_element_size_shape(*input, mult_transpose1xW_width));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
}
@@ -59,11 +60,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int &num_elems_processed_per_iteration)
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int &num_elems_processed_per_iteration, int mult_transpose1xW_width)
{
num_elems_processed_per_iteration = 16 / input->element_size();
- const int scale_x = num_elems_processed_per_iteration;
+ const int scale_x = num_elems_processed_per_iteration * mult_transpose1xW_width;
bool window_changed = false;
// Configure kernel window
@@ -90,25 +91,31 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
}
} // namespace
-void CLGEMMTranspose1xWKernel::configure(const ICLTensor *input, ICLTensor *output)
+void CLGEMMTranspose1xWKernel::configure(const ICLTensor *input, ICLTensor *output, int mult_transpose1xW_width)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
// Output tensor auto inizialitation if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*input->info())));
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*input->info(), mult_transpose1xW_width)));
// Perform validate step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), mult_transpose1xW_width));
_input = input;
_output = output;
// Configure kernel window
+ // Note: num_elems_processed_per_iteration will be set in validate_and_configure_window()
unsigned int num_elems_processed_per_iteration = 1;
- auto win_config = validate_and_configure_window(input->info(), output->info(), num_elems_processed_per_iteration);
+ auto win_config = validate_and_configure_window(input->info(), output->info(), num_elems_processed_per_iteration, mult_transpose1xW_width);
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
ICLKernel::configure(win_config.second);
+ // Create build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DTRANSPOSE_W=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+ build_opts.add_option("-DMULT_TRANSPOSE1XW_WIDTH=" + support::cpp11::to_string(mult_transpose1xW_width));
+
/*
* Following an example of how the transposition1xW works when the input data type is F32
*
@@ -117,18 +124,18 @@ void CLGEMMTranspose1xWKernel::configure(const ICLTensor *input, ICLTensor *outp
* |a20 a21 a22 a23| = | a00 a01 a02 a03 || a10 a11 a12 a13 || a20 a21 a22 a23 || a30 a31 a32 a33 |
* |a30 a31 a32 a33|
*
- * The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor)
+ * The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor) * mult_transpose1xW_width
*/
// Create kernel
- std::string kernel_name = "gemm_transpose1x" + support::cpp11::to_string(num_elems_processed_per_iteration);
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name));
+ std::string kernel_name = "gemm_transpose1xW";
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
}
-Status CLGEMMTranspose1xWKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+Status CLGEMMTranspose1xWKernel::validate(const ITensorInfo *input, const ITensorInfo *output, int mult_transpose1xW_width)
{
unsigned int num_elems_processed_per_iteration = 1;
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), num_elems_processed_per_iteration).first);
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mult_transpose1xW_width));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), num_elems_processed_per_iteration, mult_transpose1xW_width).first);
return Status{};
}
diff --git a/src/graph/Graph.cpp b/src/graph/Graph.cpp
index ac5316f55e..e14bea0846 100644
--- a/src/graph/Graph.cpp
+++ b/src/graph/Graph.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -215,11 +215,25 @@ void Graph::add_tensor_object(std::unique_ptr<ITensorObject> tensor)
_pimpl->_graph_output->allocate();
}
}
+
bool Graph::opencl_is_available()
{
return arm_compute::opencl_is_available();
}
+arm_compute::GPUTarget Graph::gpu_target()
+{
+ // Check if OpenCL is available before returning the GPU target
+ if(opencl_is_available())
+ {
+ return arm_compute::CLScheduler::get().target();
+ }
+ else
+ {
+ return GPUTarget::MIDGARD;
+ }
+}
+
void Graph::set_temp(TensorInfo &&tmp)
{
ARM_COMPUTE_ERROR_ON(_pimpl->_graph_input == nullptr);
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index 68c6576a79..e9d14db96e 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -55,7 +55,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I
}
else
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(&input, &weights, &output, 1.f, is_interleaved_transposed, gpu_target));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMMatrixMultiplyKernel::validate(&input, &weights, &output, 1.f, is_interleaved_transposed, GEMMReshapeInfo(), gpu_target));
}
return Status{};
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index c676a10978..a09849ab93 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -38,6 +38,30 @@
using namespace arm_compute;
+namespace
+{
+inline bool is_interleaved_transposed(int m, int n, int k, DataType data_type, bool reshape_b_only_on_first_run, GPUTarget gpu_target)
+{
+ bool flag = true;
+
+ if(gpu_target == GPUTarget::BIFROST)
+ {
+ // COMPMID-852
+ if(k > 256 && m > 4 && data_type == DataType::F32 && reshape_b_only_on_first_run)
+ {
+ const float scale = k < 1024 ? 2.0f : 2.5f;
+ flag = scale * n > 1.66f * n + 38.4f;
+ }
+ else
+ {
+ flag = false;
+ }
+ }
+
+ return flag;
+}
+} // namespace
+
CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _is_interleaved_transposed(false), _run_addition(false),
_is_first_run(true), _reshape_b_only_on_first_run(false)
@@ -62,18 +86,36 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *
ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
- // If the input tensor has less than 16 rows, we run a special version of GEMM without reshaping the input tensors
- // For Bifrost architectures we do not reshape the input matrices
- _is_interleaved_transposed = (a->info()->dimension(1) > 16 && CLScheduler::get().target() != GPUTarget::BIFROST);
-
// Check if we need to reshape the matrix B only on the first run
_reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
const ICLTensor *matrix_a = a;
const ICLTensor *matrix_b = b;
- // Set the target for the matrix multiply kernel
- _mm_kernel.set_target(CLScheduler::get().target());
+ // Get the GPU target
+ const GPUTarget gpu_target = CLScheduler::get().target();
+
+ // Set the target for the kernels
+ _interleave_kernel.set_target(gpu_target);
+ _mm_kernel.set_target(gpu_target);
+
+ // Arguments used by GEMMReshapeInfo
+    // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to GEMMReshapeInfo
+ // in order to know how the matrices have been reshaped
+ const int m = a->info()->dimension(1);
+ const int n = b->info()->dimension(0);
+ const int k = a->info()->dimension(0);
+ int mult_transpose1xW_width = 1;
+ int mult_interleave4x4_height = 1;
+
+ if(gpu_target == GPUTarget::BIFROST)
+ {
+ mult_transpose1xW_width = 4;
+ mult_interleave4x4_height = 2;
+ }
+
+ // Check if we need to reshape the matrix A and matrix B
+ _is_interleaved_transposed = is_interleaved_transposed(m, n, k, a->info()->data_type(), _reshape_b_only_on_first_run, gpu_target);
if(_is_interleaved_transposed)
{
@@ -83,17 +125,17 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *
// _tmp_a and _tmp_b will be auto configured in _interleave_kernel and in _transpose_kernel
// Configure interleave kernel
- _interleave_kernel.configure(a, &_tmp_a);
+ _interleave_kernel.configure(a, &_tmp_a, mult_interleave4x4_height);
// Configure transpose kernel
- _transpose_kernel.configure(b, &_tmp_b);
+ _transpose_kernel.configure(b, &_tmp_b, mult_transpose1xW_width);
// Manage intermediate buffers
_memory_group.manage(&_tmp_a);
_memory_group.manage(&_tmp_b);
}
- _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed);
+ _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height));
if(_is_interleaved_transposed)
{
diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
index 2cd426b82d..5f886a02c6 100644
--- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -148,8 +148,8 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
TensorInfo info_a(compute_interleaved_shape(*a), 1, a->data_type());
TensorInfo info_b(compute_transpose1xW_shape(*b), 1, b->data_type());
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMInterleave4x4Kernel::validate(a, &info_a));
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMTranspose1xWKernel::validate(b, &info_b));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMInterleave4x4Kernel::validate(a, &info_a, 1));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMTranspose1xWKernel::validate(b, &info_b, 1));
ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(&info_a, &info_b, output));
}
else