20 files changed, 19 insertions, 982 deletions
diff --git a/arm_compute/core/CL/CLKernels.h b/arm_compute/core/CL/CLKernels.h
index 70ffe3030b..57e0ade7da 100644
--- a/arm_compute/core/CL/CLKernels.h
+++ b/arm_compute/core/CL/CLKernels.h
@@ -73,7 +73,6 @@
 #include "arm_compute/core/CL/kernels/CLFlattenLayerKernel.h"
 #include "arm_compute/core/CL/kernels/CLFloorKernel.h"
 #include "arm_compute/core/CL/kernels/CLFuseBatchNormalizationKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
@@ -92,7 +91,6 @@
 #include "arm_compute/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h"
 #include "arm_compute/core/CL/kernels/CLGatherKernel.h"
 #include "arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h"
 #include "arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h"
diff --git a/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h b/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h
deleted file mode 100644
index 96b01b0237..0000000000
--- a/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef __ARM_COMPUTE_CLGEMMINTERLEAVE4X4KERNEL_H__
-#define __ARM_COMPUTE_CLGEMMINTERLEAVE4X4KERNEL_H__
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel which interleaves the elements of a matrix A in chunk of 4x4
- *
- * This function puts the values in a 4x4 block of Matrix A on the same row (Interleaved values)
- *
- * @f[
- * \left( \begin{array}{cccc}
- * a00 & a01 & a02 & a03 \\
- * a10 & a11 & a12 & a13 \\
- * a20 & a21 & a22 & a23 \\
- * a30 & a31 & a32 & a33 \\
- * \end{array} \right)
- * \rightarrow
- * \left( \begin{array}{ccccccccccccccccc}
- * a00 & a10 & a20 & a30 & a01 & a11 & a21 & a31 & a02 & a12 & a22 & a32 & a03 & a13 & a23 & a33 \\
- * \end{array} \right)
- * @f]
- *
- * After this operation, the output matrix will have the following shape: [ height * W, ceil(width / W) ] where W = 4 * mult_interleave4x4_height
- */
-class CLGEMMInterleave4x4Kernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLGEMMInterleave4x4Kernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMInterleave4x4Kernel(const CLGEMMInterleave4x4Kernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMInterleave4x4Kernel &operator=(const CLGEMMInterleave4x4Kernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLGEMMInterleave4x4Kernel(CLGEMMInterleave4x4Kernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLGEMMInterleave4x4Kernel &operator=(CLGEMMInterleave4x4Kernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input                     Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
-     * @param[out] output                    Output tensor. Data type supported: same as @p input
-     * @param[in]  mult_interleave4x4_height (Optional) Multiplication factor for the height of the 4x4 interleave block
-     * @param[in]  reinterpret_input_as_3d   (Optional) True if the input has to be reinterpreted as 3D tensor
-     * @param[in]  unroll_block              (Optional) True if the 4x4 block has to be unrolled rather than transposed
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, int mult_interleave4x4_height = 1, bool reinterpret_input_as_3d = false, bool unroll_block = false);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMInterleave4x4Kernel
-     *
-     * @param[in] input                     Input tensor info. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
-     * @param[in] output                    Output tensor info which stores the interleaved matrix. Data type supported: same as @p input.
-     * @param[in] mult_interleave4x4_height Multiplication factor for the height of the 4x4 interleave block
-     * @param[in] reinterpret_input_as_3d   True if the input has to be reinterpreted as 3D tensor
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, int mult_interleave4x4_height, bool reinterpret_input_as_3d);
-
-    // Inherited methods overridden
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-    bool             _reinterpret_input_as_3d;
-};
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_CLGEMMINTERLEAVE4X4KERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h
index 616c269b0d..e576271780 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h
@@ -58,7 +58,7 @@ public:
      * @param[in]  input0                    Input tensor containing the interleaved Matrix A. Data type supported: QASYMM8
      * @param[in]  input1                    Input tensor containing the transposed1xW Matrix B. Data type supported: same as @p input0
      * @param[out] output                    Output tensor to store the result of matrix multiplication. Data type supported: S32
-     * @param[in]  is_interleaved_transposed (Optional) True if input0 and input1 have been reshaped respectively using @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMReshapeRHSMatrixKernel
+     * @param[in]  is_interleaved_transposed (Optional) True if input0 and input1 have been reshaped respectively using @ref CLGEMMReshapeLHSMatrixKernel and @ref CLGEMMReshapeRHSMatrixKernel
      * @param[in]  reshape_info              (Optional) GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how the matrix A and matrix B have been reshaped
      */
     void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, bool is_interleaved_transposed = true, const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo());
@@ -67,7 +67,7 @@ public:
      * @param[in] input0                    Input tensor info containing the interleaved Matrix A. Data type supported: QASYMM8
      * @param[in] input1                    Input tensor info containing the transposed Matrix B. Data type supported: same as @p input0
      * @param[in] output                    Output tensor info to store the result of matrix multiplication. Data type supported: S32
-     * @param[in] is_interleaved_transposed True if input0 and input1 have been reshaped respectively using @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMReshapeRHSMatrixKernel
+     * @param[in] is_interleaved_transposed True if input0 and input1 have been reshaped respectively using @ref CLGEMMReshapeLHSMatrixKernel and @ref CLGEMMReshapeRHSMatrixKernel
      * @param[in] reshape_info              GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how the matrix A and matrix B have been reshaped
      *
      * @return a status
diff --git a/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h b/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h
deleted file mode 100644
index 47a4ad515b..0000000000
--- a/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef __ARM_COMPUTE_CLGEMMTRANSPOSE1XWKERNEL_H__
-#define __ARM_COMPUTE_CLGEMMTRANSPOSE1XWKERNEL_H__
-
-#include "arm_compute/core/CL/ICLSimple2DKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel which transposes the elements of a matrix in chunks of 1xW, where W is equal to (16 / element size of the tensor)
- *
- * Following an example of how the transposition1xW works when the input data type is F32
- *
- * @f[
- * \left( \begin{array}{cccc}
- * a00 & a01 & a02 & a03 \\
- * a10 & a11 & a12 & a13 \\
- * a20 & a21 & a22 & a23 \\
- * a30 & a31 & a32 & a33 \\
- * \end{array} \right)
- * \rightarrow
- * \left( \begin{array}{ccccccccccccccccc}
- * a00 & a01 & a02 & a03 & a10 & a11 & a12 & a13 & a20 & a21 & a22 & a23 & a30 & a31 & a32 & a33 \\
- * \end{array} \right)
- * @f]
- *
- * Following an example of how the transposition1xW works when the input data type is F16
- *
- * @f[
- * \left( \begin{array}{cccccccc}
- * a00 & a01 & a02 & a03 & a04 & a05 & a06 & a7 \\
- * a10 & a11 & a12 & a13 & a14 & a15 & a16 & 17 \\
- * a20 & a21 & a22 & a23 & a24 & a25 & a26 & 27 \\
- * a30 & a31 & a32 & a33 & a34 & a35 & a36 & 37 \\
- * \end{array} \right)
- * \rightarrow
- * \left( \begin{array}{cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc}
- * a00 & a01 & a02 & a03 & a04 & a05 & a06 & a07 & a10 & a11 & a12 & a13 & a14 & a15 & a16 & a17 & a20 & a21 & a22 & a23 & a24 & a25 & a26 & a27 & a30 & a31 & a32 & a33 & a34 & a35 & a36 & a37\\
- * \end{array} \right)
- * @f]
- *
- * @note The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor) * mult_transpose1xW_width
- *
- */
-class CLGEMMTranspose1xWKernel : public ICLSimple2DKernel
-{
-public:
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input                   Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
-     * @param[out] output                  Output tensor. Data type supported: same as @p input
-     * @param[in]  mult_transpose1xW_width (Optional) Multiplication factor for the width of the 1xW transposed block
-     */
-    void configure(const ICLTensor *input, ICLTensor *output, int mult_transpose1xW_width = 1);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMTranspose1xWKernel
-     *
-     * @param[in] input                   Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
-     * @param[in] output                  Output tensor. Data type supported: same as @p input.
-     * @param[in] mult_transpose1xW_width Multiplication factor for the width of the 1xW transposed block
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, int mult_transpose1xW_width);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-};
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_CLGEMMTRANSPOSE1XWKERNEL_H__ */
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index 8e0173c492..83ab2d755a 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -1665,8 +1665,8 @@ private:
 
 /** GEMM reshape information class. This class stores the necessary information about matrix A and matrix B reshape.
  *
- * The matrix A can only be reshaped through @ref CLGEMMInterleave4x4Kernel or  @ref NEGEMMInterleave4x4Kernel or  @ref GCGEMMInterleave4x4Kernel
- * Note: Optionally just for @ref CLGEMMInterleave4x4Kernel is it possible to set mult_interleave4x4_height, the multiplication factor for the height of the 4x4 interleaved block
+ * The matrix A can only be reshaped through @ref CLGEMMReshapeLHSMatrixKernel or  @ref NEGEMMInterleave4x4Kernel or  @ref GCGEMMInterleave4x4Kernel
+ * Note: Optionally just for @ref CLGEMMReshapeLHSMatrixKernel is it possible to set mult_interleave4x4_height, the multiplication factor for the height of the 4x4 interleaved block
  *
  * The matrix B can only be reshaped through @ref CLGEMMReshapeRHSMatrixKernel or  @ref NEGEMMTranspose1xWKernel or  @ref GCGEMMTranspose1xWKernel
  * Note: Optionally just for @ref CLGEMMReshapeRHSMatrixKernel is it possible to set mult_transpose1xW_width, the multiplication factor for the width of the 1xW transposed block
diff --git a/arm_compute/runtime/CL/CLFunctions.h b/arm_compute/runtime/CL/CLFunctions.h
index 129be4b307..e314f44370 100644
--- a/arm_compute/runtime/CL/CLFunctions.h
+++ b/arm_compute/runtime/CL/CLFunctions.h
@@ -78,10 +78,8 @@
 #include "arm_compute/runtime/CL/functions/CLGEMM.h"
 #include "arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h"
 #include "arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h"
-#include "arm_compute/runtime/CL/functions/CLGEMMInterleave4x4.h"
 #include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
 #include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h"
-#include "arm_compute/runtime/CL/functions/CLGEMMTranspose1xW.h"
 #include "arm_compute/runtime/CL/functions/CLGather.h"
 #include "arm_compute/runtime/CL/functions/CLGaussian3x3.h"
 #include "arm_compute/runtime/CL/functions/CLGaussian5x5.h"
diff --git a/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h
index b304576f33..e9a3f9bf2b 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h
@@ -28,7 +28,6 @@
 
 #include "arm_compute/core/CL/kernels/CLCol2ImKernel.h"
 #include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
 #include "arm_compute/core/CL/kernels/CLIm2ColKernel.h"
 #include "arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h"
@@ -120,7 +119,7 @@ public:
      *                          Data types supported: Same as @p input.
      * @param[in]  conv_info    Contains padding and stride information described in @ref PadStrideInfo.
      * @param[in]  weights_info Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel. If this is not part of the fully connected layer the weights
-     *                          tensor has also been transposed with CLGEMMTranspose1xWKernel. Data type supported: Same as @p input.
+     *                          tensor has also been transposed with CLGEMMReshapeRHSMatrixKernel. Data type supported: Same as @p input.
      * @param[in]  dilation     (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
      * @param[in]  act_info     (Optional) Activation layer information in case of a fused activation.
      * @param[in]  num_groups   (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
@@ -139,7 +138,7 @@ public:
      *                          Data types supported: Same as @p input.
      * @param[in]  conv_info    Contains padding and stride information described in @ref PadStrideInfo.
      * @param[in]  weights_info Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel. If this is not part of the fully connected layer the weights
-     *                          tensor has also been transposed with CLGEMMTranspose1xWKernel. Data type supported: Same as @p input.
+     *                          tensor has also been transposed with CLGEMMReshapeRHSMatrixKernel. Data type supported: Same as @p input.
      * @param[in]  dilation     (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
      * @param[in]  act_info     (Optional) Activation layer information in case of a fused activation.
      * @param[in]  num_groups   (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
diff --git a/arm_compute/runtime/CL/functions/CLGEMMInterleave4x4.h b/arm_compute/runtime/CL/functions/CLGEMMInterleave4x4.h
deleted file mode 100644
index 85f0d94f62..0000000000
--- a/arm_compute/runtime/CL/functions/CLGEMMInterleave4x4.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef __ARM_COMPUTE_CLGEMMINTERLEAVE4X4_H__
-#define __ARM_COMPUTE_CLGEMMINTERLEAVE4X4_H__
-
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to execute CLGEMMInterleave4x4Kernel. This function calls the following OpenCL kernel:
- *
- *  -# @ref CLGEMMInterleave4x4Kernel
- *
- * @deprecated This function is deprecated and will be removed in release 19.08
- *
- */
-class CLGEMMInterleave4x4 : public ICLSimpleFunction
-{
-public:
-    /** Initialise the kernel's inputs, output
-     *
-     * @param[in]  input  First input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
-     * @param[out] output Output tensor. Data type supported: same as @p input
-     */
-    void configure(const ICLTensor *input, ICLTensor *output);
-};
-}
-
-#endif /* __ARM_COMPUTE_CLGEMMINTERLEAVE4X4_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLGEMMTranspose1xW.h b/arm_compute/runtime/CL/functions/CLGEMMTranspose1xW.h
deleted file mode 100644
index 8026abfd61..0000000000
--- a/arm_compute/runtime/CL/functions/CLGEMMTranspose1xW.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef __ARM_COMPUTE_CLGEMMTRANSPOSE1XW_H__
-#define __ARM_COMPUTE_CLGEMMTRANSPOSE1XW_H__
-
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-namespace arm_compute
-{
-/** Basic function to execute CLGEMMTranspose1xWKernel. This function calls the following OpenCL kernels:
- *
- *  -# @ref CLGEMMTranspose1xWKernel
- *
- * @deprecated This function is deprecated and will be removed in release 19.08
- *
- */
-class CLGEMMTranspose1xW : public ICLSimpleFunction
-{
-public:
-    /** Initialise the kernel's inputs, output
-     *
-     * @param[in]  input  First input tensor. Data type supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
-     * @param[out] output Output tensor. Data type supported: same as @p input
-     */
-    void configure(const ICLTensor *input, ICLTensor *output);
-};
-}
-#endif /*__ARM_COMPUTE_CLGEMMTRANSPOSE1XW_H__ */
diff --git a/docs/00_introduction.dox b/docs/00_introduction.dox
index 57d8c4a3f2..03e889d14a 100644
--- a/docs/00_introduction.dox
+++ b/docs/00_introduction.dox
@@ -272,8 +272,8 @@ v19.05 Public major release
     - @ref NEDepthConcatenateLayer
     - @ref CLWidthConcatenateLayer
     - @ref CLDepthConcatenateLayer
-    - @ref CLGEMMInterleave4x4
-    - @ref CLGEMMTranspose1xW
+    - CLGEMMInterleave4x4
+    - CLGEMMTranspose1xW
  - Support different quantization info in CLConcatLayer.
  - Add checks on different input/output quantization info were not supported.
  - Tensors have different quantization information.
@@ -688,7 +688,7 @@ v17.09 Public major release
     - @ref CLDirectConvolutionLayerKernel / @ref CLDirectConvolutionLayer
     - @ref CLFlattenLayer
     - @ref CLFloorKernel / @ref CLFloor
-    - @ref CLGEMMTranspose1xW
+    - CLGEMMTranspose1xW
     - @ref CLGEMMMatrixVectorMultiplyKernel
     - @ref CLL2NormalizeLayerKernel / @ref CLL2NormalizeLayer
     - @ref CLQuantizationLayerKernel @ref CLMinMaxLayerKernel / @ref CLQuantizationLayer
@@ -766,7 +766,7 @@ v17.03.1 First Major public release of the sources
 v17.03 Sources preview
  - New OpenCL kernels / functions:
    - @ref CLGradientKernel, @ref CLEdgeNonMaxSuppressionKernel, @ref CLEdgeTraceKernel / @ref CLCannyEdge
-   - GEMM refactoring + FP16 support: @ref CLGEMMInterleave4x4Kernel, @ref CLGEMMTranspose1xWKernel, @ref CLGEMMMatrixMultiplyKernel, @ref CLGEMMMatrixAdditionKernel / @ref CLGEMM
+   - GEMM refactoring + FP16 support: CLGEMMInterleave4x4Kernel, CLGEMMTranspose1xWKernel, @ref CLGEMMMatrixMultiplyKernel, @ref CLGEMMMatrixAdditionKernel / @ref CLGEMM
    - @ref CLGEMMMatrixAccumulateBiasesKernel / @ref CLFullyConnectedLayer
    - @ref CLTransposeKernel / @ref CLTranspose
    - @ref CLLKTrackerInitKernel, @ref CLLKTrackerStage0Kernel, @ref CLLKTrackerStage1Kernel, @ref CLLKTrackerFinalizeKernel / @ref CLOpticalFlow
@@ -1232,20 +1232,20 @@ recommended to follow one of the options outlined below.
 
 @subsubsection S3_5_1_ubuntu_on_windows Bash on Ubuntu on Windows
 
-The best and easiest option is to use 
-<a href="https://msdn.microsoft.com/en-gb/commandline/wsl/about">Ubuntu on Windows</a>. 
+The best and easiest option is to use
+<a href="https://msdn.microsoft.com/en-gb/commandline/wsl/about">Ubuntu on Windows</a>.
 This feature is still marked as *beta* and thus might not be available.
 However, if it is building the library is as simple as opening a *Bash on
 Ubuntu on Windows* shell and following the general guidelines given above.
 
 @subsubsection S3_5_2_cygwin Cygwin
 
-If the Windows subsystem for Linux is not available <a href="https://www.cygwin.com/">Cygwin</a> 
+If the Windows subsystem for Linux is not available <a href="https://www.cygwin.com/">Cygwin</a>
 can be used to install and run `scons`. In addition to the default packages
 installed by Cygwin `scons` has to be selected in the installer. (`git` might
 also be useful but is not strictly required if you already have got the source
-code of the library.) Linaro provides pre-built versions of 
-<a href="http://releases.linaro.org/components/toolchain/binaries/">GCC cross-compilers</a> 
+code of the library.) Linaro provides pre-built versions of
+<a href="http://releases.linaro.org/components/toolchain/binaries/">GCC cross-compilers</a>
 that can be used from the Cygwin terminal. When building for Android the
 compiler is included in the Android standalone toolchain. After everything has
 been set up in the Cygwin terminal the general guide on building the library
diff --git a/docs/05_functions_list.dox b/docs/05_functions_list.dox
index 7d6728d427..9a5c8c0027 100644
--- a/docs/05_functions_list.dox
+++ b/docs/05_functions_list.dox
@@ -285,11 +285,9 @@ namespace arm_compute
         - @ref CLFullyConnectedLayerReshapeWeights
         - @ref CLGather
         - @ref CLGaussian3x3
-        - @ref CLGEMMInterleave4x4
         - @ref CLGEMMLowpQuantizeDownInt32ToUint8Scale
         - @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint
         - @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloat
-        - @ref CLGEMMTranspose1xW
         - @ref CLMagnitude
         - @ref CLMedian3x3
         - @ref CLNonLinearFilter
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index dfcbfa7cc5..28152168bc 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -299,7 +299,6 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
     { "gaussian1x5_sub_x", "gaussian_pyramid.cl" },
     { "gaussian5x1_sub_y", "gaussian_pyramid.cl" },
     { "gemm_accumulate_biases", "gemm.cl" },
-    { "gemm_interleave4x4", "gemm.cl" },
     { "gemm_ma_f16", "gemm.cl" },
     { "gemm_ma_f32", "gemm.cl" },
     { "gemm_mv", "gemv.cl" },
@@ -319,7 +318,6 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
     { "gemm_mm_reshaped_only_rhs_nt", "gemm.cl" },
     { "gemm_mm_reshaped_only_rhs_t", "gemm.cl" },
     { "gemm_lc_vm_f32", "gemm.cl" },
-    { "gemm_transpose1xW", "gemm.cl" },
     { "gemm_reshape_lhs_matrix_nt", "gemm.cl" },
     { "gemm_reshape_lhs_matrix_t", "gemm.cl" },
     { "gemm_reshape_rhs_matrix_nt", "gemm.cl" },
diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl
index da45d0fc18..41e5c338b3 100644
--- a/src/core/CL/cl_kernels/gemm.cl
+++ b/src/core/CL/cl_kernels/gemm.cl
@@ -2205,187 +2205,6 @@ __kernel void gemm_mm_native(IMAGE_DECLARATION(lhs),
 }
 #endif // defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE)
 
-#if defined(TRANSPOSE_W) && defined(MULT_TRANSPOSE1XW_WIDTH)
-
-#if ELEMENT_SIZE == 1
-#define DATA_TYPE uchar
-#elif ELEMENT_SIZE == 2
-#define DATA_TYPE ushort
-#elif ELEMENT_SIZE == 4
-#define DATA_TYPE uint
-#else // ELEMENT_SIZE == 1
-#error "Element size not supported"
-#endif // ELEMENT_SIZE
-
-/** This OpenCL kernel computes the "vector" 1xW transposition of input matrix
- *
- * @note The transposition width must be passed at compile time using -DTRANSPOSE_W (i.e. -DTRANSPOSE_W)
- * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)
- *
- * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr                           Pointer to the destination matrix Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- */
-__kernel void gemm_transpose1xW(TENSOR3D_DECLARATION(src),
-                                TENSOR3D_DECLARATION(dst))
-{
-    uint x = get_global_id(0);
-    uint y = get_global_id(1);
-    uint z = get_global_id(2);
-
-    // Compute address for Matrix B - source
-    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
-
-    // Compute address for Matrix B transposed - destination. X and Y are swapped
-    uint dst_addr_in_bytes = dst_offset_first_element_in_bytes + y * TRANSPOSE_W * sizeof(DATA_TYPE) * MULT_TRANSPOSE1XW_WIDTH + (x / MULT_TRANSPOSE1XW_WIDTH) * dst_stride_y +
-                             (x % MULT_TRANSPOSE1XW_WIDTH) * TRANSPOSE_W * sizeof(DATA_TYPE);
-
-    // Add offset for batched GEMM
-    dst_addr_in_bytes += z * dst_stride_z;
-
-    VEC_DATA_TYPE(DATA_TYPE, TRANSPOSE_W)
-    b0 = VLOAD(TRANSPOSE_W)(0, (__global DATA_TYPE *)src.ptr);
-
-    VSTORE(TRANSPOSE_W)
-    (b0, 0, (__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes));
-}
-#endif // defined(TRANSPOSE_W) && defined(MULT_TRANSPOSE1XW_WIDTH)
-
-#if defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(DATA_TYPE)
-
-/** This OpenCL kernel reshapes the input matrix transposing each 4x4 block. If -DUNROLL_BLOCK is passed at compile time, the 4x4 block
- * will be simply unrolled.
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=float)
- * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
- * @note In case the input has to be reinterpreted as a 3D tensor (i.e. input of convolution layer 1x1), the following information must be passed at compile time:
- *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
- *       -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D tensor.
- *       -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor
- *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
- *
- * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr                           Pointer to the destination matrix Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in]  cross_plane_pad                   (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
- */
-__kernel void gemm_interleave4x4(TENSOR3D_DECLARATION(src),
-                                 TENSOR3D_DECLARATION(dst)
-#if defined(REINTERPRET_INPUT_AS_3D)
-                                 ,
-                                 uint cross_plane_pad
-#endif // REINTERPRET_INPUT_AS_3D
-                                )
-{
-    // Compute source and destination addresses
-    uint x = get_global_id(0);
-    uint y = get_global_id(1);
-    uint z = get_global_id(2);
-
-    // Compute address for source tensor
-    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
-
-    // Compute address for Matrix B transposed - destination. X and Y are swapped
-    uint dst_addr_in_bytes = dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * 16 * MULT_INTERLEAVE4X4_HEIGHT + (y / MULT_INTERLEAVE4X4_HEIGHT) * dst_stride_y +
-                             (y % MULT_INTERLEAVE4X4_HEIGHT) * 4 * sizeof(DATA_TYPE);
-
-    // Add offset for batched GEMM
-    dst_addr_in_bytes += z * dst_stride_z;
-
-#if defined(REINTERPRET_INPUT_AS_3D)
-    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * 4 * sizeof(DATA_TYPE) + y * 4 * src_stride_y;
-
-    // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
-    // in order to take into account the presence of possible cross plane paddings
-    //
-    //  |                  |
-    //  |      plane0      |
-    //  |                  |
-    //  |__________________|
-    //  |******************|
-    //  |  cross_plane_pad |
-    //  |******************|
-    //  |                  |
-    //  |      plane1      |
-    //  |                  |
-    //  |__________________|
-
-    // The plane (zin) is calculated dividing M (y * 4) by HEIGHT_GEMM3D
-    uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(y * 4)) / (uint4)HEIGHT_GEMM3D;
-    zin       = min(DEPTH_GEMM3D - 1, zin);
-
-    // Add offset due to the cross plane paddings
-    zin *= (cross_plane_pad * src_stride_y);
-
-    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
-    // multiply src_stride_z by DEPTH_GEMM3D
-    input_ptr += z * src_stride_z * DEPTH_GEMM3D;
-
-    // Load values from Matrix A
-    LOAD_BLOCK(4, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin.s);
-
-#else  // defined(REINTERPRET_INPUT_AS_3D)
-    __global uchar *input_ptr = src.ptr;
-
-    // Load values from Matrix A
-    VEC_DATA_TYPE(DATA_TYPE, 4)
-    a0 = vload4(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
-    VEC_DATA_TYPE(DATA_TYPE, 4)
-    a1 = vload4(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
-    VEC_DATA_TYPE(DATA_TYPE, 4)
-    a2 = vload4(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));
-    VEC_DATA_TYPE(DATA_TYPE, 4)
-    a3 = vload4(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y));
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
-#if defined(UNROLL_BLOCK)
-    vstore4(a0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 0 * MULT_INTERLEAVE4X4_HEIGHT));
-    vstore4(a1, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 4 * MULT_INTERLEAVE4X4_HEIGHT));
-    vstore4(a2, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 8 * MULT_INTERLEAVE4X4_HEIGHT));
-    vstore4(a3, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 12 * MULT_INTERLEAVE4X4_HEIGHT));
-#else  // defined(UNROLL_BLOCK)
-    VEC_DATA_TYPE(DATA_TYPE, 4)
-    val0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s0, a1.s0, a2.s0, a3.s0);
-    vstore4(val0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 0 * MULT_INTERLEAVE4X4_HEIGHT));
-
-    val0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s1, a1.s1, a2.s1, a3.s1);
-    vstore4(val0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 4 * MULT_INTERLEAVE4X4_HEIGHT));
-
-    val0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s2, a1.s2, a2.s2, a3.s2);
-    vstore4(val0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 8 * MULT_INTERLEAVE4X4_HEIGHT));
-
-    val0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s3, a1.s3, a2.s3, a3.s3);
-    vstore4(val0, 0, ((__global DATA_TYPE *)(dst_ptr + dst_addr_in_bytes) + 12 * MULT_INTERLEAVE4X4_HEIGHT));
-#endif // defined(UNROLL_BLOCK)
-}
-#endif // defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(DATA_TYPE)
-
 #if defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT)
 /** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
  *  Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication
diff --git a/src/core/CL/cl_kernels/gemm_helpers.h b/src/core/CL/cl_kernels/gemm_helpers.h
index 5bc897b859..c9e548afb8 100644
--- a/src/core/CL/cl_kernels/gemm_helpers.h
+++ b/src/core/CL/cl_kernels/gemm_helpers.h
@@ -154,7 +154,7 @@
 
 #define CALCULATE_Z_OFFSET_8(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
     CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)     \
-    Z##7 = (1 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D;                              \
+    Z##7 = (7 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D;                              \
     Z##7 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##7);                                                     \
     Z##7 *= (CROSS_PLANE_PAD * STRIDE_Y);
 
diff --git a/src/core/CL/cl_kernels/gemmlowp.cl b/src/core/CL/cl_kernels/gemmlowp.cl
index 033b4b4942..18ccb65aaf 100644
--- a/src/core/CL/cl_kernels/gemmlowp.cl
+++ b/src/core/CL/cl_kernels/gemmlowp.cl
@@ -35,7 +35,7 @@
 
 #if defined(COLS_B) && defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(TRANSPOSE1XW_WIDTH_STEP)
 /** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
- *  Matrix A and matrix B must be reshaped respectively with @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel before running the matrix multiplication
+ *  Matrix A and matrix B must be reshaped respectively with @ref CLGEMMReshapeLHSMatrixKernel and @ref CLGEMMReshapeRHSMatrixKernel before running the matrix multiplication
  *
  * @note The number of matrix B columns needs to be passed at compile time using -DCOLS_B: e.g. -DCOLS_B=1024
  * @note The transposition width step (mult_transpose1xW_width * 4) must be passed at compile time using -DTRANSPOSE1XW_WIDTH_STEP (i.e. -DTRANSPOSE1XW_WIDTH_STEP=2)
@@ -195,7 +195,7 @@ __kernel void gemmlowp_mm_interleaved_transposed_midgard(IMAGE_DECLARATION(src0)
 }
 
 /** This OpenCL kernel is optimized for Bifrost and computes the matrix multiplication between matrix A (src0) and matrix B (src1)
- *  Matrix A and matrix B must be reshaped respectively with @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel before running the matrix multiplication
+ *  Matrix A and matrix B must be reshaped respectively with @ref CLGEMMReshapeLHSMatrixKernel and @ref CLGEMMReshapeRHSMatrixKernel before running the matrix multiplication
  *
  * @attention The number of matrix B columns needs to be passed at compile time using -DCOLS_B
  * @note The transposition width step (mult_transpose1xW_width * 4) must be passed at compile time using -DTRANSPOSE1XW_WIDTH_STEP (i.e. -DTRANSPOSE1XW_WIDTH_STEP=2)
@@ -554,7 +554,7 @@ __kernel void gemmlowp_mm_interleaved_transposed_bifrost(IMAGE_DECLARATION(src0)
 
 #if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
 /** This OpenCL kernel is optimized for Bifrost and computes the matrix multiplication between matrix A (src0) and matrix B (src1)
- *  Matrix A and matrix B must be reshaped respectively with @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel before running the matrix multiplication
+ *  Matrix A and matrix B must be reshaped respectively with @ref CLGEMMReshapeLHSMatrixKernel and @ref CLGEMMReshapeRHSMatrixKernel before running the matrix multiplication
  *
  * @attention The number of matrix B columns needs to be passed at compile time using -DCOLS_B
  * @note The transposition width step (mult_transpose1xW_width * 4) must be passed at compile time using -DTRANSPOSE1XW_WIDTH_STEP (i.e. -DTRANSPOSE1XW_WIDTH_STEP=2)
diff --git a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
deleted file mode 100644
index 0857702eee..0000000000
--- a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
+++ /dev/null
@@ -1,218 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-
-using namespace arm_compute;
-using namespace arm_compute::misc::shape_calculator;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, int mult_interleave4x4_height, bool reinterpret_input_as_3d)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON(mult_interleave4x4_height < 1);
-    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::U8, DataType::S8,
-                                                         DataType::U16, DataType::S16, DataType::U32, DataType::S32,
-                                                         DataType::F16, DataType::F32);
-
-    if(output->total_size() != 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_interleaved_shape(*input, mult_interleave4x4_height, reinterpret_input_as_3d));
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
-    }
-
-    return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, int mult_interleave4x4_height, bool reinterpret_input_as_3d)
-{
-    constexpr unsigned int num_elems_processed_per_iteration_x = 4;
-    constexpr unsigned int num_elems_processed_per_iteration_y = 4;
-    const unsigned int     num_elems_written_per_iteration     = num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y * mult_interleave4x4_height;
-    bool                   window_changed                      = false;
-
-    TensorInfo tmp_info(*input);
-
-    if(reinterpret_input_as_3d)
-    {
-        // Since the input tensor has to be reinterpreted as 3D and the execute window is based on a 2D interleave,
-        // the window needs to be constructed on the 2D collapsed version of the tensor
-        TensorShape tmp_shape(input->tensor_shape());
-        tmp_shape.collapse(2U, 1U);
-        tmp_info.set_tensor_shape(tmp_shape);
-    }
-
-    // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_interleaved_shape(*input, mult_interleave4x4_height)));
-
-    // Configure window
-    const float scale_x = 4.0f * static_cast<float>(mult_interleave4x4_height);
-    const float scale_y = 1.0f / (scale_x);
-
-    // Note: bottom paddings are calculated manually as the input can be reinterpreted as 3D tensor
-    // The only way to set properly the paddings, it is to set those explicitly through the AccessWindowStatic
-    const int m          = reinterpret_input_as_3d ? input->tensor_shape()[1] * input->tensor_shape()[2] : input->tensor_shape()[1];
-    const int bottom_pad = (num_elems_processed_per_iteration_y - (m % num_elems_processed_per_iteration_y)) % num_elems_processed_per_iteration_y;
-
-    Window win    = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-    Window win_in = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
-    AccessWindowStatic input_access(input, 0, 0,
-                                    ceil_to_multiple(input->dimension(0), num_elems_processed_per_iteration_x),
-                                    input->dimension(1) + bottom_pad);
-    AccessWindowRectangle output_access(output, 0, 0, num_elems_written_per_iteration, 1, scale_x, scale_y);
-
-    window_changed = update_window_and_padding(win_in, input_access) || // window used by the execute_window_loop
-                     update_window_and_padding(win, output_access);     // window used to update the padding requirements of output tensor
-    output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
-
-    // Collapse along the Z direction
-    // This collapse needs to be here in order to tune the Z dimension of LWS
-    Window collapsed = win.collapse(win, Window::DimZ);
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, collapsed);
-}
-} // namespace
-
-CLGEMMInterleave4x4Kernel::CLGEMMInterleave4x4Kernel()
-    : _input(nullptr), _output(nullptr), _reinterpret_input_as_3d(false)
-{
-}
-
-void CLGEMMInterleave4x4Kernel::configure(const ICLTensor *input, ICLTensor *output, int mult_interleave4x4_height, bool reinterpret_input_as_3d, bool unroll_block)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-    // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_interleaved_shape(*input->info(), mult_interleave4x4_height, reinterpret_input_as_3d)));
-
-    // Perform validate step
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), mult_interleave4x4_height, reinterpret_input_as_3d));
-
-    _input                   = input;
-    _output                  = output;
-    _reinterpret_input_as_3d = reinterpret_input_as_3d;
-
-    // Create build options
-    CLBuildOptions build_opts;
-    build_opts.add_option("-DMULT_INTERLEAVE4X4_HEIGHT=" + support::cpp11::to_string(mult_interleave4x4_height));
-    build_opts.add_option_if(unroll_block, "-DUNROLL_BLOCK");
-    build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
-    build_opts.add_option_if(_reinterpret_input_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(input->info()->dimension(1)));
-    build_opts.add_option_if(_reinterpret_input_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(input->info()->dimension(2)));
-
-    switch(input->info()->element_size())
-    {
-        case 1:
-            build_opts.add_option("-DDATA_TYPE=uchar");
-            break;
-        case 2:
-            build_opts.add_option("-DDATA_TYPE=ushort");
-            break;
-        case 4:
-            build_opts.add_option("-DDATA_TYPE=uint");
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Data type not supported");
-    }
-
-    // Create kernel
-    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_interleave4x4", build_opts.options()));
-
-    // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), output->info(), mult_interleave4x4_height, reinterpret_input_as_3d);
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    ICLKernel::configure_internal(win_config.second);
-
-    // Set config_id for enabling LWS tuning
-    _config_id = "interleave4x4_";
-    _config_id += (_reinterpret_input_as_3d ? "3d_" : "");
-    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(0));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(1));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(2));
-    _config_id += "_";
-    _config_id += support::cpp11::to_string(output->info()->dimension(3));
-}
-
-Status CLGEMMInterleave4x4Kernel::validate(const ITensorInfo *input, const ITensorInfo *output, int mult_interleave4x4_height, bool reinterpret_input_as_3d)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mult_interleave4x4_height, reinterpret_input_as_3d));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), mult_interleave4x4_height, reinterpret_input_as_3d).first);
-
-    return Status{};
-}
-
-void CLGEMMInterleave4x4Kernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    /*
-     * This kernel puts the values in a 4x4 block of Matrix A on the same row (Interleaved values)
-     *         |a00 a01 a02 a03|
-     *         |a10 a11 a12 a13|
-     *         |a20 a21 a22 a23| = | a00 a10 a20 a30 || a01 a11 a21 a31 || a02 a12 a22 a32 || a03 a13 a23 a33 |
-     *         |a30 a31 a32 a33|
-     *
-     * After this operation, the output matrix will have the following shape: [ height * 4, width / 4 ]
-     */
-    Window slice = window.first_slice_window_3D();
-
-    if(_reinterpret_input_as_3d)
-    {
-        // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
-        const unsigned int idx0                  = 2 * num_arguments_per_3D_tensor();
-        const unsigned int total_cross_plane_pad = _input->info()->padding().top + _input->info()->padding().bottom;
-        _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
-    }
-
-    do
-    {
-        unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _input, slice);
-        add_3D_tensor_argument(idx, _output, slice);
-        enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_3D(slice));
-}
diff --git a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
deleted file mode 100644
index 986a009805..0000000000
--- a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
+++ /dev/null
@@ -1,163 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-
-#include <cmath>
-
-using namespace arm_compute;
-using namespace arm_compute::misc::shape_calculator;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, int mult_transpose1xW_width)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON(mult_transpose1xW_width < 1);
-    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::U8, DataType::S8,
-                                                         DataType::U16, DataType::S16, DataType::U32, DataType::S32,
-                                                         DataType::F16, DataType::F32);
-
-    if(output->total_size() != 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(),
-                                                           compute_transpose1xW_with_element_size_shape(*input, mult_transpose1xW_width));
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
-    }
-
-    return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int &num_elems_processed_per_iteration, int mult_transpose1xW_width)
-{
-    num_elems_processed_per_iteration = 16 / input->element_size();
-
-    const int scale_x        = num_elems_processed_per_iteration * mult_transpose1xW_width;
-    bool      window_changed = false;
-
-    // Configure kernel window
-    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-
-    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
-
-    // Output tensor auto inizialitation if not yet initialized
-    auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*input, mult_transpose1xW_width)));
-
-    // Configure window in case of configured output
-    AccessWindowStatic output_access(output, 0, 0, ceil_to_multiple(output->dimension(0), scale_x), output->dimension(1));
-    window_changed = window_changed || update_window_and_padding(win, input_access, output_access);
-    output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), input->tensor_shape()));
-
-    // Collapse along the Z direction
-    Window collapsed = win.collapse(win, Window::DimZ);
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, collapsed);
-}
-} // namespace
-
-void CLGEMMTranspose1xWKernel::configure(const ICLTensor *input, ICLTensor *output, int mult_transpose1xW_width)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-    // Output tensor auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*input->info(), mult_transpose1xW_width)));
-
-    // Perform validate step
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), mult_transpose1xW_width));
-
-    _input  = input;
-    _output = output;
-
-    // Configure kernel window
-    // Note: num_elems_processed_per_iteration will be set in validate_and_configure_window()
-    unsigned int num_elems_processed_per_iteration = 1;
-    auto         win_config                        = validate_and_configure_window(input->info(), output->info(), num_elems_processed_per_iteration, mult_transpose1xW_width);
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    ICLKernel::configure_internal(win_config.second);
-
-    // Create build options
-    CLBuildOptions build_opts;
-    build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input->info()->element_size()));
-    build_opts.add_option("-DTRANSPOSE_W=" + support::cpp11::to_string(num_elems_processed_per_iteration));
-    build_opts.add_option("-DMULT_TRANSPOSE1XW_WIDTH=" + support::cpp11::to_string(mult_transpose1xW_width));
-
-    /*
-     * Following an example of how the transposition1xW works when the input data type is F32
-     *
-     *         |a00 a01 a02 a03|
-     *         |a10 a11 a12 a13|
-     *         |a20 a21 a22 a23| = | a00 a01 a02 a03 || a10 a11 a12 a13 || a20 a21 a22 a23 || a30 a31 a32 a33 |
-     *         |a30 a31 a32 a33|
-     *
-     * The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor) * mult_transpose1xW_width
-     */
-    // Create kernel
-    std::string kernel_name = "gemm_transpose1xW";
-    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
-}
-
-Status CLGEMMTranspose1xWKernel::validate(const ITensorInfo *input, const ITensorInfo *output, int mult_transpose1xW_width)
-{
-    unsigned int num_elems_processed_per_iteration = 1;
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mult_transpose1xW_width));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), num_elems_processed_per_iteration, mult_transpose1xW_width).first);
-
-    return Status{};
-}
-
-void CLGEMMTranspose1xWKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
-    // Output is transposed
-    Window out_window(window);
-    out_window.set(Window::DimX, window.y());
-    out_window.set(Window::DimY, window.x());
-
-    Window in_slice  = window.first_slice_window_3D();
-    Window out_slice = out_window.first_slice_window_3D();
-
-    do
-    {
-        unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _input, in_slice);
-        add_3D_tensor_argument(idx, _output, out_slice);
-        enqueue(queue, *this, in_slice, lws_hint());
-    }
-    while(window.slide_window_slice_3D(in_slice) && out_window.slide_window_slice_3D(out_slice));
-}
diff --git a/src/runtime/CL/functions/CLGEMMInterleave4x4.cpp b/src/runtime/CL/functions/CLGEMMInterleave4x4.cpp
deleted file mode 100644
index 45547e4cb6..0000000000
--- a/src/runtime/CL/functions/CLGEMMInterleave4x4.cpp
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLGEMMInterleave4x4.h"
-
-#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h"
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute;
-
-void CLGEMMInterleave4x4::configure(const ICLTensor *input, ICLTensor *output)
-{
-    auto k = arm_compute::support::cpp14::make_unique<CLGEMMInterleave4x4Kernel>();
-    k->configure(input, output);
-    _kernel = std::move(k);
-}
diff --git a/src/runtime/CL/functions/CLGEMMTranspose1xW.cpp b/src/runtime/CL/functions/CLGEMMTranspose1xW.cpp
deleted file mode 100644
index d054e01611..0000000000
--- a/src/runtime/CL/functions/CLGEMMTranspose1xW.cpp
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLGEMMTranspose1xW.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h"
-#include "arm_compute/core/Types.h"
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute;
-
-void CLGEMMTranspose1xW::configure(const ICLTensor *input, ICLTensor *output)
-{
-    auto k = arm_compute::support::cpp14::make_unique<CLGEMMTranspose1xWKernel>();
-    k->configure(input, output);
-    _kernel = std::move(k);
-}
-\ No newline at end of file
diff --git a/tests/validation/CL/GEMM.cpp b/tests/validation/CL/GEMM.cpp
index 2bebc90d7f..8fce006dcb 100644
--- a/tests/validation/CL/GEMM.cpp
+++ b/tests/validation/CL/GEMM.cpp
@@ -21,8 +21,6 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/CLTensorAllocator.h"
@@ -38,8 +36,6 @@
 #include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/GEMMFixture.h"
-#include "tests/validation/fixtures/GEMMInterleave4x4Fixture.h"
-#include "tests/validation/fixtures/GEMMTranspose1xWFixture.h"
 
 namespace arm_compute
 {
@@ -69,20 +65,6 @@ const auto data_transpose = framework::dataset::make("M", 8, 14) * framework::da
 TEST_SUITE(CL)
 TEST_SUITE(GEMM)
 
-TEST_SUITE(INTERLEAVE_4X4)
-using CLGEMMInterleave4x4 = CLSynthetizeFunctionWithZeroConstantBorder<CLGEMMInterleave4x4Kernel, 4>;
-
-TEST_SUITE(FP32)
-using CLGEMMInterleave4x4Fixture = GEMMInterleave4x4ValidationFixture<CLTensor, CLAccessor, CLGEMMInterleave4x4, float>;
-FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMInterleave4x4Fixture, framework::DatasetMode::PRECOMMIT, data_interleave * framework::dataset::make("DataType", DataType::F32))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference);
-}
-TEST_SUITE_END() // FP32
-
-TEST_SUITE_END() // INTERLEAVE_4X4
-
 template <typename T>
 using CLGEMMFixture = GEMMValidationFixture<CLTensor, CLAccessor, CLGEMM, T>;
 
@@ -92,19 +74,6 @@ using CLGEMMOutput3DFixture = GEMMValidationFixture<CLTensor, CLAccessor, CLGEMM
 template <typename T>
 using CLGEMMInputOutput3DFixture = GEMMValidationFixture<CLTensor, CLAccessor, CLGEMM, T, false, true, true>;
 
-TEST_SUITE(TRANSPOSE_1XW)
-using CLGEMMTranspose1xW        = CLSynthetizeFunctionWithZeroConstantBorder<CLGEMMTranspose1xWKernel, 4>;
-using CLGEMMTranspose1xWFixture = GEMMTranspose1xWValidationFixture<CLTensor, CLAccessor, CLGEMMTranspose1xW, float>;
-TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMTranspose1xWFixture, framework::DatasetMode::PRECOMMIT, data_transpose * framework::dataset::make("DataType", DataType::F32))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference);
-}
-TEST_SUITE_END() // FP32
-
-TEST_SUITE_END() //TRANSPOSE_1XW
-
 TEST_SUITE(Float)
 TEST_SUITE(FP16)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallGEMMDataset(),