7 files changed, 149 insertions, 40 deletions
diff --git a/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h b/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h
index 2520eff5de..c0fef45afe 100644
--- a/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -47,7 +47,7 @@ class ICLTensor;
  * \end{array} \right)
  * @f]
  *
- * After this operation, the output matrix will have the following shape: [ height * 4, ceil(width / 4.0f) ]
+ * After this operation, the output matrix will have the following shape: [ height * W, ceil(width / W) ] where W = 4 * mult_interleave4x4_height
  */
 class CLGEMMInterleave4x4Kernel : public ICLKernel
 {
@@ -64,18 +64,20 @@ public:
     CLGEMMInterleave4x4Kernel &operator=(CLGEMMInterleave4x4Kernel &&) = default;
     /** Initialise the kernel's input and output.
      *
-     * @param[in]  input  Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
-     * @param[out] output Output tensor. Data type supported: same as @p input
+     * @param[in]  input                     Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+     * @param[out] output                    Output tensor. Data type supported: same as @p input
+     * @param[in]  mult_interleave4x4_height (Optional) Multiplication factor for the height of the 4x4 interleave block
      */
-    void configure(const ICLTensor *input, ICLTensor *output);
+    void configure(const ICLTensor *input, ICLTensor *output, int mult_interleave4x4_height = 1);
     /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMInterleave4x4Kernel
      *
-     * @param[in] input  Input tensor info. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
-     * @param[in] output Output tensor info which stores the interleaved matrix. Data type supported: same as @p input.
+     * @param[in] input                     Input tensor info. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+     * @param[in] output                    Output tensor info which stores the interleaved matrix. Data type supported: same as @p input.
+     * @param[in] mult_interleave4x4_height Multiplication factor for the height of the 4x4 interleave block
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, int mult_interleave4x4_height);
 
     // Inherited methods overridden
     void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h
index 4e73d7eb13..7260c4a4f6 100644
--- a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -58,8 +58,10 @@ public:
      * @param[out] output                    Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
      * @param[in]  alpha                     Weight of the matrix product
      * @param[in]  is_interleaved_transposed (Optional) True if input0 and input1 have been reshaped respectively using @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel
+     * @param[in]  reshape_info              (Optional) GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how the matrix A and matrix B have been reshaped
+     *
      */
-    void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha, bool is_interleaved_transposed = true);
+    void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha, bool is_interleaved_transposed = true, const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo());
     /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMMatrixMultiplyKernel
      *
      * @param[in] input0                    Input tensor containing the Matrix A. Data types supported: QS8/QS16/F16/F32
@@ -67,11 +69,13 @@ public:
      * @param[in] output                    Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
      * @param[in] alpha                     Weight of the matrix product
      * @param[in] is_interleaved_transposed True if input0 and input1 have been reshaped respectively using @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel
+     * @param[in] reshape_info              GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how the matrix A and matrix B have been reshaped
      * @param[in] gpu_target                GPU Target
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved_transposed, GPUTarget gpu_target);
+    static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info,
+                           GPUTarget gpu_target);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h b/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h
index 8721643c1e..9a3069eab6 100644
--- a/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -62,7 +62,7 @@ class ICLTensor;
  * \end{array} \right)
  * @f]
  *
- * @note The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor)
+ * @note The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor) * mult_transpose1xW_width
  *
  */
 class CLGEMMTranspose1xWKernel : public ICLSimple2DKernel
@@ -70,18 +70,20 @@ class CLGEMMTranspose1xWKernel : public ICLSimple2DKernel
 public:
     /** Initialise the kernel's input and output.
      *
-     * @param[in]  input  Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
-     * @param[out] output Output tensor. Data type supported: same as @p input
+     * @param[in]  input                   Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+     * @param[out] output                  Output tensor. Data type supported: same as @p input
+     * @param[in]  mult_transpose1xW_width (Optional) Multiplication factor for the width of the 1xW transposed block
      */
-    void configure(const ICLTensor *input, ICLTensor *output);
+    void configure(const ICLTensor *input, ICLTensor *output, int mult_transpose1xW_width = 1);
     /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMTranspose1xWKernel
      *
-     * @param[in] input  Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
-     * @param[in] output Output tensor. Data type supported: same as @p input.
+     * @param[in] input                   Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+     * @param[in] output                  Output tensor. Data type supported: same as @p input.
+     * @param[in] mult_transpose1xW_width Multiplication factor for the width of the 1xW transposed block
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, int mult_transpose1xW_width);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index 5402e358b5..5197000bf9 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2018 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -824,13 +824,95 @@ private:
     const unsigned int _num_kernels;
 };
 
-/** GEMM Information class. This class stores the necessary information to compute GEMM functions */
+/** GEMM reshape information class. This class stores the necessary information about matrix A and matrix B reshape.
+ *
+ * The matrix A can only be reshaped through @ref CLGEMMInterleave4x4Kernel or  @ref NEGEMMInterleave4x4Kernel or  @ref GCGEMMInterleave4x4Kernel
+ * Note: Optionally just for @ref CLGEMMInterleave4x4Kernel is it possible to set mult_interleave4x4_height, the multiplication factor for the height of the 4x4 interleaved block
+ *
+ * The matrix B can only be reshaped through @ref CLGEMMTranspose1xWKernel or  @ref NEGEMMTranspose1xWKernel or  @ref GCGEMMTranspose1xWKernel
+ * Note: Optionally just for @ref CLGEMMTranspose1xWKernel is it possible to set mult_transpose1xW_width, the multiplication factor for the width of the 1xW transposed block
+ *
+ */
+class GEMMReshapeInfo final
+{
+public:
+    /** Default constructor */
+    GEMMReshapeInfo()
+        : _m(1), _n(1), _k(1), _mult_transpose1xW_width(1), _mult_interleave4x4_height(1)
+    {
+    }
+    /** Constructor
+     *
+     * @param[in] m                         Number of matrix A rows
+     * @param[in] n                         Number of matrix B columns
+     * @param[in] k                         Number of matrix A columns or matrix B rows
+     * @param[in] mult_transpose1xW_width   (Optional) Multiplication factor for the width of the 1xW transposed block
+     * @param[in] mult_interleave4x4_height (Optional) Multiplication factor for the height of the 4x4 interleaved block
+     */
+    GEMMReshapeInfo(int m, int n, int k, int mult_transpose1xW_width = 1, int mult_interleave4x4_height = 1)
+        : _m(m), _n(n), _k(k), _mult_transpose1xW_width(mult_transpose1xW_width), _mult_interleave4x4_height(mult_interleave4x4_height)
+    {
+    }
+    /** Number of matrix A rows
+     *
+     * @return the number of matrix A rows
+     */
+    int m() const
+    {
+        return _m;
+    }
+    /** Number of matrix B columns
+     *
+     * @return the number of matrix B columns
+     */
+    int n() const
+    {
+        return _n;
+    }
+    /** Number of matrix A columns or matrix B rows
+     *
+     * @return the number of matrix A columns or matrix B rows
+     */
+    int k() const
+    {
+        return _k;
+    }
+    /** Multiplication factor for the width of the 1xW transposed block
+     *
+     * @return the multiplication factor for the width of the 1xW transposed block
+     */
+    int mult_transpose1xW_width() const
+    {
+        return _mult_transpose1xW_width;
+    }
+    /** Multiplication factor for the height of the 4x4 interleaved block
+     *
+     * @return the multiplication factor for the height of the 4x4 interleaved block
+     */
+    int mult_interleave4x4_height() const
+    {
+        return _mult_interleave4x4_height;
+    }
+
+private:
+    const int _m;
+    const int _n;
+    const int _k;
+    const int _mult_transpose1xW_width;
+    const int _mult_interleave4x4_height;
+};
+
+/** GEMM information class. This class stores the necessary information to compute GEMM functions
+ *
+ * This object also contains the information about how matrix A and matrix B have been reshaped
+ *
+ */
 class GEMMInfo
 {
 public:
     /** Default constructor */
     GEMMInfo()
-        : _is_a_reshaped(false), _is_b_reshaped(false), _reshape_b_only_on_first_run(false)
+        : _is_a_reshaped(false), _is_b_reshaped(false), _reshape_b_only_on_first_run(false), _reshape_info()
     {
     }
     /** Constructor
@@ -838,9 +920,10 @@ public:
      * @param[in] is_a_reshaped               True if the matrix A has been reshaped
      * @param[in] is_b_reshaped               True if the matrix B has been reshaped
      * @param[in] reshape_b_only_on_first_run Reshape matrix B only for the first run
+     * @param[in] reshape_info                (Optional) GEMM reshape information object
      */
-    GEMMInfo(bool is_a_reshaped, bool is_b_reshaped, bool reshape_b_only_on_first_run)
-        : _is_a_reshaped(is_a_reshaped), _is_b_reshaped(is_b_reshaped), _reshape_b_only_on_first_run(reshape_b_only_on_first_run)
+    GEMMInfo(bool is_a_reshaped, bool is_b_reshaped, bool reshape_b_only_on_first_run, const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo())
+        : _is_a_reshaped(is_a_reshaped), _is_b_reshaped(is_b_reshaped), _reshape_b_only_on_first_run(reshape_b_only_on_first_run), _reshape_info(reshape_info)
     {
     }
     /** Flag which specifies if the matrix A has been reshaped
@@ -869,11 +952,20 @@ public:
     {
         return _reshape_b_only_on_first_run;
     };
+    /** GEMMReshapeInfo object which stores the necessary information to understand how the matrix A and matrix B have been reshaped
+     *
+     * @return the GEMMReshapeInfo object
+     */
+    const GEMMReshapeInfo &reshape_info() const
+    {
+        return _reshape_info;
+    }
 
 private:
-    const bool _is_a_reshaped;
-    const bool _is_b_reshaped;
-    const bool _reshape_b_only_on_first_run;
+    const bool      _is_a_reshaped;
+    const bool      _is_b_reshaped;
+    const bool      _reshape_b_only_on_first_run;
+    GEMMReshapeInfo _reshape_info;
 };
 
 /** IO formatting information class*/
diff --git a/arm_compute/core/utils/misc/ShapeCalculator.h b/arm_compute/core/utils/misc/ShapeCalculator.h
index 61834b88a9..6ecfdf0323 100644
--- a/arm_compute/core/utils/misc/ShapeCalculator.h
+++ b/arm_compute/core/utils/misc/ShapeCalculator.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, 2018 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -39,12 +39,14 @@ inline TensorShape compute_permutation_output_shape(const ITensorInfo &input, co
     permute(output_shape, perm);
     return output_shape;
 }
-inline TensorShape compute_interleaved_shape(const ITensorInfo &a)
+inline TensorShape compute_interleaved_shape(const ITensorInfo &a, int mult_interleave4x4_height = 1)
 {
-    // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
+    // The interleaved output matrix will have the following shape: [ a_height * W, ceil(a_width / W) ] where W = 4 * mult_interleave4x4_height
+    ARM_COMPUTE_ERROR_ON(mult_interleave4x4_height < 1);
+    const int   interleave_width = 4 * mult_interleave4x4_height;
     TensorShape shape_interleaved_a{ a.tensor_shape() };
-    shape_interleaved_a.set(0, a.dimension(0) * 4);
-    shape_interleaved_a.set(1, std::ceil(a.dimension(1) / 4.f));
+    shape_interleaved_a.set(0, a.dimension(0) * interleave_width);
+    shape_interleaved_a.set(1, std::ceil(a.dimension(1) / static_cast<float>(interleave_width)));
 
     return shape_interleaved_a;
 }
@@ -57,12 +59,14 @@ inline TensorShape compute_transpose1xW_shape(const ITensorInfo &b)
 
     return shape_transposed1xW_b;
 }
-inline TensorShape compute_transpose1xW_with_element_size_shape(const ITensorInfo &b)
+inline TensorShape compute_transpose1xW_with_element_size_shape(const ITensorInfo &b, int mult_transpose1xW_width = 1)
 {
-    // The transpose1xW output matrix will have the following shape:
-    // [ b_height * (16 / element_size), ceil(b_width / (16.0f / element_size) ]
+    // Note: mult_transpose1xW_width expresses the number of chunks with size 1x(W) we want to store on the same row
+    //       The transpose1xW output matrix will have the following shape:
+    //       [ b_height * W, ceil(b_width / W) ] where W = (16 / element size of the tensor) * mult_transpose1xW_width
+    ARM_COMPUTE_ERROR_ON(mult_transpose1xW_width < 1);
     TensorShape  shape_transposed1xW_b{ b.tensor_shape() };
-    const size_t transpose_width = 16 / b.element_size();
+    const size_t transpose_width = (16 / b.element_size()) * mult_transpose1xW_width;
     shape_transposed1xW_b.set(0, b.dimension(1) * transpose_width);
     shape_transposed1xW_b.set(1, static_cast<size_t>(std::ceil(b.dimension(0) / static_cast<float>(transpose_width))));
 
diff --git a/arm_compute/graph/Graph.h b/arm_compute/graph/Graph.h
index ab1d8b8866..853b90df82 100644
--- a/arm_compute/graph/Graph.h
+++ b/arm_compute/graph/Graph.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,6 +24,7 @@
 #ifndef __ARM_COMPUTE_GRAPH_GRAPH_H__
 #define __ARM_COMPUTE_GRAPH_GRAPH_H__
 
+#include "arm_compute/core/CL/CLTypes.h"
 #include "arm_compute/graph/INode.h"
 #include "arm_compute/graph/ITensorObject.h"
 #include "arm_compute/graph/SubTensor.h"
@@ -67,9 +68,12 @@ public:
      * @param[in] tensor Tensor to add
      */
     void add_tensor_object(std::unique_ptr<ITensorObject> tensor);
-    /** Finalizes the current node's configuration
+    /** Check if the OpenCL target is available
      */
     static bool opencl_is_available();
+    /** Returns the GPU target
+     */
+    static GPUTarget gpu_target();
     /** Manually sets the output of the current node
      *
      * @param[in] tmp Output info to set
diff --git a/arm_compute/runtime/CL/functions/CLGEMM.h b/arm_compute/runtime/CL/functions/CLGEMM.h
index bf41226bda..0f144915d7 100644
--- a/arm_compute/runtime/CL/functions/CLGEMM.h
+++ b/arm_compute/runtime/CL/functions/CLGEMM.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -68,7 +68,8 @@ public:
      * @param[in]  alpha     Weight of the matrix product
      * @param[in]  beta      Weight of matrix C
      * @param[in]  gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
-     *                       if the reshape of matrix B should happen only for the first run
+     *                       if the reshape of matrix B should happen only for the first run. GEMMInfo also contains information about the reshaping
+     *                       in case matrix A and matrix B have been already transformed.
      */
     void configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo());