From 088d63aae947efd8bbcfd4d27c1f50a6af79e3b9 Mon Sep 17 00:00:00 2001
From: Gian Marco Iodice
Date: Tue, 11 Aug 2020 14:14:06 +0100
Subject: COMPMID-3337: Remove write paddings in both axes from
 CLGEMMMatrixMultiplyReshapedKernel

- Change the interface of STORE_BLOCK_BOUNDARY_AWARE, passing the conditions
  on Y and X rather than the x/y coordinates. This allows the macro to be
  used with both GEMM reshaped and GEMM reshaped rhs only
- Remove padding from the output tensor of CLGEMMMatrixMultiplyReshapedKernel
- Add tests for validating the zero padding requirement

Change-Id: I13263cc71ce065c5be34ed198def320dd5823495
Signed-off-by: Gian Marco Iodice
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3712
Tested-by: Arm Jenkins
Reviewed-by: SiCong Li
Comments-Addressed: Arm Jenkins
---
 src/core/CL/cl_kernels/gemm.cl                     | 68 +++++++++++----
 src/core/CL/cl_kernels/gemm_helpers.h              | 98 +++++++++++-----------
 .../kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp | 21 +++--
 .../CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp  |  2 +-
 tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp | 90 ++++++++++++++++++++
 5 files changed, 204 insertions(+), 75 deletions(-)
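At its core this is a contract change: STORE_BLOCK_BOUNDARY_AWARE no longer
derives the boundary checks from the x/y block coordinates; every call site
now passes two precomputed booleans (cond_y, cond_x), so kernels with
different notions of which block is partial can share one macro. A minimal
C++ model of the new contract (illustrative names only, not the library's
OpenCL macros):

    #include <cstdio>

    // Model of the new STORE_BLOCK_BOUNDARY_AWARE contract: the caller computes
    // the two boundary conditions, the store only picks the block sizes.
    static void store_block_boundary_aware(int m0, int n0, int partial_m0, int partial_n0,
                                           bool cond_y, bool cond_x)
    {
        const int rows = cond_y ? partial_m0 : m0; // partial block in y?
        const int cols = cond_x ? partial_n0 : n0; // partial block in x?
        std::printf("store %dx%d block\n", rows, cols);
    }

    int main()
    {
        // GEMM reshaped convention (see the kernel hunks below): the boundary
        // block in y is the last one, so cond_y checks (y + 1) * M0 >= M.
        const int M = 103, N = 41, M0 = 4, N0 = 16;
        for(int y = 0; y * M0 < M; ++y)
            for(int x = 0; x * N0 < N; ++x)
                store_block_boundary_aware(M0, N0, M % M0, N % N0,
                                           (y + 1) * M0 >= M, (x + 1) * N0 >= N);
    }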
diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl
index 8f5f8e3d07..4ad22ec830 100644
--- a/src/core/CL/cl_kernels/gemm.cl
+++ b/src/core/CL/cl_kernels/gemm.cl
@@ -1278,8 +1278,11 @@ __kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),
     ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
 #endif // defined(ACTIVATION_TYPE)
 
+    const bool cond_y = y == 0;
+    const bool cond_x = ((x + 1) * N0 >= N);
+
     // Store output block
-    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, y, x);
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, cond_y, cond_x);
 
 #undef RHS_BLOCK_SIZE
 #undef RHS_OFFSET_X
@@ -1621,8 +1624,11 @@ __kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs),
     ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
 #endif // defined(ACTIVATION_TYPE)
 
+    const bool cond_y = y == 0;
+    const bool cond_x = ((x + 1) * N0 >= N);
+
     // Store output block
-    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, y, x);
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, cond_y, cond_x);
 
 #undef RHS_BLOCK_SIZE
 #undef RHS_OFFSET_X
@@ -2014,8 +2020,11 @@ __kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs),
     ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
 #endif // defined(ACTIVATION_TYPE)
 
+    const bool cond_y = y == 0;
+    const bool cond_x = ((x + 1) * N0 >= N);
+
     // Store output block
-    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, y, x);
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, cond_y, cond_x);
 
 #undef RHS_BLOCK_SIZE
 #undef RHS_OFFSET_X
@@ -2320,8 +2329,11 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs),
     ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
 #endif // defined(ACTIVATION_TYPE)
 
+    const bool cond_y = y == 0;
+    const bool cond_x = ((x + 1) * N0 >= N);
+
     // Store output block
-    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, y, x);
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, cond_y, cond_x);
 
 #undef RHS_BLOCK_SIZE
 #undef RHS_OFFSET_X
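The four rhs-only hunks above use cond_y = (y == 0) rather than a last-block
check: with boundary-aware stores, each block's start row is expected to come
from COMPUTE_M0_START_ROW, which shifts every block after the first so that
the partial M0 block is processed first in y. A C++ sketch of that indexing;
the shifted formula is an assumption based on the PARTIAL_STORE_M0 variant of
the macro (only its #else variant appears later in this patch):

    #include <algorithm>
    #include <cstdio>

    // Assumed model of COMPUTE_M0_START_ROW when PARTIAL_STORE_M0 is defined:
    // block 0 handles the leftover rows, later blocks are shifted up.
    static unsigned int compute_m0_start_row(unsigned int y, unsigned int m0, unsigned int partial_m0)
    {
        return static_cast<unsigned int>(
            std::max(0, static_cast<int>(y * m0) - static_cast<int>((m0 - partial_m0) % m0)));
    }

    int main()
    {
        const unsigned int M = 10, M0 = 4, PARTIAL_M0 = M % M0; // 2 leftover rows
        for(unsigned int y = 0; y < (M + M0 - 1) / M0; ++y)     // ceil(M / M0) blocks
        {
            std::printf("block y=%u starts at row %u and stores %u rows\n",
                        y, compute_m0_start_row(y, M0, PARTIAL_M0), y == 0 ? PARTIAL_M0 : M0);
        }
    }

With M = 10 and M0 = 4 this prints blocks starting at rows 0, 2 and 6,
storing 2, 4 and 4 rows: the only partial block is the first one, hence
cond_y = (y == 0).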
@@ -2519,6 +2531,8 @@ __kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs),
  * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
  * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must passed at compile time.
  * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
  * @note Only the following configurations of M0, N0 and K0 are currently supported:
  *  - M0 = 2, 3, 4, 5, 6, 7, 8
  *  - N0 = 2, 3, 4, 8, 16
@@ -2755,11 +2769,15 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
 #endif // defined(MIXED_PRECISION)
 #endif // defined(ACTIVATION_TYPE)
 
+    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
+    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
+
     // Store output block
 #if defined(MIXED_PRECISION)
-    CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
+    CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, cond_y, cond_x);
 #else // defined(MIXED_PRECISION)
-    STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, cond_y, cond_x);
 #endif // defined(MIXED_PRECISION)
 
 #undef LHS_BLOCK_SIZE
@@ -2791,6 +2809,8 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
  * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
  * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must passed at compile time.
  * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
  * @note Only the following configurations of M0, N0 and K0 are currently supported:
  *  - M0 = 2, 3, 4, 5, 6, 7, 8
  *  - N0 = 4, 8, 16
@@ -3019,11 +3039,15 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture(IMAGE_DECLARATION(lhs),
 #endif // defined(MIXED_PRECISION)
 #endif // defined(ACTIVATION_TYPE)
 
+    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
+    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
+
     // Store output block
 #if defined(MIXED_PRECISION)
-    CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
+    CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, cond_y, cond_x);
 #else // defined(MIXED_PRECISION)
-    STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, cond_y, cond_x);
 #endif // defined(MIXED_PRECISION)
 
 #undef LHS_BLOCK_SIZE
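In the mixed-precision hunks above, the fused CONVERT_STORE_BLOCK cannot
express a partial store, so the patch splits it: CONVERT_BLOCK first narrows
the whole accumulator block into c_lp, then the same boundary-aware store as
the non-mixed path writes only the valid region. A C++ sketch of that
two-step shape (types simplified; the kernel converts to half):

    #include <cstddef>
    #include <vector>

    // rows/cols are M0/N0 for interior blocks, PARTIAL_STORE_M0/PARTIAL_STORE_N0
    // at the boundary; n0 is the row pitch of the accumulator block.
    static void convert_then_store_block(const std::vector<float> &c, int n0, int rows, int cols,
                                         std::vector<float> &dst, std::size_t dst_stride)
    {
        std::vector<float> c_lp(c.begin(), c.end()); // CONVERT_BLOCK (narrowing in the kernel)
        for(int i = 0; i < rows; ++i)
        {
            for(int j = 0; j < cols; ++j)
            {
                dst[i * dst_stride + j] = c_lp[i * n0 + j]; // boundary-aware partial store
            }
        }
    }

    int main()
    {
        std::vector<float> c(4 * 16, 1.0f), dst(3 * 9, 0.0f);
        convert_then_store_block(c, 16, 3, 9, dst, 9); // boundary block: 3x9 of a 4x16 tile
    }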
@@ -3160,6 +3184,8 @@ __kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture(IMAGE_DECLARATION(lhs),
  * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
  * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must passed at compile time.
  * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
  * @note Only the following configurations of M0, N0 and K0 are currently supported:
  *  - M0 = 2, 3, 4, 8
  *  - N0 = 2, 3, 4, 8, 16
@@ -3480,7 +3506,8 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs),
 #endif // defined(MIXED_PRECISION)
 #else // defined(BROADCAST_BIAS)
 
-    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * bias_stride_y) + z * bias_stride_z;
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(
+                                    2) * bias_stride_z;
 
     LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
 
@@ -3506,11 +3533,15 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs),
 #endif // defined(MIXED_PRECISION)
 #endif // defined(ACTIVATION_TYPE)
 
+    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
+    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
+
     // Store output block
 #if defined(MIXED_PRECISION)
-    CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
+    CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, cond_y, cond_x);
 #else // defined(MIXED_PRECISION)
-    STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, cond_y, cond_x);
 #endif // defined(MIXED_PRECISION)
 
 #undef LHS_BLOCK_SIZE
@@ -3538,6 +3569,8 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs),
  * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
  * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must passed at compile time.
  * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
  * @note Only the following configurations of M0, N0 and K0 are currently supported:
  *  - M0 = 2, 3, 4, 8
  *  - N0 = 4, 8, 16
@@ -3867,11 +3900,15 @@ __kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture(IMAGE_DECLARATION(lhs),
 #endif // defined(MIXED_PRECISION)
 #endif // defined(ACTIVATION_TYPE)
 
+    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
+    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
+
     // Store output block
 #if defined(MIXED_PRECISION)
-    CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
+    CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, cond_y, cond_x);
 #else // defined(MIXED_PRECISION)
-    STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, cond_y, cond_x);
 #endif // defined(MIXED_PRECISION)
 
 #undef LHS_BLOCK_SIZE
@@ -4246,8 +4283,11 @@ __kernel void gemm_mm_native(IMAGE_DECLARATION(lhs),
     ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
 #endif // defined(ACTIVATION_TYPE)
 
+    const bool cond_y = y == 0;
+    const bool cond_x = ((x + 1) * N0 >= N);
+
     // Store output block
-    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, y, x);
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, cond_y, cond_x);
 
 #undef RHS_BLOCK_SIZE
 #undef RHS_OFFSET_X
diff --git a/src/core/CL/cl_kernels/gemm_helpers.h b/src/core/CL/cl_kernels/gemm_helpers.h
index fada0302ff..6f6edc1bcf 100644
--- a/src/core/CL/cl_kernels/gemm_helpers.h
+++ b/src/core/CL/cl_kernels/gemm_helpers.h
@@ -1050,27 +1050,25 @@
  * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0)
  * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0)
  * @param[in] N                Total number of columns. Used to detect if current block is at the boundary in x.
- * @param[in] y                Global id of current block in y. Used to detect if current block is at the boundary in y.
- * @param[in] x                Global id of current block in x. Used to detect if current block is at the boundary in x.
+ * @param[in] PARTIAL_COND_Y   Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0.
+ * @param[in] PARTIAL_COND_X   Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0.
  */
-#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, y, x) \
-    bool at_y_boundary = y == 0; \
-    bool at_x_boundary = (x + 1) * N0 >= N; \
-    if(!at_y_boundary && !at_x_boundary) \
-    { \
-        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
-    } \
-    else if(at_y_boundary && !at_x_boundary) \
-    { \
-        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
-    } \
-    else if(!at_y_boundary && at_x_boundary) \
-    { \
-        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
-    } \
-    else \
-    { \
-        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
+#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, PARTIAL_COND_Y, PARTIAL_COND_X) \
+    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \
+    { \
+        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
+    } \
+    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \
+    { \
+        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
+    } \
+    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \
+    { \
+        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
+    } \
+    else \
+    { \
+        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
     }
 /** Store a block that can only be partial in x but not y.
  *
@@ -1090,17 +1088,16 @@
  * @param[in] STRIDE_Y         The stride value in y-axis direction
  * @param[in] Z                The offset in z-axis direction
  * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0)
  * @param[in] N                Total number of columns. Used to detect if current block is at the boundary in x.
- * @param[in] x                Global id of current block in x. Used to detect if current block is at the boundary in x.
+ * @param[in] PARTIAL_COND_X   Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0.
  */
-#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, N, x) \
-    bool at_x_boundary = (x + 1) * N0 >= N; \
-    if(!at_x_boundary) \
-    { \
-        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
-    } \
-    else \
-    { \
-        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
+#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, N, PARTIAL_COND_X) \
+    if(!(PARTIAL_COND_X)) \
+    { \
+        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
+    } \
+    else \
+    { \
+        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
     }
 /** Store a block that can only be partial in y but not x.
  *
@@ -1119,17 +1116,16 @@
  * @param[in] STRIDE_Y         The stride value in y-axis direction
  * @param[in] Z                The offset in z-axis direction
  * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0)
- * @param[in] y                Global id of current block in y. Used to detect if current block is at the boundary in y.
+ * @param[in] PARTIAL_COND_Y   Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0.
  */
-#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, y) \
-    bool at_y_boundary = y == 0; \
-    if(!at_y_boundary) \
-    { \
-        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
-    } \
-    else \
-    { \
-        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
+#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
+    if(!(PARTIAL_COND_Y)) \
+    { \
+        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
+    } \
+    else \
+    { \
+        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
     }
 /** @} */ // end of group STORE_BLOCK_PARTIAL
 
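The hunk that follows specializes STORE_BLOCK_BOUNDARY_AWARE into four
compile-time cases keyed on PARTIAL_STORE_M0/PARTIAL_STORE_N0, so shapes with
no partial blocks pay no branch cost at all. A C++17 analogue of that case
split, with if constexpr standing in for the preprocessor conditionals
(illustrative only):

    #include <cstdio>

    // Compile-time selection mirroring Case1-Case4 below: when a partial size
    // is 0 there is nothing to special-case on that axis.
    template <int M0, int N0, int PARTIAL_M0, int PARTIAL_N0>
    void store_block_boundary_aware(bool cond_y, bool cond_x)
    {
        if constexpr(PARTIAL_M0 == 0 && PARTIAL_N0 == 0)
            std::printf("Case1: full %dx%d store, no boundary branches\n", M0, N0);
        else if constexpr(PARTIAL_M0 > 0 && PARTIAL_N0 == 0)
            std::printf("Case2: %dx%d store\n", cond_y ? PARTIAL_M0 : M0, N0);
        else if constexpr(PARTIAL_M0 == 0 && PARTIAL_N0 > 0)
            std::printf("Case3: %dx%d store\n", M0, cond_x ? PARTIAL_N0 : N0);
        else
            std::printf("Case4: %dx%d store\n", cond_y ? PARTIAL_M0 : M0, cond_x ? PARTIAL_N0 : N0);
    }

    int main()
    {
        store_block_boundary_aware<4, 16, 0, 0>(false, false); // M, N multiples of the block
        store_block_boundary_aware<4, 16, 3, 9>(true, true);   // boundary block of M=103, N=41
    }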
@@ -1788,35 +1784,35 @@
  * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported: [0, @p M0)
  * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported: [0, @p N0)
  * @param[in] N                Total number of columns. Used to detect if current block is at the boundary in x.
- * @param[in] y                Global id of current block in y. Used to detect if current block is at the boundary in y.
- * @param[in] x                Global id of current block in x. Used to detect if current block is at the boundary in x.
+ * @param[in] PARTIAL_COND_Y   Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0.
+ * @param[in] PARTIAL_COND_X   Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0.
  * @{
  */
 #if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
 // Case1: No partial blocks in either x or y
-#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, y, x) \
+#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, PARTIAL_COND_Y, PARTIAL_COND_X) \
     STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
 
 #elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
 // Case2: Partial blocks in y
-#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, y, x) \
-    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, y)
+#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, PARTIAL_COND_Y, PARTIAL_COND_X) \
+    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)
 
 #elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
 // Case3: Partial blocks in x
-#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, y, x) \
-    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, N, x)
+#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, PARTIAL_COND_Y, PARTIAL_COND_X) \
+    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, N, PARTIAL_COND_X)
 
 #else // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
 // Case4: Partial blocks in both x and y
-#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, y, x) \
-    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, y, x)
+#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, PARTIAL_COND_Y, PARTIAL_COND_X) \
+    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, PARTIAL_COND_Y, PARTIAL_COND_X)
 
 #endif // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
 
 #else // defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
 
-#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, y, x) \
+#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, PARTIAL_COND_Y, PARTIAL_COND_X) \
     STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
 
 #endif // defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
@@ -1845,4 +1841,4 @@
 #define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
     ((uint)(y * M0))
 #endif // defined(PARTIAL_STORE_M0)
-/** @} */ // end of group COMPUTE_M0_START_ROW
\ No newline at end of file
+/** @} */ // end of group COMPUTE_M0_START_ROW
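The host-side change below derives the two new build options directly from
the tensor shapes, so no write padding is needed on the output. A small C++
sketch of the derivation, using one of the shapes from the new tests
(M=103, N=41, m0=4, n0=16):

    #include <cstdio>
    #include <string>

    int main()
    {
        // Mirrors the diff below: partial sizes are plain modulo arithmetic.
        const unsigned int internal_m = 103, n = 41, m0 = 4, n0 = 16;
        const unsigned int partial_store_m0 = internal_m % m0; // 3
        const unsigned int partial_store_n0 = n % n0;          // 9
        const std::string opts = "-DPARTIAL_STORE_M0=" + std::to_string(partial_store_m0) +
                                 " -DPARTIAL_STORE_N0=" + std::to_string(partial_store_n0);
        std::printf("%s\n", opts.c_str()); // -DPARTIAL_STORE_M0=3 -DPARTIAL_STORE_N0=9
    }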
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp
index 5a46a1e013..8f20de1ea1 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp
@@ -159,23 +159,18 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITe
     num_elems_processed_per_iteration_x = rhs_info.n0;
     num_elems_processed_per_iteration_y = lhs_info.m0;
 
-    // Note: bottom paddings are calculated manually as the output can be reinterpreted as 3D tensor
-    // The only way to set properly the paddings, it is to set those explicitly through the AccessWindowStatic
-    const unsigned int m          = gemm_info.m;
-    const unsigned int bottom_pad = (num_elems_processed_per_iteration_y - (m % num_elems_processed_per_iteration_y)) % num_elems_processed_per_iteration_y;
-
     win     = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
     win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
 
     AccessWindowStatic input0_access(input0, 0, 0,
-                                     ceil_to_multiple(input0->dimension(0), num_elems_processed_per_iteration_y),
+                                     input0->dimension(0),
                                      input0->dimension(1));
     AccessWindowStatic input1_access(input1, 0, 0,
-                                     ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x),
+                                     input1->dimension(0),
                                      input1->dimension(1));
     AccessWindowStatic output_access(output, 0, 0,
-                                     ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration_x),
-                                     output->dimension(1) + bottom_pad);
+                                     output->dimension(0),
+                                     output->dimension(1));
 
     if(input2 != nullptr)
     {
@@ -256,6 +251,12 @@ void CLGEMMMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi
     const bool     enable_mixed_precision = gemm_info.fp_mixed_precision;
     const DataType data_type              = input0->info()->data_type();
 
+    // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
+    const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m : output->info()->dimension(1);
+
+    const unsigned int partial_store_m0 = internal_m % lhs_info.m0;
+    const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0;
+
     // Create build options
     CLBuildOptions build_opts;
     build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha));
@@ -286,6 +287,8 @@ void CLGEMMMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi
     build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0));
     build_opts.add_option("-DV0=" + support::cpp11::to_string(lhs_info.v0));
     build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
+    build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
+    build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
 
     std::string kernel_name("gemm_mm_reshaped_");
     kernel_name += lhs_info.transpose ? "lhs_t_" : "lhs_nt_";
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp
index e65726b234..cf77c70bfa 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp
@@ -162,7 +162,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITe
                                      input0->dimension(0),
                                      input0->dimension(1));
     AccessWindowStatic input1_access(input1, 0, 0,
-                                     ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x),
+                                     input1->dimension(0),
                                      input1->dimension(1));
     AccessWindowStatic output_access(output, 0, 0,
                                      output->dimension(0),
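The test diff that follows exercises each boundary case exactly once. The
partial sizes each (M, N, M0, N0) dataset row produces can be checked with
the same modulo arithmetic the kernel configuration uses; a C++ sketch:

    #include <cstdio>

    // The (M, N, M0, N0) rows of the new ValidateZeroPadding dataset (below),
    // and the partial block sizes each row exercises.
    int main()
    {
        const unsigned int rows[5][4] = {
            { 24, 48, 4, 4 },   // no partial blocks
            { 64, 29, 8, 4 },   // partial in x only
            { 101, 16, 4, 16 }, // partial in y only
            { 1, 121, 2, 2 },   // partial in both
            { 103, 41, 4, 16 }, // partial_n0 == 9 special case
        };
        for(const auto &r : rows)
            std::printf("M=%u N=%u -> PARTIAL_STORE_M0=%u, PARTIAL_STORE_N0=%u\n",
                        r[0], r[1], r[0] % r[2], r[1] % r[3]);
    }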
diff --git a/tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp b/tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp
index 309a967abc..d7853f3ea7 100644
--- a/tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp
+++ b/tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp
@@ -170,11 +170,101 @@ const auto broadcast_bias_values = framework::dataset::make("broadcast_bias", {
 
 /** LHS transposed values */
 const auto lhs_transpose_values = framework::dataset::make("lhs_transpose", { false, true } );
+
+/** Zero padding test */
+bool validate_zero_padding(unsigned int m_value, unsigned int n_value, unsigned int k_value, unsigned int b_value,
+                           unsigned int m0_value, unsigned int n0_value, unsigned int k0_value, unsigned int h0_value,
+                           bool i_value_rhs, bool t_value_rhs, bool export_to_cl_image, bool broadcast_bias, unsigned int depth_output_gemm3d, const ActivationLayerInfo &act_info,
+                           DataType dt_input0, DataType dt_input1, DataType dt_input2, DataType dt_output, float alpha, float beta)
+{
+    const unsigned int M = m_value;
+    const unsigned int N = n_value;
+    const unsigned int K = k_value;
+
+    GEMMLHSMatrixInfo lhs_info;
+    lhs_info.m0 = m0_value;
+    lhs_info.k0 = k0_value;
+
+    GEMMRHSMatrixInfo rhs_info;
+    rhs_info.n0                 = n0_value;
+    rhs_info.k0                 = k0_value;
+    rhs_info.h0                 = h0_value;
+    rhs_info.interleave         = i_value_rhs;
+    rhs_info.transpose          = t_value_rhs;
+    rhs_info.export_to_cl_image = export_to_cl_image;
+
+    GEMMKernelInfo kernel_info;
+    kernel_info.m                       = M;
+    kernel_info.n                       = N;
+    kernel_info.k                       = K;
+    kernel_info.depth_output_gemm3d     = depth_output_gemm3d;
+    kernel_info.reinterpret_input_as_3d = false;
+    kernel_info.broadcast_bias          = broadcast_bias;
+    kernel_info.activation_info         = act_info;
+
+    const TensorShape lhs_shape(K, M, b_value);
+    const TensorShape rhs_shape(N, K, b_value);
+    const TensorShape lhs_shape_reshaped = compute_lhs_reshaped_shape(TensorInfo(lhs_shape, 1, dt_input0),
+                                                                      lhs_info);
+    const TensorShape rhs_shape_reshaped = compute_rhs_reshaped_shape(TensorInfo(rhs_shape, 1, dt_input1),
+                                                                      rhs_info);
+
+    const TensorShape dst_shape = compute_mm_shape(TensorInfo(lhs_shape_reshaped, 1, dt_input0),
+                                                   TensorInfo(rhs_shape_reshaped, 1, dt_input1),
+                                                   kernel_info);
+
+    const TensorShape bias_shape(N,
+                                 M, // Correct calculation should be: broadcast_bias? 1 : M, it's wrong here on purpose just for validation test
+                                 broadcast_bias? 1 : b_value);
+
+    // Create tensors
+    CLTensor lhs_reshaped = create_tensor<CLTensor>(lhs_shape_reshaped, dt_input0);
+    CLTensor rhs_reshaped = create_tensor<CLTensor>(rhs_shape_reshaped, dt_input1);
+    CLTensor bias         = create_tensor<CLTensor>(bias_shape, dt_input2);
+    CLTensor dst          = create_tensor<CLTensor>(dst_shape, dt_output);
+
+    ARM_COMPUTE_EXPECT(lhs_reshaped.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(rhs_reshaped.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+    // Validate zero-padding
+    CLGEMMMatrixMultiplyReshaped gemm;
+
+    gemm.configure(&lhs_reshaped, &rhs_reshaped, &bias, &dst, alpha, beta, lhs_info, rhs_info, kernel_info);
+
+    // Padding can be added along rhs and bias's X/Y dimension
+    return dst.info()->padding().empty() && lhs_reshaped.info()->padding().empty();
+}
 } // namespace
 
 TEST_SUITE(CL)
 TEST_SUITE(GEMMMatrixMultiplyReshaped)
 
+/** Validate zero padding tests
+ *
+ * A series of validation tests to check the zero padding requirement
+ *
+ * Checks performed in order:
+ *     - No partial blocks in both x and y dimensions
+ *     - Partial blocks in x dimension
+ *     - Partial blocks in y dimension
+ *     - Partial blocks in both x and y dimensions
+ *     - Special case: partial_n0 == 9 (vstore1 should be invoked instead of vstore_partial_1)
+ */
+DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, zip(zip(zip(
+framework::dataset::make("M",  { 24, 64, 101, 1, 103 }),
+framework::dataset::make("N",  { 48, 29, 16, 121, 41 })),
+framework::dataset::make("M0", { 4, 8, 4, 2, 4 })),
+framework::dataset::make("N0", { 4, 4, 16, 2, 16 })),
+m_value, n_value, m0_value, n0_value)
+{
+    constexpr DataType dt = DataType::F32;
+
+    bool status = validate_zero_padding(m_value, n_value, 23, 1, m0_value, n0_value, 4, 1, false, false, false, 0, 0, ActivationLayerInfo(), dt, dt, dt, dt, 1.0f, 1.0f);
+    ARM_COMPUTE_EXPECT(status, framework::LogLevel::ERRORS);
+}
+
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip(
-- 
cgit v1.2.1