From e8bd2c729546e59aa0adc241976ea91fc6f25b52 Mon Sep 17 00:00:00 2001
From: Georgios Pinitas <georgios.pinitas@arm.com>
Date: Wed, 11 Jul 2018 15:54:56 +0100
Subject: COMPMID-1384: graph_mobilenet fails for NHWC on OpenCL

Makes GEMM3D account top padding when jumping among planes.

Change-Id: Ia7c16cfa5498de106774ce42cbc4716e9f43195b
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/139612
Reviewed-by: Giorgio Arena <giorgio.arena@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Tested-by: Jenkins <bsgcomp@arm.com>
---
 src/core/CL/cl_kernels/gemm.cl                     | 256 ++++++++++-----------
 src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp |   5 +-
 2 files changed, 131 insertions(+), 130 deletions(-)

diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl
index f75161ca0a..5a6efe64b9 100644
--- a/src/core/CL/cl_kernels/gemm.cl
+++ b/src/core/CL/cl_kernels/gemm.cl
@@ -188,7 +188,7 @@ __kernel void gemm_interleave4x4(TENSOR3D_DECLARATION(src),
  * @param[in]  src0_stride_z                      Stride of the source matrix in Z dimension (in bytes)
  * @param[in]  src1_stride_z                      Stride of the source matrix in Z dimension (in bytes)
  * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  pad_bottom                         Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ * @param[in]  cross_plane_pad                    Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
  */
 __kernel void gemm_mm_interleaved_transposed_f32(IMAGE_DECLARATION(src0),
                                                  IMAGE_DECLARATION(src1),
@@ -198,7 +198,7 @@ __kernel void gemm_mm_interleaved_transposed_f32(IMAGE_DECLARATION(src0),
                                                  uint dst_stride_z
 #if defined(REINTERPRET_OUTPUT_AS_3D)
                                                  ,
-                                                 uint pad_bottom
+                                                 uint cross_plane_pad
 #endif // REINTERPRET_OUTPUT_AS_3D
                                                 )
 {
@@ -286,26 +286,26 @@ __kernel void gemm_mm_interleaved_transposed_f32(IMAGE_DECLARATION(src0),
 
 #if defined(REINTERPRET_OUTPUT_AS_3D)
     // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
-    // in order to take into account the presence of possible bottom paddings
+    // in order to take into account the presence of possible cross plane paddings
     //
-    //  |             |
-    //  |    plane0   |
-    //  |             |
-    //  |_____________|
-    //  |*************|
-    //  |  pad_bottom |
-    //  |*************|
-    //  |             |
-    //  |    plane1   |
-    //  |             |
-    //  |_____________|
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
 
     // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
     uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
     zout       = min(DEPTH_GEMM3D - 1, zout);
 
-    // Add offset due to the bottom paddings
-    zout *= (pad_bottom * dst_stride_y);
+    // Add offset due to the cross plane paddings
+    zout *= (cross_plane_pad * dst_stride_y);
 
     // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
     // multiply dst_stride_z by DEPTH_GEMM3D
@@ -366,7 +366,7 @@ __kernel void gemm_mm_interleaved_transposed_f32(IMAGE_DECLARATION(src0),
  * @param[in]  src0_stride_z                      Stride of the source matrix in Z dimension (in bytes)
  * @param[in]  src1_stride_z                      Stride of the source matrix in Z dimension (in bytes)
  * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  pad_bottom                         Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ * @param[in]  cross_plane_pad                    Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
  */
 __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0),
                                                          IMAGE_DECLARATION(src1),
@@ -376,7 +376,7 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0)
                                                          uint dst_stride_z
 #if defined(REINTERPRET_OUTPUT_AS_3D)
                                                          ,
-                                                         uint pad_bottom
+                                                         uint cross_plane_pad
 #endif // REINTERPRET_OUTPUT_AS_3D
                                                         )
 {
@@ -596,26 +596,26 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0)
 
 #if defined(REINTERPRET_OUTPUT_AS_3D)
     // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
-    // in order to take into account the presence of possible bottom paddings
+    // in order to take into account the presence of possible cross plane paddings
     //
-    //  |             |
-    //  |    plane0   |
-    //  |             |
-    //  |_____________|
-    //  |*************|
-    //  |  pad_bottom |
-    //  |*************|
-    //  |             |
-    //  |    plane1   |
-    //  |             |
-    //  |_____________|
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
 
     // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
     uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
     zout       = min(DEPTH_GEMM3D - 1, zout);
 
-    // Add offset due to the bottom paddings
-    zout *= (pad_bottom * dst_stride_y);
+    // Add offset due to the cross plane paddings
+    zout *= (cross_plane_pad * dst_stride_y);
 
     // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
     // multiply dst_stride_z by DEPTH_GEMM3D
@@ -679,7 +679,7 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0)
  * @param[in]  src0_stride_z                      Stride of the source matrix in Z dimension (in bytes)
  * @param[in]  src1_stride_z                      Stride of the source matrix in Z dimension (in bytes)
  * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  pad_bottom                         Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ * @param[in]  cross_plane_pad                    Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
  */
 __kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
                                                  IMAGE_DECLARATION(src1),
@@ -689,7 +689,7 @@ __kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
                                                  uint dst_stride_z
 #if defined(REINTERPRET_OUTPUT_AS_3D)
                                                  ,
-                                                 uint pad_bottom
+                                                 uint cross_plane_pad
 #endif // REINTERPRET_OUTPUT_AS_3D
                                                 )
 {
@@ -777,26 +777,26 @@ __kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
 
 #if defined(REINTERPRET_OUTPUT_AS_3D)
     // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
-    // in order to take into account the presence of possible bottom paddings
+    // in order to take into account the presence of possible cross plane paddings
     //
-    //  |             |
-    //  |    plane0   |
-    //  |             |
-    //  |_____________|
-    //  |*************|
-    //  |  pad_bottom |
-    //  |*************|
-    //  |             |
-    //  |    plane1   |
-    //  |             |
-    //  |_____________|
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
 
     // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
     uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
     zout       = min(DEPTH_GEMM3D - 1, zout);
 
-    // Add offset due to the bottom paddings
-    zout *= (pad_bottom * dst_stride_y);
+    // Add offset due to the cross plane paddings
+    zout *= (cross_plane_pad * dst_stride_y);
 
     // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
     // multiply dst_stride_z by DEPTH_GEMM3D
@@ -853,7 +853,7 @@ __kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
  * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
  * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
- * @param[in]  pad_bottom                         Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ * @param[in]  cross_plane_pad                    Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
  */
 __kernel void gemm_mm_interleaved_transposed_f16_bifrost(IMAGE_DECLARATION(src0),
                                                          IMAGE_DECLARATION(src1),
@@ -863,7 +863,7 @@ __kernel void gemm_mm_interleaved_transposed_f16_bifrost(IMAGE_DECLARATION(src0)
                                                          uint dst_stride_z
 #if defined(REINTERPRET_OUTPUT_AS_3D)
                                                          ,
-                                                         uint pad_bottom
+                                                         uint cross_plane_pad
 #endif // REINTERPRET_OUTPUT_AS_3D
                                                         )
 {
@@ -1033,26 +1033,26 @@ __kernel void gemm_mm_interleaved_transposed_f16_bifrost(IMAGE_DECLARATION(src0)
 
 #if defined(REINTERPRET_OUTPUT_AS_3D)
     // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
-    // in order to take into account the presence of possible bottom paddings
+    // in order to take into account the presence of possible cross plane paddings
     //
-    //  |             |
-    //  |    plane0   |
-    //  |             |
-    //  |_____________|
-    //  |*************|
-    //  |  pad_bottom |
-    //  |*************|
-    //  |             |
-    //  |    plane1   |
-    //  |             |
-    //  |_____________|
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
 
     // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
     uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
     zout       = min(DEPTH_GEMM3D - 1, zout);
 
-    // Add offset due to the bottom paddings
-    zout *= (pad_bottom * dst_stride_y);
+    // Add offset due to the cross plane paddings
+    zout *= (cross_plane_pad * dst_stride_y);
 
     // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
     // multiply dst_stride_z by DEPTH_GEMM3D
@@ -1122,7 +1122,7 @@ __kernel void gemm_mm_interleaved_transposed_f16_bifrost(IMAGE_DECLARATION(src0)
  * @param[in]  src0_stride_z                      Stride of the source matrix in Z dimension (in bytes)
  * @param[in]  src1_stride_z                      Stride of the source matrix in Z dimension (in bytes)
  * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  pad_bottom                         Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ * @param[in]  cross_plane_pad                    Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
  */
 __kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0),
                                      IMAGE_DECLARATION(src1),
@@ -1132,7 +1132,7 @@ __kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0),
                                      uint dst_stride_z
 #if defined(REINTERPRET_OUTPUT_AS_3D)
                                      ,
-                                     uint pad_bottom
+                                     uint cross_plane_pad
 #endif // REINTERPRET_OUTPUT_AS_3D
                                     )
 {
@@ -1261,26 +1261,26 @@ __kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0),
 
 #if defined(REINTERPRET_OUTPUT_AS_3D)
     // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
-    // in order to take into account the presence of possible bottom paddings
+    // in order to take into account the presence of possible cross plane paddings
     //
-    //  |             |
-    //  |    plane0   |
-    //  |             |
-    //  |_____________|
-    //  |*************|
-    //  |  pad_bottom |
-    //  |*************|
-    //  |             |
-    //  |    plane1   |
-    //  |             |
-    //  |_____________|
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
 
     // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
     uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
     zout       = min(DEPTH_GEMM3D - 1, zout);
 
-    // Add offset due to the bottom paddings
-    zout *= (pad_bottom * dst_stride_y);
+    // Add offset due to the cross plane paddings
+    zout *= (cross_plane_pad * dst_stride_y);
 
     // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
     // multiply dst_stride_z by DEPTH_GEMM3D
@@ -1362,7 +1362,7 @@ __kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0),
  * @param[in]  src0_stride_z                      Stride of the source matrix in Z dimension (in bytes)
  * @param[in]  src1_stride_z                      Stride of the source matrix in Z dimension (in bytes)
  * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  pad_bottom                         Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ * @param[in]  cross_plane_pad                    Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
  */
 __kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0),
                                                  IMAGE_DECLARATION(src1),
@@ -1372,7 +1372,7 @@ __kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0),
                                                  uint dst_stride_z
 #if defined(REINTERPRET_OUTPUT_AS_3D)
                                                  ,
-                                                 uint pad_bottom
+                                                 uint cross_plane_pad
 #endif // REINTERPRET_OUTPUT_AS_3D
                                                 )
 {
@@ -1657,26 +1657,26 @@ __kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0),
 
 #if defined(REINTERPRET_OUTPUT_AS_3D)
     // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
-    // in order to take into account the presence of possible bottom paddings
+    // in order to take into account the presence of possible cross plane paddings
     //
-    //  |             |
-    //  |    plane0   |
-    //  |             |
-    //  |_____________|
-    //  |*************|
-    //  |  pad_bottom |
-    //  |*************|
-    //  |             |
-    //  |    plane1   |
-    //  |             |
-    //  |_____________|
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
 
     // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
     uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
     zout       = min(DEPTH_GEMM3D - 1, zout);
 
-    // Add offset due to the bottom paddings
-    zout *= (pad_bottom * dst_stride_y);
+    // Add offset due to the cross plane paddings
+    zout *= (cross_plane_pad * dst_stride_y);
 
     // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
     // multiply dst_stride_z by DEPTH_GEMM3D
@@ -1750,7 +1750,7 @@ __kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0),
  * @param[in]  src0_stride_z                      Stride of the source matrix in Z dimension (in bytes)
  * @param[in]  src1_stride_z                      Stride of the source matrix in Z dimension (in bytes)
  * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  pad_bottom                         Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ * @param[in]  cross_plane_pad                    Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
  */
 __kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),
                                                       IMAGE_DECLARATION(src1),
@@ -1760,7 +1760,7 @@ __kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),
                                                       uint dst_stride_z
 #if defined(REINTERPRET_OUTPUT_AS_3D)
                                                       ,
-                                                      uint pad_bottom
+                                                      uint cross_plane_pad
 #endif // REINTERPRET_OUTPUT_AS_3D
                                                      )
 {
@@ -1975,26 +1975,26 @@ __kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),
 
 #if defined(REINTERPRET_OUTPUT_AS_3D)
     // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
-    // in order to take into account the presence of possible bottom paddings
+    // in order to take into account the presence of possible cross plane paddings
     //
-    //  |             |
-    //  |    plane0   |
-    //  |             |
-    //  |_____________|
-    //  |*************|
-    //  |  pad_bottom |
-    //  |*************|
-    //  |             |
-    //  |    plane1   |
-    //  |             |
-    //  |_____________|
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
 
     // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
     uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
     zout       = min(DEPTH_GEMM3D - 1, zout);
 
-    // Add offset due to the bottom paddings
-    zout *= (pad_bottom * dst_stride_y);
+    // Add offset due to the cross plane paddings
+    zout *= (cross_plane_pad * dst_stride_y);
 
     // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
     // multiply dst_stride_z by DEPTH_GEMM3D
@@ -2068,7 +2068,7 @@ __kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),
  * @param[in]  src0_stride_z                      Stride of the source matrix in Z dimension (in bytes)
  * @param[in]  src1_stride_z                      Stride of the source matrix in Z dimension (in bytes)
  * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  pad_bottom                         Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ * @param[in]  cross_plane_pad                    Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
  */
 __kernel void gemm_mm_floating_point_f16_bifrost(IMAGE_DECLARATION(src0),
                                                  IMAGE_DECLARATION(src1),
@@ -2078,7 +2078,7 @@ __kernel void gemm_mm_floating_point_f16_bifrost(IMAGE_DECLARATION(src0),
                                                  uint dst_stride_z
 #if defined(REINTERPRET_OUTPUT_AS_3D)
                                                  ,
-                                                 uint pad_bottom
+                                                 uint cross_plane_pad
 #endif // REINTERPRET_OUTPUT_AS_3D
                                                 )
 {
@@ -2241,26 +2241,26 @@ __kernel void gemm_mm_floating_point_f16_bifrost(IMAGE_DECLARATION(src0),
 
 #if defined(REINTERPRET_OUTPUT_AS_3D)
     // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
-    // in order to take into account the presence of possible bottom paddings
+    // in order to take into account the presence of possible cross plane paddings
     //
-    //  |             |
-    //  |    plane0   |
-    //  |             |
-    //  |_____________|
-    //  |*************|
-    //  |  pad_bottom |
-    //  |*************|
-    //  |             |
-    //  |    plane1   |
-    //  |             |
-    //  |_____________|
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
 
     // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
     uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
     zout       = min(DEPTH_GEMM3D - 1, zout);
 
-    // Add offset due to the bottom paddings
-    zout *= (pad_bottom * dst_stride_y);
+    // Add offset due to the cross plane paddings
+    zout *= (cross_plane_pad * dst_stride_y);
 
     // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
     // multiply dst_stride_z by DEPTH_GEMM3D
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
index 814cbb631f..0c629af788 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
@@ -358,8 +358,9 @@ void CLGEMMMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &que
     if(_is_gemm3d)
     {
         // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
-        const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3;
-        _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(_output->info()->padding().bottom));
+        const unsigned int idx0                  = 3 * num_arguments_per_2D_tensor() + 3;
+        const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom;
+        _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
     }
 
     do
-- 
cgit v1.2.1