From 99b1a1cc1bdeaec08d2a8fb5ac5d104502e05570 Mon Sep 17 00:00:00 2001
From: Manuel Bottini
Date: Mon, 29 Mar 2021 17:15:21 +0100
Subject: Fix bug on Implicit Padding for CL GEMMMatrixMultiplyInterleavedTransposed

Resolves: COMPMID-4342

Change-Id: I468c6d68c0284e4ec76f22037a697fff7bc5638c
Signed-off-by: Manuel Bottini
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5391
Reviewed-by: Michele Di Giorgio
Reviewed-by: Georgios Pinitas
Comments-Addressed: Arm Jenkins
Tested-by: Arm Jenkins
---
 src/core/CL/cl_kernels/gemm_v1.cl                  | 21 +++++++++++++--------
 src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp |  2 +-
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/src/core/CL/cl_kernels/gemm_v1.cl b/src/core/CL/cl_kernels/gemm_v1.cl
index 5f8b4f694e..a136a1b96b 100644
--- a/src/core/CL/cl_kernels/gemm_v1.cl
+++ b/src/core/CL/cl_kernels/gemm_v1.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,12 +24,13 @@
 #include "gemm_helpers.h"
 #include "repeat.h"
 
-#if defined(M) && defined(N) && defined(K) && defined(H0) && defined(V0) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
+#if defined(M) && defined(N) && defined(K) && defined(H0) && defined(V0) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) && defined(IN1_DIM_X)
 /** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)
  *
  * @note The number of rows of destination matrix must be passed at compile time using -DM
  * @note The number of columns of the destination matrix must be passed at compile time using -DN
  * @note The number of rows of the *un-reshaped* matrix B (K) must be passed at compile time using -DK
+ * @note The number of columns of the reshaped rhs matrix must be passed at compile time using -DIN1_DIM_X
  * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
  * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
  * @note The optional alpha's value need to be passed at compile time using -DALPHA
@@ -118,7 +119,7 @@ __kernel void gemm_mm_interleaved_transposed_f32(IMAGE_DECLARATION(src0),
     __global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes);
 
     // Compute end row address for matrix B
-    __global float *src_end_addr_b = src_addr_b + (src1_stride_y / sizeof(float));
+    __global float *src_end_addr_b = src_addr_b + IN1_DIM_X;
 
     src_addr_a += offset_row_a;
     src_addr_b += offset_row_b;
@@ -583,6 +584,7 @@ __kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0)
  * @note The number of rows of destination matrix must be passed at compile time using -DM
  * @note The number of columns of the destination matrix must be passed at compile time using -DN
  * @note The number of rows of the *un-reshaped* matrix B (K) must be passed at compile time using -DK
+ * @note The number of columns of the reshaped rhs matrix must be passed at compile time using -DIN1_DIM_X
  * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
  * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
  * @note The optional alpha's value need to be passed at compile time using -DALPHA
@@ -671,7 +673,7 @@ __kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
     __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);
 
     // Compute end row address for matrix B
-    __global half *src_end_addr_b = src_addr_b + (src1_stride_y / sizeof(half));
+    __global half *src_end_addr_b = src_addr_b + IN1_DIM_X;
 
     src_addr_a += offset_row_a;
     src_addr_b += offset_row_b;
@@ -807,6 +809,7 @@ __kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
  * @note The number of rows of destination matrix must be passed at compile time using -DM
  * @note The number of columns of the destination matrix must be passed at compile time using -DN
  * @note The number of rows of the *un-reshaped* matrix B (K) must be passed at compile time using -DK
+ * @note The number of columns of the reshaped rhs matrix must be passed at compile time using -DIN1_DIM_X
  * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
  * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
  * @note The optional alpha's value need to be passed at compile time using -DALPHA
@@ -895,7 +898,7 @@ __kernel void gemm_mm_interleaved_transposed_f16_acc32(IMAGE_DECLARATION(src0),
     __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);
 
     // Compute end row address for matrix B
-    __global half *src_end_addr_b = src_addr_b + (src1_stride_y / sizeof(half));
+    __global half *src_end_addr_b = src_addr_b + IN1_DIM_X;
 
     src_addr_a += offset_row_a;
     src_addr_b += offset_row_b;
@@ -1337,7 +1340,7 @@ __kernel void gemm_mm_interleaved_transposed_f16_bifrost(IMAGE_DECLARATION(src0)
 
 #endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
 
-#endif // defined(M) && defined(N) && defined(K) && defined(H0) && defined(V0) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
+#endif // defined(M) && defined(N) && defined(K) && defined(H0) && defined(V0) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) && defined(IN1_DIM_X)
 
 #if defined(N) && defined(K) && defined(M0) && defined(N0) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
 #if defined(DATA_TYPE)
@@ -2023,7 +2026,8 @@ __kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0),
 
     // Compute dst address
     __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0,
-                               PARTIAL_STORE_M0) * dst_stride_y);
+                               PARTIAL_STORE_M0)
+                               * dst_stride_y);
 
     uint4 zout = 0;
 
@@ -2427,7 +2431,8 @@ __kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),
 
     // Compute dst address
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0,
-                               PARTIAL_STORE_M0) * dst_stride_y);
+                               PARTIAL_STORE_M0)
+                               * dst_stride_y);
 
     uint4 zout = 0;
 
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
index 1ef46b5059..479c06330d 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
@@ -352,6 +352,7 @@ void CLGEMMMatrixMultiplyKernel::configure(const CLCompileContext &compile_conte
     build_opts.add_option_if(activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(activation_info.activation())));
     build_opts.add_option_if(activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(activation_info.a()));
     build_opts.add_option_if(activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(activation_info.b()));
+    build_opts.add_option("-DIN1_DIM_X=" + support::cpp11::to_string(input1->info()->dimension(0)));
 
     const bool is_bifrost = get_arch_from_target(gpu_target) == GPUTarget::BIFROST;
 
@@ -424,7 +425,6 @@ void CLGEMMMatrixMultiplyKernel::configure(const CLCompileContext &compile_conte
             kernel_name = "gemm_mm_floating_point";
         }
     }
-
     // Create kernel
     _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
 
--
cgit v1.2.1
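
The essence of the fix, for readers outside the kernel code: a reshaped RHS row may be stored with implicit padding, so src1_stride_y (the byte distance between consecutive rows) can be larger than the number of valid elements times the element size. Bounding the inner loop of the interleaved-transposed kernels with the stride therefore walks into the padding, while bounding it with the logical row length (IN1_DIM_X, now passed as a -D build option from CLGEMMMatrixMultiplyKernel::configure()) stops at the last valid element. Below is a minimal host-side C++ sketch of that difference; the row width, padding amount, and variable names are invented for illustration and only mirror, not reproduce, the kernel code.

#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
    // Hypothetical reshaped RHS row: 6 valid floats (the logical width, i.e. what
    // IN1_DIM_X would be), padded to 8 floats so the row-to-row stride is aligned.
    const std::size_t in1_dim_x      = 6;                 // valid elements per row (assumed)
    const std::size_t stride_y_bytes = 8 * sizeof(float); // row stride including implicit padding (assumed)

    std::vector<float> row(stride_y_bytes / sizeof(float), -1.0f); // -1 marks the padding region
    for(std::size_t i = 0; i < in1_dim_x; ++i)
    {
        row[i] = static_cast<float>(i); // valid data: 0, 1, 2, 3, 4, 5
    }

    const float *src_addr_b = row.data();

    // Old bound: derived from the padded stride, so the walk also consumes the padding.
    const float *end_from_stride = src_addr_b + stride_y_bytes / sizeof(float);
    // New bound: derived from the logical row length, as the patch does via -DIN1_DIM_X.
    const float *end_from_dim = src_addr_b + in1_dim_x;

    float sum_stride = 0.0f;
    float sum_dim    = 0.0f;
    for(const float *p = src_addr_b; p < end_from_stride; ++p)
    {
        sum_stride += *p; // 15 - 2 = 13: polluted by the two padding elements
    }
    for(const float *p = src_addr_b; p < end_from_dim; ++p)
    {
        sum_dim += *p; // 0+1+2+3+4+5 = 15: only valid data
    }

    std::cout << "bound from stride:    " << sum_stride << '\n';
    std::cout << "bound from IN1_DIM_X: " << sum_dim << '\n';
    return 0;
}

Passing the width as a compile-time define rather than an extra runtime argument matches how the other shape constants this kernel already relies on (M, N, K, H0, V0) are provided, as listed in the #if guard the patch extends.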