COMPMID-1899: Fix NaN issue in CLGEMMMatrixMultiplyReshapedKernel

Change-Id: Ide950b46c4d41de230c272c7044a03f4f9f237ed Reviewed-on: https://review.mlplatform.org/548 Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com> Reviewed-by: Giuseppe Rossini <giuseppe.rossini@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com>
author: Gian Marco Iodice <gianmarco.iodice@arm.com> 2019-01-21 17:14:31 +0000
committer: Gian Marco Iodice <gianmarco.iodice@arm.com> 2019-01-21 18:52:34 +0000
commit: b87b95e13cce28523e25d6a50af3147edfb43357 (patch)
tree: 7261a482a140c4fe082caa1c172ead302be23240
parent: a0a0e29f635de08092c2325f8f049ffb286aabaf (diff)
download: ComputeLibrary-b87b95e13cce28523e25d6a50af3147edfb43357.tar.gz
3 files changed, 43 insertions, 3 deletions
diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl
index 3a76b74b2f..200f919439 100644
--- a/src/core/CL/cl_kernels/gemm.cl
+++ b/src/core/CL/cl_kernels/gemm.cl
@@ -24,12 +24,30 @@
 #include "helpers.h"
 #include "repeat.h"
 
-#if defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE)
+#if defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH)
+#define INC2 (VEC_DATA_TYPE(uint, 2))(0, 1)
+#define INC3 (VEC_DATA_TYPE(uint, 3))(0, 1, 2)
+#define INC4 (VEC_DATA_TYPE(uint, 4))(0, 1, 2, 3)
+#define INC8 (VEC_DATA_TYPE(uint, 8))(0, 1, 2, 3, 4, 5, 6, 7)
+#define INC16 (VEC_DATA_TYPE(uint, 16))(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+#define CONCAT_INC(K0) INC##K0
+#define INC(K0) CONCAT_INC(K0)
+
+#if(SRC_WIDTH % K0)
+#define BOUNDARY_CONDITION_X(x, a)                                                                                                                   \
+    ({                                                                                                                                               \
+        a = select(0, a, CONVERT(((x * (VEC_DATA_TYPE(uint, K0))K0 + INC(K0)) < (VEC_DATA_TYPE(uint, K0))SRC_WIDTH), VEC_DATA_TYPE(DATA_TYPE, K0))); \
+    })
+#else // (SRC_WIDTH % K0)
+#define BOUNDARY_CONDITION_X(x, a) \
+    ({})
+#endif // (SRC_WIDTH % K0)
 
 /** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks of size M0xK0 and stores each one (not transposed) in
  *  the output matrix unrolling the values.
  *
  * @note The data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=float)
+ * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (i.e. -DSRC_WIDTH=16)
  * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (i.e. -DM0=2, -DK0=2).
  * @note The number of M0xK0 vertical blocks to store on the same output row must be passed at compile time using -DV0 (i.e. -DV0=2)
  * @note Only the following values for M0, K0 and V0 are supported:
@@ -179,33 +197,41 @@ __kernel void gemm_reshape_lhs_matrix_nt(TENSOR3D_DECLARATION(src),
     // Load values from the LHS matrix
     VEC_DATA_TYPE(DATA_TYPE, K0)
     a0 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y + zin0));
+    BOUNDARY_CONDITION_X(x, a0);
 #if M0 > 1
     VEC_DATA_TYPE(DATA_TYPE, K0)
     a1 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y + zin1));
+    BOUNDARY_CONDITION_X(x, a1);
 #endif // M0 > 1
 #if M0 > 2
     VEC_DATA_TYPE(DATA_TYPE, K0)
     a2 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y + zin2));
+    BOUNDARY_CONDITION_X(x, a2);
 #endif // M0 > 2
 #if M0 > 3
     VEC_DATA_TYPE(DATA_TYPE, K0)
     a3 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y + zin3));
+    BOUNDARY_CONDITION_X(x, a3);
 #endif // M0 > 3
 #if M0 > 4
     VEC_DATA_TYPE(DATA_TYPE, K0)
     a4 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y + zin4));
+    BOUNDARY_CONDITION_X(x, a4);
 #endif // M0 > 4
 #if M0 > 5
     VEC_DATA_TYPE(DATA_TYPE, K0)
     a5 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y + zin5));
+    BOUNDARY_CONDITION_X(x, a5);
 #endif // M0 > 5
 #if M0 > 6
     VEC_DATA_TYPE(DATA_TYPE, K0)
     a6 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y + zin6));
+    BOUNDARY_CONDITION_X(x, a6);
 #endif // M0 > 6
 #if M0 > 7
     VEC_DATA_TYPE(DATA_TYPE, K0)
     a7 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y + zin7));
+    BOUNDARY_CONDITION_X(x, a7);
 #endif // M0 > 7
 
     // ---------------------------Store output values ------------------------------
@@ -320,6 +346,7 @@ __kernel void gemm_reshape_lhs_matrix_nt(TENSOR3D_DECLARATION(src),
  *  the output matrix unrolling the values.
  *
  * @note The data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=float)
+ * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (i.e. -DSRC_WIDTH=16)
  * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (i.e. -DM0=2, -DK0=2).
  * @note The number of M0xK0 vertical blocks to store on the same output row must be passed at compile time using -DV0 (i.e. -DV0=2)
  * @note Only the following values for M0, K0 and V0 are supported:
@@ -469,33 +496,41 @@ __kernel void gemm_reshape_lhs_matrix_t(TENSOR3D_DECLARATION(src),
     // Load values from the LHS matrix
     VEC_DATA_TYPE(DATA_TYPE, K0)
     a0 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y + zin0));
+    BOUNDARY_CONDITION_X(x, a0);
 #if M0 > 1
     VEC_DATA_TYPE(DATA_TYPE, K0)
     a1 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y + zin1));
+    BOUNDARY_CONDITION_X(x, a1);
 #endif // M0 > 1
 #if M0 > 2
     VEC_DATA_TYPE(DATA_TYPE, K0)
     a2 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y + zin2));
+    BOUNDARY_CONDITION_X(x, a2);
 #endif // M0 > 2
 #if M0 > 3
     VEC_DATA_TYPE(DATA_TYPE, K0)
     a3 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y + zin3));
+    BOUNDARY_CONDITION_X(x, a3);
 #endif // M0 > 3
 #if M0 > 4
     VEC_DATA_TYPE(DATA_TYPE, K0)
     a4 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y + zin4));
+    BOUNDARY_CONDITION_X(x, a4);
 #endif // M0 > 4
 #if M0 > 5
     VEC_DATA_TYPE(DATA_TYPE, K0)
     a5 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y + zin5));
+    BOUNDARY_CONDITION_X(x, a5);
 #endif // M0 > 5
 #if M0 > 6
     VEC_DATA_TYPE(DATA_TYPE, K0)
     a6 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y + zin6));
+    BOUNDARY_CONDITION_X(x, a6);
 #endif // M0 > 6
 #if M0 > 7
     VEC_DATA_TYPE(DATA_TYPE, K0)
     a7 = VLOAD(K0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y + zin7));
+    BOUNDARY_CONDITION_X(x, a7);
 #endif // M0 > 7
 
     // ---------------------------Transpose and store block -----------------------
@@ -529,7 +564,7 @@ __kernel void gemm_reshape_lhs_matrix_t(TENSOR3D_DECLARATION(src),
 #undef OUTPUT_OFFSET_X
 #undef OUTPUT_STEP_X
 }
-#endif // defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE)
+#endif // defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH)
 
 #if defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT)
 /** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks of size K0xN0 and stores each one (not transposed) in
diff --git a/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp b/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp
index ca0ebd63df..242924b1d1 100644
--- a/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp
@@ -133,6 +133,7 @@ void CLGEMMReshapeLHSMatrixKernel::configure(const ICLTensor *input, ICLTensor *
     build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
     build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0));
     build_opts.add_option("-DV0=" + support::cpp11::to_string(lhs_info.v0));
+    build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
     build_opts.add_option_if(lhs_info.interleave, "-DINTERLEAVE");
     build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
     build_opts.add_option_if(_reinterpret_input_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(input->info()->dimension(1)));
diff --git a/tests/validation/fixtures/GEMMFixture.h b/tests/validation/fixtures/GEMMFixture.h
index 24c9d96611..a6a3b67785 100644
--- a/tests/validation/fixtures/GEMMFixture.h
+++ b/tests/validation/fixtures/GEMMFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -187,6 +187,10 @@ protected:
     {
         std::uniform_real_distribution<> distribution(-1.0f, 1.0f);
         library->fill(tensor, distribution, i);
+
+        // Fill border with infinity in order to check the presence of NaN values (i.e. inf * 0)
+        std::uniform_real_distribution<> distribution_inf(std::numeric_limits<float>::infinity(), std::numeric_limits<float>::infinity());
+        library->fill_borders_with_garbage(tensor, distribution_inf, i);
     }
 
     TensorType compute_target(const TensorShape &lhs_shape, const TensorShape &rhs_shape, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, DataType data_type, float alpha)
author	Gian Marco Iodice <gianmarco.iodice@arm.com>	2019-01-21 17:14:31 +0000
committer	Gian Marco Iodice <gianmarco.iodice@arm.com>	2019-01-21 18:52:34 +0000
commit	b87b95e13cce28523e25d6a50af3147edfb43357 (patch)
tree	7261a482a140c4fe082caa1c172ead302be23240
parent	a0a0e29f635de08092c2325f8f049ffb286aabaf (diff)
download	ComputeLibrary-b87b95e13cce28523e25d6a50af3147edfb43357.tar.gz