author     Gian Marco Iodice <gianmarco.iodice@arm.com>   2021-03-19 11:26:20 +0000
committer  Gian Marco Iodice <gianmarco.iodice@arm.com>   2021-03-23 14:16:34 +0000
commit     5c9eed82102b29381cfcf36a0e98d3c5239f37e5 (patch)
tree       fdfcbe642f4d7896e477583d74f81e7e30271465 /src
parent     1efed925da927cc47bff6327c66f252b65c660bc (diff)
download   ComputeLibrary-5c9eed82102b29381cfcf36a0e98d3c5239f37e5.tar.gz
Extend direct convolution (F32/F16/QASYMM8)
The new function can handle different block sizes (M0, N0).

New utility macros have been developed to simplify this work and
future OpenCL kernel development.

In particular, the work has also been done to cover:
- texture pipe support
- dynamic tensor shape support

Change-Id: Ife4c64baf07517938bb8ad18e6a5f4579345c40f
Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5297
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Giorgio Arena <giorgio.arena@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
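As a rough sketch of what the (M0, N0) blocking means (illustrative OpenCL C with assumed M0/N0 values, not the library kernel): each work-item produces an M0 x N0 block of the NHWC output, where M0 spans the collapsed width x height dimension and N0 spans the output channels.

#define M0 2 // output pixels (collapsed W*H) per work-item - assumed value
#define N0 4 // output channels per work-item - assumed value

__kernel void m0xn0_block_example(__global float *dst, int dst_c)
{
    const int cout = get_global_id(0) * N0; // first output channel of this block
    const int mout = get_global_id(1) * M0; // first output pixel (y * dst_w + x) of this block

    float acc[M0][N0] = { { 0.0f } }; // per-work-item accumulator tile kept in registers

    // ... accumulate the convolution over the filter footprint and the input channels ...

    for(int m = 0; m < M0; ++m)
    {
        for(int n = 0; n < N0; ++n)
        {
            dst[(mout + m) * dst_c + (cout + n)] = acc[m][n];
        }
    }
}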
Diffstat (limited to 'src')
-rw-r--r--   src/core/CL/CLKernelLibrary.cpp                           2
-rw-r--r--   src/core/CL/cl_kernels/direct_convolution.cl            616
-rw-r--r--   src/core/CL/cl_kernels/tile_helpers.h                   420
-rw-r--r--   src/core/gpu/cl/kernels/ClDirectConvolutionKernel.cpp    27
4 files changed, 582 insertions, 483 deletions
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index 14d3a2cad5..726efa3575 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -547,7 +547,7 @@ const std::map<std::string, std::string> CLKernelLibrary::_program_source_map =
{
"convert_fc_weights.cl",
#include "./cl_kernels/convert_fc_weights.clembed"
- },
+ },
{
"convolution_layer.cl",
#include "./cl_kernels/convolution_layer.clembed"
diff --git a/src/core/CL/cl_kernels/direct_convolution.cl b/src/core/CL/cl_kernels/direct_convolution.cl
index 5d2a24e740..1de3737965 100644
--- a/src/core/CL/cl_kernels/direct_convolution.cl
+++ b/src/core/CL/cl_kernels/direct_convolution.cl
@@ -21,375 +21,12 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "gemm_helpers.h"
-#include "helpers_asymm.h"
-#include "repeat.h"
-
-#if defined(IS_QUANTIZED)
-
-#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
-#define ARM_DOT(x, y, val) val = arm_dot_acc((x), (y), (val));
-#elif defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
-#define ARM_DOT(x, y, val) val += arm_dot((x), (y));
-#else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
-#define ARM_DOT(x, y, val) \
- ({ \
- val += (ACC_DATA_TYPE)x.s0 * (ACC_DATA_TYPE)y.s0; \
- val += (ACC_DATA_TYPE)x.s1 * (ACC_DATA_TYPE)y.s1; \
- val += (ACC_DATA_TYPE)x.s2 * (ACC_DATA_TYPE)y.s2; \
- val += (ACC_DATA_TYPE)x.s3 * (ACC_DATA_TYPE)y.s3; \
- })
-#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
-
-#define ARM_DOT1(a, b, c) \
- ({ \
- ARM_DOT(((VEC_DATA_TYPE(SRC_DATA_TYPE, 4))(a, (VEC_DATA_TYPE(SRC_DATA_TYPE, 3))0)), ((VEC_DATA_TYPE(WEI_DATA_TYPE, 4))(b, (VEC_DATA_TYPE(WEI_DATA_TYPE, 3))0)), c); \
- })
-#define ARM_DOT2(a, b, c) \
- ({ \
- ARM_DOT(((VEC_DATA_TYPE(SRC_DATA_TYPE, 4))(a, (VEC_DATA_TYPE(SRC_DATA_TYPE, 2))0)), ((VEC_DATA_TYPE(WEI_DATA_TYPE, 4))(b, (VEC_DATA_TYPE(WEI_DATA_TYPE, 2))0)), c); \
- })
-#define ARM_DOT3(a, b, c) \
- ({ \
- ARM_DOT(((VEC_DATA_TYPE(SRC_DATA_TYPE, 4))(a, (SRC_DATA_TYPE)0)), ((VEC_DATA_TYPE(WEI_DATA_TYPE, 4))(b, (WEI_DATA_TYPE)0)), c); \
- })
-#define ARM_DOT4(a, b, c) \
- ({ \
- ARM_DOT(a, b, c); \
- })
-#define ARM_DOT8(a, b, c) \
- ({ \
- ARM_DOT4((a.lo), (b.lo), c); \
- ARM_DOT4((a.hi), (b.hi), c); \
- })
-#define ARM_DOT16(a, b, c) \
- ({ \
- ARM_DOT8((a.lo), (b.lo), c); \
- ARM_DOT8((a.hi), (b.hi), c); \
- })
-
-#define ARM_OFFSET1(a, b, c) \
- ({ \
- c += (ACC_DATA_TYPE)a * (ACC_DATA_TYPE)b; \
- })
-#define ARM_OFFSET2(a, b, c) \
- ({ \
- c += (ACC_DATA_TYPE)a.s0 * (ACC_DATA_TYPE)b; \
- c += (ACC_DATA_TYPE)a.s1 * (ACC_DATA_TYPE)b; \
- })
-#define ARM_OFFSET3(a, b, c) \
- ({ \
- ARM_OFFSET2(a, b, c); \
- c += (ACC_DATA_TYPE)a.s2 * (ACC_DATA_TYPE)b; \
- })
-#define ARM_OFFSET4(a, b, c) \
- ({ \
- ARM_OFFSET3(a, b, c); \
- c += (ACC_DATA_TYPE)a.s3 * (ACC_DATA_TYPE)b; \
- })
-#define ARM_OFFSET8(a, b, c) \
- ({ \
- ARM_OFFSET4((a.lo), (b), c); \
- ARM_OFFSET4((a.hi), (b), c); \
- })
-#define ARM_OFFSET16(a, b, c) \
- ({ \
- ARM_OFFSET8((a.lo), (b), c); \
- ARM_OFFSET8((a.hi), (b), c); \
- })
-
-#if N0 == 1
-#define ARM_OFFSET_K0XN0(k0, a, b, a_offset, b_offset, c) \
- ({ \
- CONCAT(ARM_OFFSET, k0) \
- ((a), (b_offset), (c)); \
- CONCAT(ARM_OFFSET, k0) \
- ((b##0), (a_offset), (c)); \
- })
-#elif N0 == 2 // N) == 3
-#define ARM_OFFSET_K0XN0(k0, a, b, a_offset, b_offset, c) \
- ({ \
- CONCAT(ARM_OFFSET, k0) \
- ((a), (b_offset), (c.s0)); \
- CONCAT(ARM_OFFSET, k0) \
- ((b##0), (a_offset), (c.s0)); \
- CONCAT(ARM_OFFSET, k0) \
- ((a), (b_offset), (c.s1)); \
- CONCAT(ARM_OFFSET, k0) \
- ((b##1), (a_offset), (c.s1)); \
- })
-#elif N0 == 3 // N0 == 3
-#define ARM_OFFSET_K0XN0(k0, a, b, a_offset, b_offset, c) \
- ({ \
- CONCAT(ARM_OFFSET, k0) \
- ((a), (b_offset), (c.s0)); \
- CONCAT(ARM_OFFSET, k0) \
- ((b##0), (a_offset), (c.s0)); \
- CONCAT(ARM_OFFSET, k0) \
- ((a), (b_offset), (c.s1)); \
- CONCAT(ARM_OFFSET, k0) \
- ((b##1), (a_offset), (c.s1)); \
- CONCAT(ARM_OFFSET, k0) \
- ((a), (b_offset), (c.s2)); \
- CONCAT(ARM_OFFSET, k0) \
- ((b##2), (a_offset), (c.s2)); \
- })
-#elif N0 == 4 // N0 == 4
-#define ARM_OFFSET_K0XN0(k0, a, b, a_offset, b_offset, c) \
- ({ \
- CONCAT(ARM_OFFSET, k0) \
- ((a), (b_offset), (c.s0)); \
- CONCAT(ARM_OFFSET, k0) \
- ((b##0), (a_offset), (c.s0)); \
- CONCAT(ARM_OFFSET, k0) \
- ((a), (b_offset), (c.s1)); \
- CONCAT(ARM_OFFSET, k0) \
- ((b##1), (a_offset), (c.s1)); \
- CONCAT(ARM_OFFSET, k0) \
- ((a), (b_offset), (c.s2)); \
- CONCAT(ARM_OFFSET, k0) \
- ((b##2), (a_offset), (c.s2)); \
- CONCAT(ARM_OFFSET, k0) \
- ((a), (b_offset), (c.s3)); \
- CONCAT(ARM_OFFSET, k0) \
- ((b##3), (a_offset), (c.s3)); \
- })
-#elif N0 == 8 // N0 == 8
-#define ARM_OFFSET_K0XN0(k0, a, b, a_offset, b_offset, c) \
- ({ \
- CONCAT(ARM_OFFSET, k0) \
- ((a), (b_offset), (c.s0)); \
- CONCAT(ARM_OFFSET, k0) \
- ((b##0), (a_offset), (c.s0)); \
- CONCAT(ARM_OFFSET, k0) \
- ((a), (b_offset), (c.s1)); \
- CONCAT(ARM_OFFSET, k0) \
- ((b##1), (a_offset), (c.s1)); \
- CONCAT(ARM_OFFSET, k0) \
- ((a), (b_offset), (c.s2)); \
- CONCAT(ARM_OFFSET, k0) \
- ((b##2), (a_offset), (c.s2)); \
- CONCAT(ARM_OFFSET, k0) \
- ((a), (b_offset), (c.s3)); \
- CONCAT(ARM_OFFSET, k0) \
- ((b##3), (a_offset), (c.s3)); \
- CONCAT(ARM_OFFSET, k0) \
- ((a), (b_offset), (c.s4)); \
- CONCAT(ARM_OFFSET, k0) \
- ((b##4), (a_offset), (c.s4)); \
- CONCAT(ARM_OFFSET, k0) \
- ((a), (b_offset), (c.s5)); \
- CONCAT(ARM_OFFSET, k0) \
- ((b##5), (a_offset), (c.s5)); \
- CONCAT(ARM_OFFSET, k0) \
- ((a), (b_offset), (c.s6)); \
- CONCAT(ARM_OFFSET, k0) \
- ((b##6), (a_offset), (c.s6)); \
- CONCAT(ARM_OFFSET, k0) \
- ((a), (b_offset), (c.s7)); \
- CONCAT(ARM_OFFSET, k0) \
- ((b##7), (a_offset), (c.s7)); \
- })
-#elif N0 == 16 // N0 == 16
-#define ARM_OFFSET_K0XN0(k0, a, b, a_offset, b_offset, c) \
- ({ \
- CONCAT(ARM_OFFSET, k0) \
- ((a), (b_offset), (c.s0)); \
- CONCAT(ARM_OFFSET, k0) \
- ((b##0), (a_offset), (c.s0)); \
- CONCAT(ARM_OFFSET, k0) \
- ((a), (b_offset), (c.s1)); \
- CONCAT(ARM_OFFSET, k0) \
- ((b##1), (a_offset), (c.s1)); \
- CONCAT(ARM_OFFSET, k0) \
- ((a), (b_offset), (c.s2)); \
- CONCAT(ARM_OFFSET, k0) \
- ((b##2), (a_offset), (c.s2)); \
- CONCAT(ARM_OFFSET, k0) \
- ((a), (b_offset), (c.s3)); \
- CONCAT(ARM_OFFSET, k0) \
- ((b##3), (a_offset), (c.s3)); \
- CONCAT(ARM_OFFSET, k0) \
- ((a), (b_offset), (c.s4)); \
- CONCAT(ARM_OFFSET, k0) \
- ((b##4), (a_offset), (c.s4)); \
- CONCAT(ARM_OFFSET, k0) \
- ((a), (b_offset), (c.s5)); \
- CONCAT(ARM_OFFSET, k0) \
- ((b##5), (a_offset), (c.s5)); \
- CONCAT(ARM_OFFSET, k0) \
- ((a), (b_offset), (c.s6)); \
- CONCAT(ARM_OFFSET, k0) \
- ((b##6), (a_offset), (c.s6)); \
- CONCAT(ARM_OFFSET, k0) \
- ((a), (b_offset), (c.s7)); \
- CONCAT(ARM_OFFSET, k0) \
- ((b##7), (a_offset), (c.s7)); \
- CONCAT(ARM_OFFSET, k0) \
- ((a), (b_offset), (c.s8)); \
- CONCAT(ARM_OFFSET, k0) \
- ((b##8), (a_offset), (c.s8)); \
- CONCAT(ARM_OFFSET, k0) \
- ((a), (b_offset), (c.s9)); \
- CONCAT(ARM_OFFSET, k0) \
- ((b##9), (a_offset), (c.s9)); \
- CONCAT(ARM_OFFSET, k0) \
- ((a), (b_offset), (c.sA)); \
- CONCAT(ARM_OFFSET, k0) \
- ((b##A), (a_offset), (c.sA)); \
- CONCAT(ARM_OFFSET, k0) \
- ((a), (b_offset), (c.sB)); \
- CONCAT(ARM_OFFSET, k0) \
- ((b##B), (a_offset), (c.sB)); \
- CONCAT(ARM_OFFSET, k0) \
- ((a), (b_offset), (c.sC)); \
- CONCAT(ARM_OFFSET, k0) \
- ((b##C), (a_offset), (c.sC)); \
- CONCAT(ARM_OFFSET, k0) \
- ((a), (b_offset), (c.sD)); \
- CONCAT(ARM_OFFSET, k0) \
- ((b##D), (a_offset), (c.sD)); \
- CONCAT(ARM_OFFSET, k0) \
- ((a), (b_offset), (c.sE)); \
- CONCAT(ARM_OFFSET, k0) \
- ((b##E), (a_offset), (c.sE)); \
- CONCAT(ARM_OFFSET, k0) \
- ((a), (b_offset), (c.sF)); \
- CONCAT(ARM_OFFSET, k0) \
- ((b##F), (a_offset), (c.sF)); \
- })
-#else // N0 not supported
-#error "N0 value not supported"
-#endif // N0 conditions
-#else // defined(IS_QUANTIZED)
-
-#define ARM_DOT1(a, b, c) \
- ({ \
- c += (ACC_DATA_TYPE)a * (ACC_DATA_TYPE)b; \
- })
-#define ARM_DOT2(a, b, c) \
- ({ \
- c += (ACC_DATA_TYPE)a.s0 * (ACC_DATA_TYPE)b.s0; \
- c += (ACC_DATA_TYPE)a.s1 * (ACC_DATA_TYPE)b.s1; \
- })
-#define ARM_DOT3(a, b, c) \
- ({ \
- ARM_DOT2(a, b, c); \
- c += (ACC_DATA_TYPE)a.s2 * (ACC_DATA_TYPE)b.s2; \
- })
-#define ARM_DOT4(a, b, c) \
- ({ \
- ARM_DOT3(a, b, c); \
- c += (ACC_DATA_TYPE)a.s3 * (ACC_DATA_TYPE)b.s3; \
- })
-#define ARM_DOT8(a, b, c) \
- ({ \
- ARM_DOT4((a.lo), (b.lo), c); \
- ARM_DOT4((a.hi), (b.hi), c); \
- })
-#define ARM_DOT16(a, b, c) \
- ({ \
- ARM_DOT8((a.lo), (b.lo), c); \
- ARM_DOT8((a.hi), (b.hi), c); \
- })
-#endif // defined(IS_QUANTIZED)
-#if N0 == 1
-#define ARM_DOT_K0XN0(k0, a, b, c) \
- ({ \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##0), (c)); \
- })
-#elif N0 == 2 // N) == 3
-#define ARM_DOT_K0XN0(k0, a, b, c) \
- ({ \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##0), (c.s0)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##1), (c.s1)); \
- })
-#elif N0 == 3 // N0 == 3
-#define ARM_DOT_K0XN0(k0, a, b, c) \
- ({ \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##0), (c.s0)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##1), (c.s1)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##2), (c.s2)); \
- })
-#elif N0 == 4 // N0 == 4
-#define ARM_DOT_K0XN0(k0, a, b, c) \
- ({ \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##0), (c.s0)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##1), (c.s1)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##2), (c.s2)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##3), (c.s3)); \
- })
-#elif N0 == 8 // N0 == 8
-#define ARM_DOT_K0XN0(k0, a, b, c) \
- ({ \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##0), (c.s0)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##1), (c.s1)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##2), (c.s2)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##3), (c.s3)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##4), (c.s4)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##5), (c.s5)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##6), (c.s6)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##7), (c.s7)); \
- })
-#elif N0 == 16 // N0 == 16
-#define ARM_DOT_K0XN0(k0, a, b, c) \
- ({ \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##0), (c.s0)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##1), (c.s1)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##2), (c.s2)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##3), (c.s3)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##4), (c.s4)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##5), (c.s5)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##6), (c.s6)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##7), (c.s7)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##8), (c.s8)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##9), (c.s9)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##A), (c.sA)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##B), (c.sB)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##C), (c.sC)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##D), (c.sD)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##E), (c.sE)); \
- CONCAT(ARM_DOT, k0) \
- ((a), (b##F), (c.sF)); \
- })
-#else // N0 not supported
-#error "N0 value not supported"
-#endif // N0 conditions
+#include "helpers.h"
+#include "helpers_asymm.h"
+#include "tile_helpers.h"
+//! @cond Doxygen_Suppress
/** OpenCL kernel to compute the direct convolution.
*
* @note Data layout supported: NHWC
@@ -403,6 +40,9 @@
* @note The spatial dimensions of the destination tensor must be passed at compile time using -DDST_WIDTH and -DDST_HEIGHT (e.g. -DDST_WIDTH=96, -DDST_HEIGHT=64)
* @note The channels of the source tensor must be passed at compile time using -DSRC_CHANNELS (e.g. -DSRC_CHANNELS=64)
 * @note The channels of the destination tensor must be passed at compile time using -DDST_CHANNELS (e.g. -DDST_CHANNELS=64)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the source tensor must be passed at compile time using -DSRC_TENSOR_TYPE (e.g. -DSRC_TENSOR_TYPE=BUFFER)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the weights tensor must be passed at compile time using -DWEI_TENSOR_TYPE (e.g. -DWEI_TENSOR_TYPE=BUFFER)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. -DDST_TENSOR_TYPE=BUFFER)
* @note The data type of the source tensor must be passed at compile time using -DSRC_DATA_TYPE (e.g. -DSRC_DATA_TYPE=float)
* @note The data type of the weights tensor must be passed at compile time using -DWEI_DATA_TYPE (e.g. -DWEI_DATA_TYPE=float)
* @note The data type of the destination tensor must be passed at compile time using -DDST_DATA_TYPE (e.g. -DDST_DATA_TYPE=float)
@@ -410,12 +50,12 @@
* @note The number of M0 rows (width*height) to process must be passed at compile time using -DM0 (e.g. -DM0=2)
* @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2)
* @note The number of K0 inner accumulations must be passed at compile time using -DK0 (e.g. -DK0=2)
- * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
- * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_N0 (e.g. -DPARTIAL_N0=1)
+ * @note The zero value must be passed at compile time using -DZERO_VALUE (e.g. -DZERO_VALUE=0)
* @note Only the following configurations of M0, N0 and K0 are currently supported:
- * - M0 = 1
+ * - M0 = 1, 2, 3, 4, 5, .... n
* - N0 = 2, 3, 4, 8, 16
- * - K0 = 2, 3, 4, 8, 16
+ * - K0 = 2, 3, 4, 8, 16 (only 4, 8 and 16 if WEI_TENSOR_TYPE=IMAGE)
*
*@note In case of QASYMM8/QASYMM8_SIGNED, the following extra information must be passed at compile time:
* - -DIS_QUANTIZED
@@ -426,13 +66,15 @@
* - The weights offset e.g. -DWEI_OFFSET=4
* - The quantized zero value e.g. -DZERO_VALUE=4
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data type: F16/F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data type: F16/F32/QASYMM8
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
* @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as @p src_ptr
* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
@@ -441,6 +83,8 @@
* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
* @param[in] wei_ptr Pointer to the weights tensor. Supported data type: same as @p src_ptr
* @param[in] wei_stride_x Stride of the weights tensor in X dimension (in bytes)
@@ -449,156 +93,184 @@
* @param[in] wei_step_y wei_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] wei_stride_z Stride of the weights tensor in Z dimension (in bytes)
* @param[in] wei_step_z wei_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] wei_stride_w Stride of the weights tensor in W dimension (in bytes)
+ * @param[in] wei_step_w wei_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] wei_offset_first_element_in_bytes The offset of the first element in the bias matrix
* @param[in] bia_ptr (Optional) Pointer to the bias tensor Supported data type: same as @p src_ptr (if F32/F16) or S32 (if QASYMM8/QASYMM8_SIGNED)
* @param[in] bia_stride_x (Optional) Stride of the bias tensor in X dimension (in bytes)
* @param[in] bia_step_x (Optional) bia_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
- * @param[in] wei_stride_w Stride of the weights tensor in W dimension (in bytes)
*/
+//! @endcond
__kernel void direct_convolution_nhwc(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- TENSOR3D_DECLARATION(wei),
+ TENSOR4D(src, SRC_TENSOR_TYPE),
+ TENSOR4D(dst, DST_TENSOR_TYPE),
+ TENSOR4D(wei, WEI_TENSOR_TYPE),
#if defined(HAS_BIAS)
- VECTOR_DECLARATION(bia),
+ VECTOR_DECLARATION(bia)
#endif // defined(HAS_BIAS)
- unsigned int wei_stride_w)
+)
{
-#if M0 != 1
-#error "M0: Only supported 1"
-#endif // M0 != 1
-
- const int cout = max((int)(get_global_id(0) * N0 - (N0 - PARTIAL_STORE_N0) % N0), 0); // input channels
- const int mout = get_global_id(1); // width x height
- const int zout = get_global_id(2); // batch size index
+ // All the tensor dimensions are passed at compile time.
+ // In case of dynamic tensor support, the following dimensions should be passed as function argument.
+#define _IWEI_WIDTH WEI_WIDTH
+#define _IWEI_HEIGHT WEI_HEIGHT
+#define _ISRC_WIDTH SRC_WIDTH
+#define _ISRC_HEIGHT SRC_HEIGHT
+#define _ISRC_CHANNELS SRC_CHANNELS
+#define _IDST_WIDTH DST_WIDTH
+#define _IDST_HEIGHT DST_HEIGHT
+#define _IDST_CHANNELS DST_CHANNELS
+
+ // If quantized, the output tile has to be quantized first before being stored to global memory
+#if defined(IS_QUANTIZED)
+#define _IOUTPUT_TILE cq
+#else // defined(IS_QUANTIZED)
+#define _IOUTPUT_TILE c
+#endif // defined(IS_QUANTIZED)
- REPEAT_VAR_INIT_TO_CONST(16, int, zero, 0);
- REPEAT_VAR_INIT_TO_CONST(M0, int, xi, 0);
- REPEAT_VAR_INIT_TO_CONST(M0, int, yi, 0);
+ const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM
+ const int mout = GET_SPATIAL_IDX(1, M0, 0); // WIDTH x HEIGHT
+ const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
-#define LINEAR_2_COORDS(i) \
- xi##i = ((mout * M0 + i) % DST_WIDTH) * STRIDE_X; \
- yi##i = ((mout * M0 + i) / DST_WIDTH) * STRIDE_Y; \
- xi##i -= PAD_LEFT; \
- yi##i -= PAD_TOP;
+ // .v = access the whole vector (OpenCL vector)
+ // .s[x] = access the vector element at position x (scalar access)
+ TILE(int, M0, 1, xi) = { { 0 } };
+ TILE(int, M0, 1, yi) = { { 0 } };
// Convert the linear index to coordinate
- LINEAR_2_COORDS(0);
-
-#undef LINEAR_2_COORDS
+ LOOP_UNROLLING(int, i, 0, M0, 1)
+ {
+ xi[i].v = ((mout + i) % _IDST_WIDTH) * STRIDE_X;
+ yi[i].v = ((mout + i) / _IDST_WIDTH) * STRIDE_Y;
+ xi[i].v -= PAD_LEFT;
+ yi[i].v -= PAD_TOP;
+ }
- uint src_offset = src_offset_first_element_in_bytes + zout * src_stride_y * (SRC_WIDTH * SRC_HEIGHT);
- uint wei_offset = wei_offset_first_element_in_bytes + cout * wei_stride_w;
+ uint wei_x = 0;
// Initialize the accumulators
- REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c, 0);
+ TILE(ACC_DATA_TYPE, M0, N0, c) = { { 0 } };
- for(int i = 0; i < (WEI_WIDTH * WEI_HEIGHT); ++i)
+ for(int i = 0; i < (_IWEI_WIDTH * _IWEI_HEIGHT); ++i)
{
- int xk = i % WEI_WIDTH;
- int yk = i / WEI_WIDTH;
-
- REPEAT_VAR_INIT_TO_CONST(M0, int, mi_valid_row, 0);
- REPEAT_VAR_INIT_TO_CONST(M0, int, mi_mask, 0);
+ uint src_x = 0;
+ int xk = i % _IWEI_WIDTH;
+ int yk = i / _IWEI_WIDTH;
- // Calculate the input row to read from source tensor
-#define MI_INIT(i) \
- mi_valid_row##i = max(min(xi##i + xk, SRC_WIDTH - 1), 0) + max(min(yi##i + yk, SRC_HEIGHT - 1), 0) * SRC_WIDTH; \
- mi_mask##i = (xi##i + xk) >= 0 && (xi##i + xk) < SRC_WIDTH && (yi##i + yk) >= 0 && (yi##i + yk) < SRC_HEIGHT;
+ TILE(int, M0, 1, src_indirect_y) = { { 0 } };
+ TILE(int, M0, 1, src_indirect_mask) = { { 0 } };
- MI_INIT(0);
-
-#undef MI_INIT
+ // Calculate the source indirect Y and the source indirect mask
+ // Since the indirect Y is clamped when out-of-bound, the mask is used to
+ // force to zero the out-of-bound values
+ LOOP_UNROLLING(int, i, 0, M0, 1)
+ {
+ src_indirect_y[i].v = (CLAMP(xi[i].v + xk, 0, (int)_ISRC_WIDTH - 1) + CLAMP(yi[i].v + yk, 0, (int)_ISRC_HEIGHT - 1) * _ISRC_WIDTH);
+ src_indirect_y[i].v += bout * (int)_ISRC_WIDTH * (int)_ISRC_HEIGHT;
+ src_indirect_mask[i].v = ((xi[i].v + xk) >= 0 && (xi[i].v + xk) < (int)_ISRC_WIDTH && (yi[i].v + yk) >= 0 && (yi[i].v + yk) < (int)_ISRC_HEIGHT);
+ }
int k = 0;
- for(; k <= (SRC_CHANNELS - K0); k += K0)
+ for(; k <= (_ISRC_CHANNELS - K0); k += K0)
{
- // Load values from src tensor
- LOAD_BLOCK_INDIRECT(M0, K0, SRC_DATA_TYPE, a, src_ptr, src_offset + k * sizeof(SRC_DATA_TYPE), src_stride_y, mi_valid_row, mi_mask);
+ TILE(SRC_DATA_TYPE, M0, K0, a);
+ TILE(WEI_DATA_TYPE, N0, K0, b);
- // Load values from weights tensor
- LOAD_BLOCK(N0, K0, WEI_DATA_TYPE, b, wei_ptr, wei_offset, wei_stride_w, zero);
+ // Load tile from the src tensor
+ T_LOAD_INDIRECT(SRC_DATA_TYPE, M0, K0, SRC_TENSOR_TYPE, src, src_x, src_stride_y, src_indirect_y, a);
-#if defined(IS_QUANTIZED)
-#define TENSOR_DOT(K0, i) \
- if(mi_mask##i != 0) \
- { \
- ARM_DOT_K0XN0(K0, a##i, b, c##i); \
- ARM_OFFSET_K0XN0(K0, a##i, b, SRC_OFFSET, WEI_OFFSET, c##i); \
- } \
- else \
- { \
- ARM_DOT_K0XN0(K0, ((VEC_DATA_TYPE(SRC_DATA_TYPE, K0))ZERO_VALUE), b, c##i); \
- ARM_OFFSET_K0XN0(K0, ((VEC_DATA_TYPE(SRC_DATA_TYPE, K0))ZERO_VALUE), b, SRC_OFFSET, WEI_OFFSET, c##i); \
- }
-#else // defined(IS_QUANTIZED)
-#define TENSOR_DOT(K0, i) \
- ARM_DOT_K0XN0(K0, a##i, b, c##i);
-#endif // defined(IS_QUANTIZED)
+ // Load tile from the weights tensor
+ T_LOAD(WEI_DATA_TYPE, N0, K0, WEI_TENSOR_TYPE, wei, wei_x, cout, wei_stride_w, b);
- TENSOR_DOT(K0, 0);
+ // Fill with zero the out-of-bound rows
+ T_ROWSET_MASK(SRC_DATA_TYPE, M0, K0, ZERO_VALUE, a, src_indirect_mask);
- wei_offset += K0 * sizeof(WEI_DATA_TYPE);
+ // Compute the matrix multiplication between two tiles
+ T_MMUL(SRC_DATA_TYPE, WEI_DATA_TYPE, ACC_DATA_TYPE, M0, N0, K0, NT, T, a, b, c);
+
+ // Apply the offset correction (correction usually needed for asymmetric quantized computation)
+ // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero
+ T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, K0, SRC_OFFSET, WEI_OFFSET, a, b, c);
+
+ src_x += K0;
+ wei_x += K0;
}
-#if(SRC_CHANNELS % K0) != 0
+ // We voluntarily use SRC_CHANNELS rather than _ISRC_CHANNELS
+ // This #if directive should be removed in case of dynamic tensor support
+#if((SRC_CHANNELS % K0) != 0)
// Left-over accumulations
- for(; k < SRC_CHANNELS; ++k)
+ for(; k < _ISRC_CHANNELS; ++k)
{
- // Load values from src tensor
- LOAD_BLOCK_INDIRECT(M0, 1, SRC_DATA_TYPE, a, src_ptr, src_offset + k * sizeof(SRC_DATA_TYPE), src_stride_y, mi_valid_row, mi_mask);
+ TILE(SRC_DATA_TYPE, M0, 1, a);
+ TILE(WEI_DATA_TYPE, N0, 1, b);
- // Load values from weights tensor
- LOAD_BLOCK(N0, 1, WEI_DATA_TYPE, b, wei_ptr, wei_offset, wei_stride_w, zero);
+ // Load tile from the src tensor
+ T_LOAD_INDIRECT(SRC_DATA_TYPE, M0, 1, SRC_TENSOR_TYPE, src, src_x, src_stride_y, src_indirect_y, a);
- TENSOR_DOT(1, 0);
+ // Load tile from the weights tensor
+ T_LOAD(WEI_DATA_TYPE, N0, 1, WEI_TENSOR_TYPE, wei, wei_x, cout, wei_stride_w, b);
-#undef TENSOR_DOT
+ // Fill with zero the out-of-bound rows
+ T_ROWSET_MASK(SRC_DATA_TYPE, M0, 1, ZERO_VALUE, a, src_indirect_mask);
- wei_offset += sizeof(WEI_DATA_TYPE);
- }
-#endif // (SRC_CHANNELS % K0) != 0
+ // Compute the matrix multiplication between two tiles
+ T_MMUL(SRC_DATA_TYPE, WEI_DATA_TYPE, ACC_DATA_TYPE, M0, N0, 1, NT, T, a, b, c);
- c0 += (SRC_CHANNELS * SRC_OFFSET * WEI_OFFSET);
- }
+ // Apply the offset correction (operation usually needed for asymmetric quantized computation)
+ // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero
+ T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, 1, SRC_OFFSET, WEI_OFFSET, a, b, c);
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (cout * sizeof(DST_DATA_TYPE)) + (mout * M0 * dst_stride_y);
+ ++src_x;
+ ++wei_x;
+ }
+#endif // ((SRC_CHANNELS % K0) != 0)
+ }
- // Batched direct convolution
- dst_addr += zout * dst_stride_y * (DST_WIDTH * DST_HEIGHT);
+ // Offset correction required for the quantized asymmetric computation
+ // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero
+ T_ADD_CONSTANT(ACC_DATA_TYPE, M0, N0, c, (_IWEI_WIDTH * _IWEI_HEIGHT * _ISRC_CHANNELS * SRC_OFFSET * WEI_OFFSET), c);
#if defined(HAS_BIAS)
- __global uchar *bias_addr = bia_ptr + bia_offset_first_element_in_bytes + (cout * sizeof(BIA_DATA_TYPE));
+ TILE(BIA_DATA_TYPE, 1, N0, bias0);
- LOAD_BLOCK(1, N0, BIA_DATA_TYPE, bias, bias_addr, 0, zero0, zero);
+ T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, cout, 0, 0, bias0);
// c = c + bias[broadcasted]
- ADD_BLOCK_BROADCAST(M0, c, bias0);
+ T_ADD_BROADCAST_X(ACC_DATA_TYPE, M0, N0, c, bias0, c);
+
#endif // HAS_BIAS
-#if defined(IS_QUANTIZED)
+ TILE(uint, M0, 1, dst_indirect_y);
- REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DST_DATA_TYPE, N0), cq, 0);
+ // Calculate the destination indirect Y
+ LOOP_UNROLLING(int, i, 0, M0, 1)
+ {
+ dst_indirect_y[i].v = (uint)min(mout + i, (int)(_IDST_WIDTH * _IDST_HEIGHT) - 1);
+ dst_indirect_y[i].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT);
+ }
-#if DST_SHIFT < 0
-#define QUANTIZE(i) \
- c##i = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(c##i, DST_MULTIPLIER, DST_SHIFT, N0); \
- c##i = c##i + DST_OFFSET; \
- cq##i = CONVERT_SAT(c##i, VEC_DATA_TYPE(DST_DATA_TYPE, N0));
-#else // OUTPUT_SHIFT < 0
-#define QUANTIZE(i) \
- c##i = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(c##i, DST_MULTIPLIER, DST_SHIFT, N0); \
- c##i = c##i + DST_OFFSET; \
- cq##i = CONVERT_SAT(c##i, VEC_DATA_TYPE(DST_DATA_TYPE, N0));
-#endif // OUTPUT_SHIFT < 0
+ bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
- QUANTIZE(0);
+#if defined(IS_QUANTIZED)
-#undef QUANTIZE
+ TILE(DST_DATA_TYPE, M0, N0, cq);
- STORE_VECTOR_SELECT(cq, DST_DATA_TYPE, dst_addr, N0, PARTIAL_STORE_N0, PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0);
-#else // defined(IS_QUANTIZED)
- STORE_VECTOR_SELECT(c, DST_DATA_TYPE, dst_addr, N0, PARTIAL_STORE_N0, PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0);
+ // Quantize the tile
+ T_QUANTIZE8_ASYMMETRIC(ACC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, c, cq);
#endif // defined(IS_QUANTIZED)
+
+ // _IOUTPUT_TILE: c = fp32/fp16, cq=qasymm8
+ // Store the tile in reverse order so the invalid values are overwritten with the valid ones
+ T_STORE_INDIRECT_WIDTH_SELECT(DST_DATA_TYPE, M0, N0, PARTIAL_N0, DST_TENSOR_TYPE, dst, cout, dst_stride_y, x_cond, _IOUTPUT_TILE, dst_indirect_y);
+
+#undef _IWEI_WIDTH
+#undef _IWEI_HEIGHT
+#undef _ISRC_WIDTH
+#undef _ISRC_HEIGHT
+#undef _ISRC_CHANNELS
+#undef _IDST_WIDTH
+#undef _IDST_HEIGHT
+#undef _IDST_CHANNELS
}
\ No newline at end of file
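For orientation, the compile-time interface documented in the kernel notes above could be exercised with a set of build options along these lines (illustrative values only, not taken from this patch):

-DSRC_TENSOR_TYPE=BUFFER -DWEI_TENSOR_TYPE=BUFFER -DDST_TENSOR_TYPE=BUFFER
-DSRC_DATA_TYPE=float -DWEI_DATA_TYPE=float -DDST_DATA_TYPE=float -DACC_DATA_TYPE=float
-DSRC_WIDTH=32 -DSRC_HEIGHT=32 -DSRC_CHANNELS=64 -DDST_WIDTH=32 -DDST_HEIGHT=32 -DDST_CHANNELS=64
-DWEI_WIDTH=3 -DWEI_HEIGHT=3 -DSTRIDE_X=1 -DSTRIDE_Y=1 -DPAD_LEFT=1 -DPAD_TOP=1
-DM0=2 -DN0=4 -DK0=8 -DPARTIAL_N0=0 -DZERO_VALUE=0 -DSRC_OFFSET=0 -DWEI_OFFSET=0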
diff --git a/src/core/CL/cl_kernels/tile_helpers.h b/src/core/CL/cl_kernels/tile_helpers.h
new file mode 100644
index 0000000000..19241cf219
--- /dev/null
+++ b/src/core/CL/cl_kernels/tile_helpers.h
@@ -0,0 +1,420 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/** Tile object
+ * A tile object is a 2D memory block and can be accessed using the following syntax:
+ * -# a[m0].v = access the vector at row "m0" (OpenCL vector)
+ * -# a[m0].s[x] = access the scalar element at row "m0" and column "x" (scalar access)
+ *
+ * @param[in] DATA_TYPE Data type of the tile
+ * @param[in] H Number of tile rows
+ * @param[in] W Number of tile columns
+ * @param[in] BASENAME Tile's name
+ */
+#define TILE(DATA_TYPE, H, W, BASENAME) TILE_STR(DATA_TYPE, H, W, BASENAME)
+#define TILE_STR(DATA_TYPE, H, W, BASENAME) \
+ union { \
+ DATA_TYPE s[W]; \
+ DATA_TYPE##W v; \
+ } BASENAME[H]
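A minimal usage sketch of the tile object (illustrative, not part of the patch):

TILE(float, 2, 4, acc);        // expands to: union { float s[4]; float4 v; } acc[2];
acc[0].v    = (float4)0.0f;    // vector access to the whole row 0
acc[1].s[3] = 1.0f;            // scalar access to row 1, column 3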
+
+#define TENSOR4D_IMAGE(name) \
+ __read_only image2d_t name##_img, \
+ __global uchar *name##_ptr, \
+ uint name##_stride_x, \
+ uint name##_step_x, \
+ uint name##_stride_y, \
+ uint name##_step_y, \
+ uint name##_stride_z, \
+ uint name##_step_z, \
+ uint name##_stride_w, \
+ uint name##_step_w, \
+ uint name##_offset_first_element_in_bytes
+
+#define TENSOR4D_BUFFER(name) \
+ __global uchar *name##_ptr, \
+ uint name##_stride_x, \
+ uint name##_step_x, \
+ uint name##_stride_y, \
+ uint name##_step_y, \
+ uint name##_stride_z, \
+ uint name##_step_z, \
+ uint name##_stride_w, \
+ uint name##_step_w, \
+ uint name##_offset_first_element_in_bytes
+
+#define TENSOR4D_STR(name, type) TENSOR4D_##type(name)
+#define TENSOR4D(name, type) TENSOR4D_STR(name, type)
+
+/** Loop unrolling */
+#define LOOP_UNROLLING(DATA_TYPE, VAR, START_IDX, NUM_ITERATIONS, STEP) \
+ _Pragma("unroll") for(DATA_TYPE VAR = START_IDX; VAR < NUM_ITERATIONS; VAR += STEP)
+
+/** Get the get_global_id with partial N0. This function is useful when the dimension is not multiple of N0 and we need to use a partial N0
+ * to avoid out-of-bound read/write
+ *
+ * @note PARTIAL_N0 is used for get_global_id(IDX) = 0.
+ *
+ * @param[in] IDX get_global_id index (0,1 and 2 only)
+ * @param[in] N0 Number of elements read/written on the IDX direction
+ * @param[in] PARTIAL_N0 Number of elements read/written on the IDX direction for get_global_id(IDX) = 0. If zero,
+ * the Number of elements read/written on the IDX direction for get_global_id(IDX) = 0 is N0
+ */
+#define GET_SPATIAL_IDX(IDX, N0, PARTIAL_N0) (max((int)(get_global_id(IDX) * N0 - (N0 - PARTIAL_N0) % N0), 0))
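A worked example of the macro above, with illustrative values N0 = 4, PARTIAL_N0 = 2 and a 10-element dimension:

// (N0 - PARTIAL_N0) % N0 = 2, so GET_SPATIAL_IDX(0, 4, 2) evaluates to:
//   get_global_id(0) = 0 -> max(0 * 4 - 2, 0) = 0   first block, only PARTIAL_N0 = 2 elements are stored
//   get_global_id(0) = 1 -> max(1 * 4 - 2, 0) = 2   full N0 = 4 elements (2..5)
//   get_global_id(0) = 2 -> max(2 * 4 - 2, 0) = 6   full N0 = 4 elements (6..9)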
+
+/** Offset (in bytes) calculation for a 1D BUFFER (cl_buffer) tensor */
+#define OFFSET1D(base, data_type, x) (base##_offset_first_element_in_bytes + x * sizeof(data_type))
+
+/** Offset (in bytes) calculation for a 2D BUFFER (cl_buffer) tensor */
+#define OFFSET2D(base, data_type, x, y) (base##_offset_first_element_in_bytes + x * sizeof(data_type) + y * base##_stride_y)
+
+/** Offset (in bytes) calculation for a 3D BUFFER (cl_buffer) tensor */
+#define OFFSET3D(base, data_type, x, y, z) (base##_offset_first_element_in_bytes + x * sizeof(data_type) + y * base##_stride_y + z * base##_stride_z)
+
+/** Offset (in bytes) calculation for a 4D BUFFER (cl_buffer) tensor */
+#define OFFSET4D(base, data_type, x, y, z, w) (base##_offset_first_element_in_bytes + x * sizeof(data_type) + y * base##_stride_y + z * base##_stride_z + w * base##_stride_w)
+
+/** Dot product integer 8-bit function
+ *
+ * @note Performs: c += dot(a, b)
+ *
+ * @param[in] DST_DATA_TYPE Accumulator data type
+ * @param[in] K0 Number of accumulations
+ * @param[in] a OpenCL vector a
+ * @param[in] b OpenCL vector b
+ * @param[in] c Scalar variable c
+ */
+#define DOT_PRODUCT_INTEGER8(DST_DATA_TYPE, K0, a, b, c) DOT_PRODUCT_INTEGER8_STR(DST_DATA_TYPE, K0, a, b, c)
+#define DOT_PRODUCT_INTEGER8_STR(DST_DATA_TYPE, K0, a, b, c) DOT_PRODUCT##K0##_INTEGER8(DST_DATA_TYPE, a, b, c)
+#define DOT_PRODUCT1_INTEGER8(DST_DATA_TYPE, a, b, c) \
+ ({ \
+ c += (DST_DATA_TYPE)a * (DST_DATA_TYPE)b; \
+ })
+#define DOT_PRODUCT2_INTEGER8(DST_DATA_TYPE, a, b, c) \
+ ({ \
+ c += (DST_DATA_TYPE)a.s0 * (DST_DATA_TYPE)b.s0; \
+ c += (DST_DATA_TYPE)a.s1 * (DST_DATA_TYPE)b.s1; \
+ })
+#define DOT_PRODUCT3_INTEGER8(DST_DATA_TYPE, a, b, c) \
+ ({ \
+ DOT_PRODUCT2_INTEGER8(DST_DATA_TYPE, a, b, c); \
+ c += (DST_DATA_TYPE)a.s2 * (DST_DATA_TYPE)b.s2; \
+ })
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#define DOT_PRODUCT4_INTEGER8(DST_DATA_TYPE, x, y, val) val = arm_dot_acc((x), (y), (val));
+#elif defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#define DOT_PRODUCT4_INTEGER8(DST_DATA_TYPE, x, y, val) val += arm_dot((x), (y));
+#else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#define DOT_PRODUCT4_INTEGER8(DST_DATA_TYPE, x, y, val) \
+ ({ \
+ val += (DST_DATA_TYPE)x.s0 * (DST_DATA_TYPE)y.s0; \
+ val += (DST_DATA_TYPE)x.s1 * (DST_DATA_TYPE)y.s1; \
+ val += (DST_DATA_TYPE)x.s2 * (DST_DATA_TYPE)y.s2; \
+ val += (DST_DATA_TYPE)x.s3 * (DST_DATA_TYPE)y.s3; \
+ })
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#define DOT_PRODUCT8_INTEGER8(DST_DATA_TYPE, a, b, c) \
+ ({ \
+ DOT_PRODUCT4_INTEGER8(DST_DATA_TYPE, (a.lo), (b.lo), c); \
+ DOT_PRODUCT4_INTEGER8(DST_DATA_TYPE, (a.hi), (b.hi), c); \
+ })
+#define DOT_PRODUCT16_INTEGER8(DST_DATA_TYPE, a, b, c) \
+ ({ \
+ DOT_PRODUCT8_INTEGER8(DST_DATA_TYPE, (a.lo), (b.lo), c); \
+ DOT_PRODUCT8_INTEGER8(DST_DATA_TYPE, (a.hi), (b.hi), c); \
+ })
+
+/** Load a vector from global memory (tensor)
+ *
+ * @param[in] DATA_TYPE Data type
+ * @param[in] WIDTH Number of dst columns
+ * @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image).
+ * In case of cl_image, only WIDTH multiples of 4 are supported (4, 8, 16)
+ * @param[in] TENSOR Tensor basename
+ * @param[in] X Starting X position
+ * @param[in] Y Starting Y position
+ * @param[in] STRIDE_Y Stride Y (in bytes)
+ */
+#define V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y) V_LOAD_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y)
+#define V_LOAD_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y) V_LOAD_##TENSOR_TYPE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y)
+#define V_LOAD_BUFFER(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y) \
+ VLOAD(WIDTH) \
+ (0, (__global DATA_TYPE *)(TENSOR##_ptr + (X) * sizeof(DATA_TYPE) + (Y)*STRIDE_Y))
+#define V_LOAD_IMAGE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y) READ_IMAGE2D(DATA_TYPE, CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(WIDTH), TENSOR##_img, (X) / 4, (Y))
+
+/** Load a tile from global memory (tensor)
+ *
+ * @param[in] DATA_TYPE Data type
+ * @param[in] HEIGHT Number of dst rows
+ * @param[in] WIDTH Number of dst columns
+ * @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image).
+ * In case of cl_image, only WIDTH multiples of 4 are supported (4, 8, 16)
+ * @param[in] TENSOR Tensor basename
+ * @param[in] X Starting X position
+ * @param[in] Y Starting Y position
+ * @param[in] STRIDE_Y Stride Y (in bytes)
+ * @param[out] dst Output tile
+ */
+#define T_LOAD(DATA_TYPE, HEIGHT, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, dst) \
+ ({ \
+ LOOP_UNROLLING(int, _i, 0, HEIGHT, 1) \
+ { \
+ dst[_i].v = V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, ((Y) + _i), STRIDE_Y); \
+ } \
+ })
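A usage sketch of T_LOAD (hypothetical x0/y0 coordinates; assumes the tensor was passed as TENSOR4D(src, BUFFER)):

TILE(float, 4, 8, in);                                       // 4 rows x 8 columns destination tile
T_LOAD(float, 4, 8, BUFFER, src, x0, y0, src_stride_y, in);  // row _i now holds the 8 floats at (x0, y0 + _i)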
+
+/** Load a tile from global memory (tensor) using an indirect Y index tile
+ *
+ * @param[in] DATA_TYPE Data type
+ * @param[in] HEIGHT Number of dst rows
+ * @param[in] WIDTH Number of dst columns
+ * @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). Currently BUFFER only is supported
+ * In case of cl_image, only WIDTH multiples of 4 are supported (4, 8, 16)
+ * @param[in] TENSOR Tensor basename
+ * @param[in] X Starting X position
+ * @param[in] STRIDE_Y Stride Y (in bytes)
+ * @param[in] indirect_y Indirect Y index tile
+ * @param[out] dst Output tile
+ */
+#define T_LOAD_INDIRECT(DATA_TYPE, HEIGHT, WIDTH, TENSOR_TYPE, TENSOR, X, STRIDE_Y, indirect_y, dst) \
+ ({ \
+ LOOP_UNROLLING(int, _i, 0, HEIGHT, 1) \
+ { \
+ dst[_i].v = V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, (indirect_y[_i].v), STRIDE_Y); \
+ } \
+ })
+
+/** Store a tile to global memory (tensor) using an indirect Y index tile and conditionally use a different length for the store
+ *
+ * @note If WIDTH1_CONDITION is true, the store will use the WIDTH1 length for the store
+ * @note The vectors are stored in reverse order so the invalid rows are overwritten by the valid ones
+ *
+ * @param[in] DATA_TYPE Data type
+ * @param[in] HEIGHT Number of src rows
+ * @param[in] WIDTH0 Store width to use if WIDTH1_CONDITION = false
+ * @param[in] WIDTH1 Store width to use if WIDTH1_CONDITION = true
+ * @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). Currently BUFFER only is supported
+ * cl_image is not supported.
+ * @param[in] TENSOR Tensor basename
+ * @param[in] X Starting X position
+ * @param[in] STRIDE_Y Stride Y (in bytes)
+ * @param[in] WIDTH1_CONDITION Condition to select the WIDTH1 store
+ * @param[in] src Input tile
+ * @param[in] indirect_y Indirect Y index tile
+ */
+#define T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, HEIGHT, WIDTH0, WIDTH1, TENSOR_TYPE, TENSOR, X, STRIDE_Y, WIDTH1_CONDITION, src, indirect_y) \
+ ({ \
+ if(WIDTH1_CONDITION) \
+ { \
+ LOOP_UNROLLING(int, _i, 0, HEIGHT, 1) \
+ { \
+ VSTORE_PARTIAL(WIDTH0, WIDTH1) \
+ (src[HEIGHT - 1 - _i].v, 0, (__global DATA_TYPE *)(TENSOR##_ptr + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * STRIDE_Y)); \
+ } \
+ } \
+ else \
+ { \
+ LOOP_UNROLLING(int, _i, 0, HEIGHT, 1) \
+ { \
+ VSTORE(WIDTH0) \
+ (src[HEIGHT - 1 - _i].v, 0, (__global DATA_TYPE *)(TENSOR##_ptr + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * STRIDE_Y)); \
+ } \
+ } \
+ })
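Why the reverse-order store matters (illustrative scenario):

// With M0 = 4 at the tail of the collapsed W*H dimension, the last rows of the indirect Y tile
// are clamped onto the same destination row as the last valid one; storing rows 3, 2, 1, 0 in
// that order means the valid row is written last, so its values are the ones left in memory.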
+
+/** Offset correction for the QASYMM8 computation
+ *
+ * @param[in] ACC_DATA_TYPE Accumulator data type
+ * @param[in] M0 Number of src/dst rows
+ * @param[in] N0 Number of src/dst columns
+ * @param[in] K0 Number of src columns
+ * @param[in] SRC_OFFSET Source quantization offset
+ * @param[in] WEI_OFFSET Weights quantization offset
+ * @param[in] lhs LHS tile
+ * @param[in] rhs RHS tile
+ * @param[out] dst DST tile
+ */
+#define T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, K0, SRC_OFFSET, WEI_OFFSET, lhs, rhs, dst) \
+ ({ \
+ LOOP_UNROLLING(int, _m0, 0, M0, 1) \
+ { \
+ ACC_DATA_TYPE _tm = 0; \
+ LOOP_UNROLLING(int, _k0, 0, K0, 1) \
+ { \
+ _tm += ((ACC_DATA_TYPE)lhs[_m0].s[_k0] * (ACC_DATA_TYPE)WEI_OFFSET); \
+ } \
+ LOOP_UNROLLING(int, _n0, 0, N0, 1) \
+ { \
+ dst[_m0].s[_n0] += _tm; \
+ LOOP_UNROLLING(int, _k0, 0, K0, 1) \
+ { \
+ dst[_m0].s[_n0] += ((ACC_DATA_TYPE)rhs[_n0].s[_k0] * (ACC_DATA_TYPE)SRC_OFFSET); \
+ } \
+ } \
+ } \
+ })
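The algebra behind the correction, written out for one output element over K accumulations:

//   sum_k (a_k + SRC_OFFSET) * (b_k + WEI_OFFSET)
//     =   sum_k a_k * b_k               -> computed by T_MMUL
//       + WEI_OFFSET * sum_k a_k        -> the per-row term _tm above
//       + SRC_OFFSET * sum_k b_k        -> the per-column term added above
//       + K * SRC_OFFSET * WEI_OFFSET   -> constant term, added once by the caller (see T_ADD_CONSTANT in direct_convolution.cl)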
+
+/** Quantize the tile (ASYMMETRIC) with fixed-point scale
+ *
+ * @param[in] SRC_DATA_TYPE SRC data type
+ * @param[in] DST_DATA_TYPE DST data type
+ * @param[in] M0 Number of src/dst rows
+ * @param[in] N0 Number of src/dst columns
+ * @param[in] DST_OFFSET Quantization offset
+ * @param[in] DST_SHIFT Quantization shift
+ * @param[in] DST_MULTIPLIER Quantization multiplier
+ * @param[in] src Input tile
+ * @param[out] dst Output tile
+ */
+#define T_QUANTIZE8_ASYMMETRIC(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst) \
+ ({ \
+ LOOP_UNROLLING(int, _m0, 0, M0, 1) \
+ { \
+ LOOP_UNROLLING(int, _n0, 0, N0, 1) \
+ { \
+ SRC_DATA_TYPE _tmp = 0; \
+ if(DST_SHIFT < 0) \
+ { \
+ _tmp = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(src[_m0].s[_n0], DST_MULTIPLIER, DST_SHIFT, 1); \
+ } \
+ else \
+ { \
+ _tmp = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(src[_m0].s[_n0], DST_MULTIPLIER, DST_SHIFT, 1); \
+ } \
+ _tmp += DST_OFFSET; \
+ dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE); \
+ } \
+ } \
+ })
+
+/** Conditional rowset (memset by row)
+ *
+ * @note Set the row to VALUE_TO_SET if the corresponding mask == 0
+ *
+ * @param[in] DATA_TYPE Data type
+ * @param[in] M0 Number of LHS rows
+ * @param[in] N0 Number of LHS columns
+ * @param[in] VALUE_TO_SET Value to set the row
+ * @param[in, out] a Input/output tile
+ * @param[out] mask Mask to check for setting the row to VALUE_TO_SET
+ */
+#define T_ROWSET_MASK(DATA_TYPE, M0, N0, VALUE_TO_SET, a, mask) \
+ ({ \
+ LOOP_UNROLLING(int, _m0, 0, M0, 1) \
+ { \
+ LOOP_UNROLLING(int, _n0, 0, N0, 1) \
+ { \
+ a[_m0].s[_n0] = select((DATA_TYPE)(a[_m0].s[_n0]), (DATA_TYPE)(VALUE_TO_SET), (SELECT_DATA_TYPE(DATA_TYPE))(mask[_m0].v == (DATA_TYPE)0)); \
+ } \
+ } \
+ })
+
+/** Element-wise addition with a constant value
+ *
+ * @note Performs: LHS + constant = DST
+ *
+ * @param[in] DATA_TYPE LHS/RHS/DST data type
+ * @param[in] M0 Number of LHS rows
+ * @param[in] N0 Number of LHS columns
+ * @param[in] lhs LHS tile
+ * @param[in] rhs_constant Constant value
+ * @param[out] dst DST tile
+ */
+#define T_ADD_CONSTANT(DATA_TYPE, M0, N0, lhs, rhs_constant, dst) \
+ ({ \
+ LOOP_UNROLLING(int, _m0, 0, M0, 1) \
+ { \
+ LOOP_UNROLLING(int, _n0, 0, N0, 1) \
+ { \
+ dst[_m0].s[_n0] = lhs[_m0].s[_n0] + rhs_constant; \
+ } \
+ } \
+ })
+
+/** Element-wise addition with RHS broadcasted (RHS has the X dimension only)
+ *
+ * @note Performs: LHS + RHS[broadcasted] = DST
+ * @note Both tiles must have same data type
+ *
+ * @param[in] DATA_TYPE LHS/RHS/DST data type
+ * @param[in] M0 Number of LHS rows
+ * @param[in] N0 Number of LHS columns
+ * @param[in] lhs LHS tile
+ * @param[in] rhs RHS tile
+ * @param[out] dst DST tile
+ */
+#define T_ADD_BROADCAST_X(DATA_TYPE, M0, N0, lhs, rhs, dst) \
+ ({ \
+ LOOP_UNROLLING(int, _m0, 0, M0, 1) \
+ { \
+ dst[_m0].v = lhs[_m0].v + rhs[0].v; \
+ } \
+ })
+
+/** Matrix multiplication
+ *
+ * @note Performs: LHS X RHS + DST = DST
+ *
+ * @param[in] LHS_DATA_TYPE LHS tile data type
+ * @param[in] RHS_DATA_TYPE RHS tile data type
+ * @param[in] DST_DATA_TYPE DST tile data type
+ * @param[in] M0 Number of LHS rows
+ * @param[in] N0 Number of RHS columns
+ * @param[in] K0 Number of LHS columns
+ * @param[in] LHS_LAYOUT LHS layout (T= transposed, NT= not transposed)
+ * @param[in] RHS_LAYOUT RHS layout (T= transposed, NT= not transposed)
+ * @param[in] lhs LHS tile
+ * @param[in] rhs RHS tile
+ * @param[in, out] dst DST tile
+ */
+#define T_MMUL(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, LHS_LAYOUT, RHS_LAYOUT, lhs, rhs, dst) T_MMUL_##LHS_LAYOUT##_##RHS_LAYOUT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
+#define T_MMUL_NT_T(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_##LHS_DATA_TYPE##_##RHS_DATA_TYPE##_##DST_DATA_TYPE(DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
+#define T_MMUL_NT_T_float_float_float(DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
+#define T_MMUL_NT_T_half_half_half(DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
+#define T_MMUL_NT_T_char_char_int(DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
+#define T_MMUL_NT_T_uchar_uchar_uint(DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
+#define T_MMUL_NT_T_uchar_uchar_int(DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
+#define T_MMUL_NT_T_FLOAT(DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) \
+ { \
+ LOOP_UNROLLING(int, _m, 0, M0, 1) \
+ { \
+ LOOP_UNROLLING(int, _n, 0, N0, 1) \
+ { \
+ LOOP_UNROLLING(int, _k, 0, K0, 1) \
+ { \
+ dst[_m].s[_n] = fma((lhs[_m].s[_k]), (rhs[_n].s[_k]), dst[_m].s[_n]); \
+ } \
+ } \
+ } \
+ }
+#define T_MMUL_NT_T_INTEGER8(DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) \
+ ({ \
+ LOOP_UNROLLING(int, _m, 0, M0, 1) \
+ { \
+ LOOP_UNROLLING(int, _n, 0, N0, 1) \
+ { \
+ DOT_PRODUCT_INTEGER8(DST_DATA_TYPE, K0, (lhs[_m].v), (rhs[_n].v), dst[_m].s[_n]); \
+ } \
+ } \
+ })
\ No newline at end of file
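Taken together, the helpers are meant to compose into short tile-based kernel bodies. A minimal accumulation sketch (assumed tensors lhs/rhs passed as TENSOR4D(..., BUFFER), hypothetical k/m/n coordinates and strides):

TILE(float, 2, 4, c) = { { 0 } };                       // M0 = 2 x N0 = 4 accumulator tile
TILE(float, 2, 8, a);                                   // M0 x K0 LHS tile
TILE(float, 4, 8, b);                                   // N0 x K0 RHS tile (read transposed by T_MMUL NT,T)
T_LOAD(float, 2, 8, BUFFER, lhs, k, m, lhs_stride_y, a);
T_LOAD(float, 4, 8, BUFFER, rhs, k, n, rhs_stride_y, b);
T_MMUL(float, float, float, 2, 4, 8, NT, T, a, b, c);   // c += a x b^T (resolves to T_MMUL_NT_T_FLOAT)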
diff --git a/src/core/gpu/cl/kernels/ClDirectConvolutionKernel.cpp b/src/core/gpu/cl/kernels/ClDirectConvolutionKernel.cpp
index f071dbc468..72801fa6c8 100644
--- a/src/core/gpu/cl/kernels/ClDirectConvolutionKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClDirectConvolutionKernel.cpp
@@ -276,10 +276,12 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITenso
if(data_layout == DataLayout::NHWC)
{
const unsigned int vec_size = std::min(static_cast<unsigned int>(dst->tensor_shape()[0]), 4u);
+ const unsigned int num_rows = dst->tensor_shape()[0] > 16 ? 2u : 1U;
// Create window and update padding
- Window win = calculate_max_window(*dst, Steps(vec_size, 1U));
+ Window win = calculate_max_window(*dst, Steps(vec_size, num_rows));
dst->set_valid_region(ValidRegion(Coordinates(), dst->tensor_shape()));
+
Status err = Status{};
return std::make_pair(err, win);
}
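As a concrete illustration of the new window setup (numbers chosen for the example, not from the patch): for an NHWC F32 destination with 32 channels, width 51 and height 37, vec_size = min(32, 4) = 4 and num_rows = 2 (since 32 > 16), so n0 = 4, m0 = 2 and PARTIAL_N0 = 32 % 4 = 0; at run time (see run_op further down) the collapsed width x height dimension 51 * 37 = 1887 is rounded up to ceil_to_multiple(1887, 2) = 1888, giving 944 steps along Y and 8 along X.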
@@ -368,9 +370,9 @@ void ClDirectConvolutionKernel::configure(const CLCompileContext &compile_contex
const unsigned int n0 = win_config.second.x().step();
const unsigned int m0 = win_config.second.y().step();
- const unsigned int k0 = adjust_vec_size(16u, src->dimension(channel_idx));
+
+ const unsigned int k0 = adjust_vec_size(8u, src->dimension(channel_idx));
const unsigned int partial_store_n0 = dst->dimension(channel_idx) % n0;
- const unsigned int partial_store_m0 = (dst->dimension(width_idx) * dst->dimension(height_idx)) % m0;
const unsigned int pad_left = conv_info.pad_left();
const unsigned int pad_top = conv_info.pad_top();
@@ -379,14 +381,19 @@ void ClDirectConvolutionKernel::configure(const CLCompileContext &compile_contex
build_options.add_option(std::string("-DHAS_BIAS"));
build_options.add_option(std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(biases->data_type())));
}
+
+ build_options.add_option("-cl-fast-relaxed-math");
+ build_options.add_option("-DSRC_TENSOR_TYPE=BUFFER");
build_options.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(width_idx)));
build_options.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(height_idx)));
build_options.add_option("-DSRC_CHANNELS=" + support::cpp11::to_string(src->dimension(channel_idx)));
build_options.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
+ build_options.add_option("-DDST_TENSOR_TYPE=BUFFER");
build_options.add_option("-DDST_WIDTH=" + support::cpp11::to_string(dst->dimension(width_idx)));
build_options.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(height_idx)));
build_options.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst->dimension(channel_idx)));
build_options.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type()));
+ build_options.add_option("-DWEI_TENSOR_TYPE=BUFFER");
build_options.add_option("-DWEI_WIDTH=" + support::cpp11::to_string(weights->dimension(width_idx)));
build_options.add_option("-DWEI_HEIGHT=" + support::cpp11::to_string(weights->dimension(height_idx)));
build_options.add_option("-DWEI_DATA_TYPE=" + get_cl_type_from_data_type(weights->data_type()));
@@ -397,8 +404,7 @@ void ClDirectConvolutionKernel::configure(const CLCompileContext &compile_contex
build_options.add_option("-DN0=" + support::cpp11::to_string(n0));
build_options.add_option("-DM0=" + support::cpp11::to_string(m0));
build_options.add_option("-DK0=" + support::cpp11::to_string(k0));
- build_options.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
- build_options.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
+ build_options.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));
if(is_data_type_quantized(data_type))
{
@@ -426,6 +432,7 @@ void ClDirectConvolutionKernel::configure(const CLCompileContext &compile_contex
else
{
build_options.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(data_type));
+ build_options.add_option("-DZERO_VALUE=" + support::cpp11::to_string(0));
build_options.add_option("-DSRC_OFFSET=" + support::cpp11::to_string(0));
build_options.add_option("-DWEI_OFFSET=" + support::cpp11::to_string(0));
build_options.add_option("-DDST_OFFSET=" + support::cpp11::to_string(0));
@@ -529,18 +536,18 @@ void ClDirectConvolutionKernel::run_op(ITensorPack &tensors, const Window &windo
if(_data_layout == DataLayout::NHWC)
{
- slice.set(Window::DimY, Window::Dimension(0, dst->info()->dimension(1) * dst->info()->dimension(2), 1));
+ const size_t dim_y_collapsed = ceil_to_multiple(dst->info()->dimension(1) * dst->info()->dimension(2), slice.y().step());
+ slice.set(Window::DimY, Window::Dimension(0, dim_y_collapsed, slice.y().step()));
slice.set(Window::DimZ, Window::Dimension(0, dst->info()->dimension(3), 1));
unsigned int idx = 0;
- add_3D_tensor_argument(idx, src, slice);
- add_3D_tensor_argument(idx, dst, slice);
- add_3D_tensor_argument(idx, weights, slice);
+ add_4D_tensor_argument(idx, src, slice);
+ add_4D_tensor_argument(idx, dst, slice);
+ add_4D_tensor_argument(idx, weights, slice);
if(biases != nullptr)
{
add_1D_tensor_argument(idx, biases, slice);
}
- _kernel.setArg(idx++, static_cast<unsigned int>(weights->info()->strides_in_bytes()[3]));
enqueue(queue, *this, slice, lws_hint());
}
else