From 09849a0e7128731473f37cf6045147db68b1c495 Mon Sep 17 00:00:00 2001
From: Joel Liang <joel.liang@arm.com>
Date: Fri, 5 Jan 2018 15:12:53 +0800
Subject: APPBROWSER-372: Rewrite the direct_convolution5x5.cs with the new
 common code

Change-Id: Ie2f398d62dea97e9201f77d22c9f0796db297b63
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/115280
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Zhenglin Li <zhenglin.li@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
---
 .../cs_shaders/direct_convolution5x5.cs            | 982 ++-------------------
 src/core/GLES_COMPUTE/cs_shaders/helpers_cs.h      |  35 +-
 2 files changed, 116 insertions(+), 901 deletions(-)

(limited to 'src/core/GLES_COMPUTE')

diff --git a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs
index a36bd438ff..c919e4ed80 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017, 2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,488 +24,114 @@
 
 layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
 
-#include "helpers.h"
+#include "helpers_cs.h"
 
-#ifdef DATA_TYPE_FP32
-
-precision highp float;
+#if defined(DATA_TYPE_FP16)
+precision mediump float;
+#endif // DATA_TYPE_FP16
 
 /** This kernel performs a direct convolution to convolve the low three dimensions
  *
- * @note This OpenGL ES shader works with stride_x = 1 and 2
- * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
+ * @note This kernel has multiple optimized direct convolution options for FP16.
+ *       The direct convolution option must be passed at compile time using "#define PROCESS_nX_nY_nZ" e.g. "#define PROCESS_8X_1Y_1Z"
+ * @note The convolution stride x must be passed at compile time using "#define STRIDE_X n" e.g. "#define STRIDE_X 1"
+ *       This OpenGL ES shader works with stride_x = 1 and 2
  * @note If biases are used then "define HAS_BIAS" has to be passed at compile time
  *
- * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F32
- * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
- * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                            dst_stride_y * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
- * @param[out] weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
- * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
- * @param[in]  weights_step_y                        weights_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
- * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in]  biases_ptr                            Pointer to the biases tensor. Same as @p src_ptr
- * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
- * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
- * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
- * @param[in]  weights_depth                         The third dimensions of the weights tensors
+ * @param[in]  src_ptr          Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  src_attrs        The attributes of the source tensor
+ * @param[out] dst_ptr          Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_attrs        The attributes of the destination tensor
+ * @param[in]  weights_ptr      Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_attrs    The attributes of the weights tensor
+ * @param[in]  biases_ptr       Pointer to the biases tensor. Supported data types: same as @p src_ptr
+ * @param[in]  biases_attrs     The attributes of the biases tensor
+ * @param[in]  weights_stride_w Stride of the weights tensor in the 4th dimension
+ * @param[in]  weights_depth    The third dimension of the weights tensor
  */
-
-layout(std140) uniform shader_params
+SHADER_PARAMS_DECLARATION
 {
-    TENSOR3D_PARAM_DECLARATION(src);
-    TENSOR3D_PARAM_DECLARATION(dst);
-    TENSOR3D_PARAM_DECLARATION(weights);
+    Tensor3DAttributes src_attrs;
+    Tensor3DAttributes dst_attrs;
+    Tensor3DAttributes weights_attrs;
 #ifdef BIAS
-    VECTOR_PARAM_DECLARATION(biases);
+    VectorAttributes biases_attrs;
 #endif /* BIAS */
     uint weights_stride_w;
     uint weights_depth;
 };
 
-BUFFER_DECLARATION(src, 1, float, readonly);
-BUFFER_DECLARATION(dst, 2, float, writeonly);
-BUFFER_DECLARATION(weights, 3, float, readonly);
+#ifdef DATA_TYPE_FP32
+TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
+TENSOR_DECLARATION(3, weightsBuffer, float, weights_ptr, weights_shift, 2, readonly);
 #ifdef BIAS
-BUFFER_DECLARATION(biases, 4, float, readonly);
+TENSOR_DECLARATION(4, biasesBuffer, float, biases_ptr, biases_shift, 2, readonly);
 #endif /* BIAS */
 
-#define LOAD20(r, name, offset)           \
-    r[0] = LOAD4(name, offset);           \
-    r[1] = LOAD4(name, offset + uint(1)); \
-    r[2] = LOAD4(name, offset + uint(2)); \
-    r[3] = LOAD4(name, offset + uint(3)); \
-    r[4] = LOAD4(name, offset + uint(4))
-
-/** This kernel performs a direct convolution to convolve the low three dimensions.
- *
- * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
- * @note If biases are used then "define HAS_BIAS" has to be passed at compile time
- *
- * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F32
- * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
- * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                            dst_stride_y * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
- * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
- * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
- * @param[in]  weights_step_y                        weights_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
- * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in]  biases_ptr                            Pointer to the biases tensor. Same as @p src_ptr
- * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
- * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
- * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
- * @param[in]  weights_depth                         The third dimensions of the weights tensors
- */
 void main()
 {
-    Image    src     = CONVERT_TO_IMAGE_STRUCT(src);
-    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
-    Tensor3D dst     = CONVERT_TO_TENSOR3D_STRUCT(dst);
+    ImageIterator    src_iter     = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+    Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
+    Tensor3DIterator dst_iter     = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
 
 #ifdef BIAS
-    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+    VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
 #endif /* BIAS */
 
-    float pixels  = CONVERT(0, float);
+    float pixels  = 0.f;
     uint  z_index = gl_GlobalInvocationID.z;
-    weights.current_offset += z_index * weights_stride_w >> 2;
+    TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);
+
     float temp[5];
     float temp_weight[5];
-
     for(int d = 0; d < int(weights_depth); ++d)
     {
-        LOAD20(temp, src, offset(src, 0, 0));
-        LOAD20(temp_weight, weights, tensor3D_offset(weights, 0, 0, 0));
+        temp        = VLOAD5(float[5], src_ptr, IMAGE_OFFSET(src_iter, 0, 0));
+        temp_weight = VLOAD5(float[5], weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 0, 0));
         pixels += temp[0] * temp_weight[0] + temp[1] * temp_weight[1] + temp[2] * temp_weight[2] + temp[3] * temp_weight[3] + temp[4] * temp_weight[4];
 
-        LOAD20(temp, src, offset(src, 0, 1));
-        LOAD20(temp_weight, weights, tensor3D_offset(weights, 0, 1, 0));
+        temp        = VLOAD5(float[5], src_ptr, IMAGE_OFFSET(src_iter, 0, 1));
+        temp_weight = VLOAD5(float[5], weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 1, 0));
         pixels += temp[0] * temp_weight[0] + temp[1] * temp_weight[1] + temp[2] * temp_weight[2] + temp[3] * temp_weight[3] + temp[4] * temp_weight[4];
 
-        LOAD20(temp, src, offset(src, 0, 2));
-        LOAD20(temp_weight, weights, tensor3D_offset(weights, 0, 2, 0));
+        temp        = VLOAD5(float[5], src_ptr, IMAGE_OFFSET(src_iter, 0, 2));
+        temp_weight = VLOAD5(float[5], weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 2, 0));
         pixels += temp[0] * temp_weight[0] + temp[1] * temp_weight[1] + temp[2] * temp_weight[2] + temp[3] * temp_weight[3] + temp[4] * temp_weight[4];
 
-        LOAD20(temp, src, offset(src, 0, 3));
-        LOAD20(temp_weight, weights, tensor3D_offset(weights, 0, 3, 0));
+        temp        = VLOAD5(float[5], src_ptr, IMAGE_OFFSET(src_iter, 0, 3));
+        temp_weight = VLOAD5(float[5], weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 3, 0));
         pixels += temp[0] * temp_weight[0] + temp[1] * temp_weight[1] + temp[2] * temp_weight[2] + temp[3] * temp_weight[3] + temp[4] * temp_weight[4];
 
-        LOAD20(temp, src, offset(src, 0, 4));
-        LOAD20(temp_weight, weights, tensor3D_offset(weights, 0, 4, 0));
+        temp        = VLOAD5(float[5], src_ptr, IMAGE_OFFSET(src_iter, 0, 4));
+        temp_weight = VLOAD5(float[5], weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 4, 0));
         pixels += temp[0] * temp_weight[0] + temp[1] * temp_weight[1] + temp[2] * temp_weight[2] + temp[3] * temp_weight[3] + temp[4] * temp_weight[4];
 
-        src.current_offset += (src_stride_z >> 2);
-        weights.current_offset += (weights_stride_z >> 2);
+        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
+        TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
     }
 
 #ifdef BIAS
-    pixels += LOAD4(biases, vector_offset(biases, int(z_index)));
+    pixels += LOAD(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
 #endif /* BIAS */
 
-    STORE4(dst, CURRENT_OFFSET(dst), pixels);
+    STORE_CURRENT_ITEM(dst_ptr, dst_iter, pixels);
 }
-
 #elif defined(DATA_TYPE_FP16)
 
-precision mediump float;
-
-#if defined(PROCESS_4X_1Y_1Z)
-
-/** This kernel performs a direct convolution to convolve the low three dimensions
- *
- * @note This OpenGL ES shader works with stride_x = 1 and 2
- * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
- * @note If biases are used then "define HAS_BIAS" has to be passed at compile time
- *
- * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F16
- * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
- * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                            dst_stride_y * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
- * @param[out] weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
- * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
- * @param[in]  weights_step_y                        weights_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
- * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in]  biases_ptr                            Pointer to the biases tensor. Same as @p src_ptr
- * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
- * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
- * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
- * @param[in]  weights_depth                         The third dimensions of the weights tensors
- */
-
-layout(std140) uniform shader_params
-{
-    TENSOR3D_PARAM_DECLARATION(src);
-    TENSOR3D_PARAM_DECLARATION(dst);
-    TENSOR3D_PARAM_DECLARATION(weights);
-#ifdef BIAS
-    VECTOR_PARAM_DECLARATION(biases);
-#endif /* BIAS */
-    uint weights_stride_w;
-    uint weights_depth;
-};
-
-BUFFER_DECLARATION(src, 1, uvec2, readonly);
-BUFFER_DECLARATION(dst, 2, uvec2, writeonly);
-BUFFER_DECLARATION(weights, 3, uint, readonly);
-#ifdef BIAS
-BUFFER_DECLARATION(biases, 4, uint, readonly);
-#endif /* BIAS */
-
-#if STRIDE_X == 1
-#define LOAD_SRC(src, row) load_src_stride1(src, row)
-#define CONVOLVE1x5(src, weight) convolve1x5_stride1(src, weight)
-#elif STRIDE_X == 2 /* STRIDE_X == 1 */
-#define LOAD_SRC(src, row) load_src_stride2(src, row)
-#define CONVOLVE1x5(src, weight) convolve1x5_stride2(src, weight)
-#else /* STRDIDE_X == 1 */
-#error STRIDE_X larger than 2 is not supported
-#endif /* STRIDE_X == 1 */
-
-vec4[2] load_src_stride1(Image src, int row)
-{
-    uvec2 packed[2];
-    vec4  ret[2];
-
-    GC_LOAD2_2D_OFFSET(packed, src, 0, row);
-
-    ret[0] = vec4(unpackHalf2x16(packed[0].x), unpackHalf2x16(packed[0].y));
-    ret[1] = vec4(unpackHalf2x16(packed[1].x), unpackHalf2x16(packed[1].y));
-
-    return ret;
-}
-
-vec4[3] load_src_stride2(Image src, int row)
-{
-    uvec2 packed[3];
-    vec4  ret[3];
-
-    GC_LOAD3_2D_OFFSET(packed, src, 0, row);
-
-    ret[0] = vec4(unpackHalf2x16(packed[0].x), unpackHalf2x16(packed[0].y));
-    ret[1] = vec4(unpackHalf2x16(packed[1].x), unpackHalf2x16(packed[1].y));
-    ret[2] = vec4(unpackHalf2x16(packed[2].x), unpackHalf2x16(packed[2].y));
-
-    return ret;
-}
-
-vec2[3] load_weight(Tensor3D weights, int row)
-{
-    uvec3 packed_w;
-    vec2  ret[3];
-
-    GC_LOAD3_3D_OFFSET(packed_w, weights, 0, row, 0);
-
-    ret[0] = vec2(unpackHalf2x16(packed_w[0]));
-    ret[1] = vec2(unpackHalf2x16(packed_w[1]));
-    ret[2] = vec2(unpackHalf2x16(packed_w[2]));
-
-    return ret;
-}
-
-vec4 convolve1x5_stride1(vec4 tmp[2], vec2 w[3])
-{
-    vec4 src0 = tmp[0];
-    vec4 src1 = vec4(tmp[0].yzw, tmp[1].x);
-    vec4 src2 = vec4(tmp[0].zw, tmp[1].xy);
-    vec4 src3 = vec4(tmp[0].w, tmp[1].xyz);
-    vec4 src4 = tmp[1];
-    vec4 ret  = src0 * w[0].x + src1 * w[0].y + src2 * w[1].x + src3 * w[1].y + src4 * w[2].x;
-
-    return ret;
-}
-
-vec4 convolve1x5_stride2(vec4 tmp[3], vec2 w[3])
-{
-    vec4 src0 = vec4(tmp[0].xz, tmp[1].xz);
-    vec4 src1 = vec4(tmp[0].yw, tmp[1].yw);
-    vec4 src2 = vec4(tmp[0].z, tmp[1].xz, tmp[2].x);
-    vec4 src3 = vec4(tmp[0].w, tmp[1].yw, tmp[2].y);
-    vec4 src4 = vec4(tmp[1].x, tmp[1].z, tmp[2].xz);
-    vec4 ret  = src0 * w[0].x + src1 * w[0].y + src2 * w[1].x + src3 * w[1].y + src4 * w[2].x;
-
-    return ret;
-}
-
-/** This kernel performs a direct convolution to convolve the low three dimensions.
- *
- * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
- * @note If biases are used then "define HAS_BIAS" has to be passed at compile time
- *
- * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F16
- * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
- * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                            dst_stride_y * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
- * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
- * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
- * @param[in]  weights_step_y                        weights_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
- * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in]  biases_ptr                            Pointer to the biases tensor. Same as @p src_ptr
- * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
- * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
- * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
- * @param[in]  weights_depth                         The third dimensions of the weights tensors
- */
-void main()
-{
-    Image    src     = GC_CONVERT_TO_IMAGE_STRUCT(src);
-    Tensor3D weights = GC_CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
-    Tensor3D dst     = GC_CONVERT_TO_TENSOR3D_STRUCT(dst);
-
-#ifdef BIAS
-    Vector   biases  = GC_CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
-#endif /* BIAS */
-
-    vec4  res = vec4(0);
-    vec2  w[3];
-    vec4  s[STRIDE_X + 1];
-    uvec2 packed_d;
-    uint  z_index = gl_GlobalInvocationID.z;
-
-    weights.current_offset += z_index * weights_stride_w;
-
-    for(int d = 0; d < int(weights_depth); ++d)
-    {
-        for(int row = 0; row < 5; row++)
-        {
-            w = load_weight(weights, row);
-            s = LOAD_SRC(src, row);
-            res += CONVOLVE1x5(s, w);
-        }
-
-        src.current_offset += src_stride_z;
-        weights.current_offset += weights_stride_z;
-    }
-
-#ifdef BIAS
-    uint  packed_b;
-    float b;
-
-    GC_LOAD1_1D_OFFSET(packed_b, biases, z_index);
-    b = (z_index % uint(2) == uint(0)) ? unpackHalf2x16(packed_b).x : unpackHalf2x16(packed_b).y;
-    res += vec4(b);
-#endif /* BIAS */
-
-    packed_d = uvec2(packHalf2x16(res.xy), packHalf2x16(res.zw));
-    GC_STORE1_3D_OFFSET(packed_d, dst, 0, 0, 0);
-}
-
-#elif defined(PROCESS_4X_3Y_1Z)
-
-/** An optimized direct convolution 3x3 OpenGL ES compute shader for process 3 elements @ Y at once
- *
- * @note This OpenGL ES shader works with stride_x = 1 and 2
- * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
- * @note If biases are used then "define HAS_BIAS" has to be passed at compile time
- *
- * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F16
- * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
- * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                            dst_stride_y * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
- * @param[out] weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
- * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
- * @param[in]  weights_step_y                        weights_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
- * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in]  biases_ptr                            Pointer to the biases tensor. Same as @p src_ptr
- * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
- * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
- * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
- * @param[in]  weights_depth                         The third dimensions of the weights tensors
- */
-
-layout(std140) uniform shader_params
-{
-    TENSOR3D_PARAM_DECLARATION(src);
-    TENSOR3D_PARAM_DECLARATION(dst);
-    TENSOR3D_PARAM_DECLARATION(weights);
-#ifdef BIAS
-    VECTOR_PARAM_DECLARATION(biases);
-#endif /* BIAS */
-    uint weights_stride_w;
-    uint weights_depth;
-};
-
-BUFFER_DECLARATION(src, 1, uvec2, readonly);
-BUFFER_DECLARATION(dst, 2, uvec2, writeonly);
-BUFFER_DECLARATION(weights, 3, uint, readonly);
-#ifdef BIAS
-BUFFER_DECLARATION(bias, 4, uint, readonly);
-#endif /* BIAS */
-
+// Common definitions for DATA_TYPE_FP16
 #if STRIDE_X == 1
-#define LOAD_SRC(src, row) load_src_stride1(src, row)
+#define LOAD_SRC_AT_ROW(row) VLOAD2_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, row))
 #define CONVOLVE1x5(src, weight) convolve1x5_stride1(src, weight)
 #elif STRIDE_X == 2 /* STRIDE_X == 1 */
-#define LOAD_SRC(src, row) load_src_stride2(src, row)
+#define LOAD_SRC_AT_ROW(row) VLOAD3_UNPACK12_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, row))
 #define CONVOLVE1x5(src, weight) convolve1x5_stride2(src, weight)
 #else /* STRDIDE_X == 1 */
 #error STRIDE_X larger than 2 is not supported
 #endif /* STRIDE_X == 1 */
 
-vec4[2] load_src_stride1(Image src, int row)
-{
-    uvec2 packed[2];
-    vec4  ret[2];
-
-    GC_LOAD2_2D_OFFSET(packed, src, 0, row);
-
-    ret[0] = vec4(unpackHalf2x16(packed[0].x), unpackHalf2x16(packed[0].y));
-    ret[1] = vec4(unpackHalf2x16(packed[1].x), unpackHalf2x16(packed[1].y));
-
-    return ret;
-}
-
-vec4[3] load_src_stride2(Image src, int row)
-{
-    uvec2 packed[3];
-    vec4  ret[3];
-
-    GC_LOAD3_2D_OFFSET(packed, src, 0, row);
-
-    ret[0] = vec4(unpackHalf2x16(packed[0].x), unpackHalf2x16(packed[0].y));
-    ret[1] = vec4(unpackHalf2x16(packed[1].x), unpackHalf2x16(packed[1].y));
-    ret[2] = vec4(unpackHalf2x16(packed[2].x), unpackHalf2x16(packed[2].y));
-
-    return ret;
-}
-
-vec2[3] load_weight(Tensor3D weights, int row)
-{
-    uvec3 packed_w;
-    vec2  ret[3];
-
-    GC_LOAD3_3D_OFFSET(packed_w, weights, 0, row, 0);
-
-    ret[0] = vec2(unpackHalf2x16(packed_w[0]));
-    ret[1] = vec2(unpackHalf2x16(packed_w[1]));
-    ret[2] = vec2(unpackHalf2x16(packed_w[2]));
-
-    return ret;
-}
+#define LOAD_WEIGHT_AT_ROW(row) VLOAD3_UNPACK6_HALF(weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, row, 0))
 
 vec4 convolve1x5_stride1(vec4 tmp[2], vec2 w[3])
 {
@@ -531,501 +157,57 @@ vec4 convolve1x5_stride2(vec4 tmp[3], vec2 w[3])
     return ret;
 }
 
-void main()
-{
-    Image    src     = GC_CONVERT_TO_IMAGE_STRUCT(src);
-    Tensor3D weights = GC_CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
-    Tensor3D dst     = GC_CONVERT_TO_TENSOR3D_STRUCT(dst);
-
-#ifdef BIAS
-    Vector   bias    = GC_CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
-#endif /* BIAS */
-
-    vec4  res[3];
-    vec2  w[5][3];
-    vec4  s[STRIDE_X + 1];
-    uvec2 packed_d;
-    uint  z_index = gl_GlobalInvocationID.z;
-    int   i;
-
-    for(i = 0; i < 3; i++)
-    {
-        res[i] = vec4(0);
-    }
-
-    weights.current_offset += z_index * weights_stride_w;
-
-    for(int d = 0; d < int(weights_depth); ++d)
-    {
-        // load weights once
-        for(int row = 0; row < 5; row++)
-        {
-            w[row] = load_weight(weights, row);
-        }
-
-        // 1st line
-        s = LOAD_SRC(src, 0);
-        res[0] += CONVOLVE1x5(s, w[0]);
-
-        // 2nd line
-        s = LOAD_SRC(src, 1);
-        res[0] += CONVOLVE1x5(s, w[1]);
-        res[1] += CONVOLVE1x5(s, w[0]);
-
-        // 3rd line
-        s = LOAD_SRC(src, 2);
-        res[0] += CONVOLVE1x5(s, w[2]);
-        res[1] += CONVOLVE1x5(s, w[1]);
-        res[2] += CONVOLVE1x5(s, w[0]);
-
-        // 4th line
-        s = LOAD_SRC(src, 3);
-        res[0] += CONVOLVE1x5(s, w[3]);
-        res[1] += CONVOLVE1x5(s, w[2]);
-        res[2] += CONVOLVE1x5(s, w[1]);
-
-        // 5th line
-        s = LOAD_SRC(src, 4);
-        res[0] += CONVOLVE1x5(s, w[4]);
-        res[1] += CONVOLVE1x5(s, w[3]);
-        res[2] += CONVOLVE1x5(s, w[2]);
-
-        // 6th line
-        s = LOAD_SRC(src, 5);
-        res[1] += CONVOLVE1x5(s, w[4]);
-        res[2] += CONVOLVE1x5(s, w[3]);
-
-        // 7th line
-        s = LOAD_SRC(src, 6);
-        res[2] += CONVOLVE1x5(s, w[4]);
-
-        src.current_offset += src_stride_z;
-        weights.current_offset += weights_stride_z;
-    }
-
-#ifdef BIAS
-    uint  packed_b;
-    float b;
-
-    GC_LOAD1_1D_OFFSET(packed_b, bias, z_index);
-    b = (z_index % uint(2) == uint(0)) ? unpackHalf2x16(packed_b).x : unpackHalf2x16(packed_b).y;
-    for(i = 0; i < 3; i++)
-    {
-        res[i] += vec4(b);
-    }
-#endif /* BIAS */
-
-    for(i = 0; i < 3; i++)
-    {
-        packed_d = uvec2(packHalf2x16(res[i].xy), packHalf2x16(res[i].zw));
-        GC_STORE1_3D_OFFSET(packed_d, dst, 0, i, 0);
-    }
-}
-
-#elif defined(PROCESS_4X_3Y_2Z)
-
-/** An optimized direct convolution 3x3 OpenGL ES compute shader for process 3 elements @ Y and 2 elements @ Z at once
- *
- * @note This OpenGL ES shader works with stride_x = 1 and 2
- * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
- * @note If biases are used then "define HAS_BIAS" has to be passed at compile time
- *
- * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F16
- * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
- * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                            dst_stride_y * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
- * @param[out] weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
- * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
- * @param[in]  weights_step_y                        weights_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
- * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in]  biases_ptr                            Pointer to the biases tensor. Same as @p src_ptr
- * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
- * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
- * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
- * @param[in]  weights_depth                         The third dimensions of the weights tensors
- */
-
-layout(std140) uniform shader_params
-{
-    TENSOR3D_PARAM_DECLARATION(src);
-    TENSOR3D_PARAM_DECLARATION(dst);
-    TENSOR3D_PARAM_DECLARATION(weights);
-#ifdef BIAS
-    VECTOR_PARAM_DECLARATION(biases);
-#endif /* BIAS */
-    uint weights_stride_w;
-    uint weights_depth;
-};
-
-BUFFER_DECLARATION(src, 1, uvec2, readonly);
-BUFFER_DECLARATION(dst, 2, uvec2, writeonly);
-BUFFER_DECLARATION(weights, 3, uint, readonly);
-#ifdef BIAS
-BUFFER_DECLARATION(bias, 4, uint, readonly);
-#endif /* BIAS */
-
-#if STRIDE_X == 1
-#define LOAD_SRC(src, row) load_src_stride1(src, row)
-#define CONVOLVE1x5(src, weight) convolve1x5_stride1(src, weight)
-#elif STRIDE_X == 2 /* STRIDE_X == 1 */
-#define LOAD_SRC(src, row) load_src_stride2(src, row)
-#define CONVOLVE1x5(src, weight) convolve1x5_stride2(src, weight)
-#else /* STRDIDE_X == 1 */
-#error STRIDE_X larger than 2 is not supported
-#endif /* STRIDE_X == 1 */
-
-vec4[2] load_src_stride1(Image src, int row)
-{
-    uvec2 packed[2];
-    vec4  ret[2];
-
-    GC_LOAD2_2D_OFFSET(packed, src, 0, row);
-
-    ret[0] = vec4(unpackHalf2x16(packed[0].x), unpackHalf2x16(packed[0].y));
-    ret[1] = vec4(unpackHalf2x16(packed[1].x), unpackHalf2x16(packed[1].y));
-
-    return ret;
-}
-
-vec4[3] load_src_stride2(Image src, int row)
-{
-    uvec2 packed[3];
-    vec4  ret[3];
-
-    GC_LOAD3_2D_OFFSET(packed, src, 0, row);
-
-    ret[0] = vec4(unpackHalf2x16(packed[0].x), unpackHalf2x16(packed[0].y));
-    ret[1] = vec4(unpackHalf2x16(packed[1].x), unpackHalf2x16(packed[1].y));
-    ret[2] = vec4(unpackHalf2x16(packed[2].x), unpackHalf2x16(packed[2].y));
-
-    return ret;
-}
-
-vec2[3] load_weight(Tensor3D weights, int row)
-{
-    uvec3 packed_w;
-    vec2  ret[3];
-
-    GC_LOAD3_3D_OFFSET(packed_w, weights, 0, row, 0);
-
-    ret[0] = vec2(unpackHalf2x16(packed_w[0]));
-    ret[1] = vec2(unpackHalf2x16(packed_w[1]));
-    ret[2] = vec2(unpackHalf2x16(packed_w[2]));
-
-    return ret;
-}
-
-vec4 convolve1x5_stride1(vec4 tmp[2], vec2 w[3])
-{
-    vec4 src0 = tmp[0];
-    vec4 src1 = vec4(tmp[0].yzw, tmp[1].x);
-    vec4 src2 = vec4(tmp[0].zw, tmp[1].xy);
-    vec4 src3 = vec4(tmp[0].w, tmp[1].xyz);
-    vec4 src4 = tmp[1];
-    vec4 ret  = src0 * w[0].x + src1 * w[0].y + src2 * w[1].x + src3 * w[1].y + src4 * w[2].x;
-
-    return ret;
-}
-
-vec4 convolve1x5_stride2(vec4 tmp[3], vec2 w[3])
-{
-    vec4 src0 = vec4(tmp[0].xz, tmp[1].xz);
-    vec4 src1 = vec4(tmp[0].yw, tmp[1].yw);
-    vec4 src2 = vec4(tmp[0].z, tmp[1].xz, tmp[2].x);
-    vec4 src3 = vec4(tmp[0].w, tmp[1].yw, tmp[2].y);
-    vec4 src4 = vec4(tmp[1].x, tmp[1].z, tmp[2].xz);
-    vec4 ret  = src0 * w[0].x + src1 * w[0].y + src2 * w[1].x + src3 * w[1].y + src4 * w[2].x;
-
-    return ret;
-}
-
-void main()
-{
-    Image    src     = GC_CONVERT_TO_IMAGE_STRUCT(src);
-    Tensor3D weights = GC_CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
-    Tensor3D dst     = GC_CONVERT_TO_TENSOR3D_STRUCT(dst);
-
-#ifdef BIAS
-    Vector   bias    = GC_CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
-#endif /* BIAS */
-
-    vec4  res[3];
-    vec2  w[5][3];
-    vec4  s[STRIDE_X + 1];
-    uvec2 packed_d;
-    uint  z_index  = (gl_GlobalInvocationID.z);
-    uint  s_offset = src.current_offset;
-    int   i, z;
-
-    weights.current_offset += z_index * weights_stride_w;
-
-    for(z = 0; z < 2; z++)
-    {
-        z_index += uint(z);
-        src.current_offset = s_offset;
-
-        for(i = 0; i < 3; i++)
-        {
-            res[i] = vec4(0);
-        }
-
-        for(int d = 0; d < int(weights_depth); ++d)
-        {
-            // load weights once
-            for(int row = 0; row < 5; row++)
-            {
-                w[row] = load_weight(weights, row);
-            }
-
-            // 1st line
-            s = LOAD_SRC(src, 0);
-            res[0] += CONVOLVE1x5(s, w[0]);
-
-            // 2nd line
-            s = LOAD_SRC(src, 1);
-            res[0] += CONVOLVE1x5(s, w[1]);
-            res[1] += CONVOLVE1x5(s, w[0]);
-
-            // 3rd line
-            s = LOAD_SRC(src, 2);
-            res[0] += CONVOLVE1x5(s, w[2]);
-            res[1] += CONVOLVE1x5(s, w[1]);
-            res[2] += CONVOLVE1x5(s, w[0]);
-
-            // 4th line
-            s = LOAD_SRC(src, 3);
-            res[0] += CONVOLVE1x5(s, w[3]);
-            res[1] += CONVOLVE1x5(s, w[2]);
-            res[2] += CONVOLVE1x5(s, w[1]);
-
-            // 5th line
-            s = LOAD_SRC(src, 4);
-            res[0] += CONVOLVE1x5(s, w[4]);
-            res[1] += CONVOLVE1x5(s, w[3]);
-            res[2] += CONVOLVE1x5(s, w[2]);
-
-            // 6th line
-            s = LOAD_SRC(src, 5);
-            res[1] += CONVOLVE1x5(s, w[4]);
-            res[2] += CONVOLVE1x5(s, w[3]);
-
-            // 7th line
-            s = LOAD_SRC(src, 6);
-            res[2] += CONVOLVE1x5(s, w[4]);
-
-            src.current_offset += src_stride_z;
-            weights.current_offset += weights_stride_z;
-        }
-
-#ifdef BIAS
-        uint  packed_b;
-        float b;
-
-        GC_LOAD1_1D_OFFSET(packed_b, bias, z_index);
-        b = (z_index % uint(2) == uint(0)) ? unpackHalf2x16(packed_b).x : unpackHalf2x16(packed_b).y;
-        for(i = 0; i < 3; i++)
-        {
-            res[i] += vec4(b);
-        }
-#endif /* BIAS */
-
-        for(i = 0; i < 3; i++)
-        {
-            packed_d = uvec2(packHalf2x16(res[i].xy), packHalf2x16(res[i].zw));
-            GC_STORE1_3D_OFFSET(packed_d, dst, 0, i, 0);
-        }
-
-        dst.current_offset += dst_stride_z;
-    }
-}
-
-#elif defined(PROCESS_8X_1Y_1Z)
-
-/** An optimized direct convolution 3x3 OpenGL ES compute shader for process 8 elements @ X at once
- *
- * @note This OpenGL ES shader works with stride_x = 1
- * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
- * @note If biases are used then "define HAS_BIAS" has to be passed at compile time
- *
- * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F16
- * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
- * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                            dst_stride_y * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
- * @param[out] weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
- * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
- * @param[in]  weights_step_y                        weights_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
- * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in]  biases_ptr                            Pointer to the biases tensor. Same as @p src_ptr
- * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes)
- * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor
- * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension
- * @param[in]  weights_depth                         The third dimensions of the weights tensors
- */
-
-layout(std140) uniform shader_params
-{
-    TENSOR3D_PARAM_DECLARATION(src);
-    TENSOR3D_PARAM_DECLARATION(dst);
-    TENSOR3D_PARAM_DECLARATION(weights);
-#ifdef BIAS
-    VECTOR_PARAM_DECLARATION(biases);
-#endif /* BIAS */
-    uint weights_stride_w;
-    uint weights_depth;
-};
-
-BUFFER_DECLARATION(src, 1, uvec4, readonly);
-BUFFER_DECLARATION(dst, 2, uvec4, writeonly);
-BUFFER_DECLARATION(weights, 3, uint, readonly);
+#if defined(PROCESS_4X_1Y_1Z)
+TENSOR_DECLARATION(1, srcBuffer, uvec2, src_ptr, src_shift, 3, readonly);
+TENSOR_DECLARATION(2, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly);
+TENSOR_DECLARATION(3, weightsBuffer, uint, weights_ptr, weights_shift, 2, readonly);
 #ifdef BIAS
-BUFFER_DECLARATION(bias, 4, uint, readonly);
+TENSOR_DECLARATION(4, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly);
 #endif /* BIAS */
 
-#if STRIDE_X == 1
-#define LOAD_SRC(src, row) load_src_stride1(src, row)
-#define CONVOLVE1x5(src, weight) convolve1x5_stride1(src, weight)
-#elif STRIDE_X == 2 /* STRIDE_X == 1 */
-#error stride == 2 for PROCESS_8X_1Y not implemented
-#else /* STRDIDE_X == 1 */
-#error STRIDE_X larger than 2 is not supported
-#endif /* STRIDE_X == 1 */
-
-vec4[3] load_src_stride1(Image src, int row)
-{
-    uvec4 packed[2];
-    vec4  ret[3];
-
-    GC_LOAD2_2D_OFFSET(packed, src, 0, row);
-
-    ret[0] = vec4(unpackHalf2x16(packed[0].x), unpackHalf2x16(packed[0].y));
-    ret[1] = vec4(unpackHalf2x16(packed[0].z), unpackHalf2x16(packed[0].w));
-    ret[2] = vec4(unpackHalf2x16(packed[1].x), unpackHalf2x16(packed[1].y));
-
-    return ret;
-}
-
-vec2[3] load_weight(Tensor3D weights, int row)
-{
-    uvec3 packed_w;
-    vec2  ret[3];
-
-    GC_LOAD3_3D_OFFSET(packed_w, weights, 0, row, 0);
-
-    ret[0] = vec2(unpackHalf2x16(packed_w[0]));
-    ret[1] = vec2(unpackHalf2x16(packed_w[1]));
-    ret[2] = vec2(unpackHalf2x16(packed_w[2]));
-
-    return ret;
-}
-
-vec4[2] convolve1x5_stride1(vec4 tmp[3], vec2 w[3])
-{
-    vec4 src0 = tmp[0];
-    vec4 src1 = vec4(tmp[0].yzw, tmp[1].x);
-    vec4 src2 = vec4(tmp[0].zw, tmp[1].xy);
-    vec4 src3 = vec4(tmp[0].w, tmp[1].xyz);
-    vec4 src4 = tmp[1];
-    vec4 ret[2];
-
-    ret[0] = src0 * w[0].x + src1 * w[0].y + src2 * w[1].x + src3 * w[1].y + src4 * w[2].x;
-
-    src0   = tmp[1];
-    src1   = vec4(tmp[1].yzw, tmp[2].x);
-    src2   = vec4(tmp[1].zw, tmp[2].xy);
-    src3   = vec4(tmp[1].w, tmp[2].xyz);
-    src4   = tmp[2];
-    ret[1] = src0 * w[0].x + src1 * w[0].y + src2 * w[1].x + src3 * w[1].y + src4 * w[2].x;
-
-    return ret;
-}
-
 void main()
 {
-    Image    src     = GC_CONVERT_TO_IMAGE_STRUCT(src);
-    Tensor3D weights = GC_CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
-    Tensor3D dst     = GC_CONVERT_TO_TENSOR3D_STRUCT(dst);
+    ImageIterator    src_iter     = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+    Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
+    Tensor3DIterator dst_iter     = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
 
 #ifdef BIAS
-    Vector   bias    = GC_CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
+    VectorIterator   biases_iter  = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
 #endif /* BIAS */
 
-    vec4  res[2];
-    vec2  w[3];
-    vec4  s[STRIDE_X + 2];
-    uvec4 packed_d;
-    uint  z_index = gl_GlobalInvocationID.z;
+    vec4 res = vec4(0);
+    vec2 w[3];
+    vec4 s[STRIDE_X + 1];
 
-    res[0] = vec4(0);
-    res[1] = vec4(0);
-    weights.current_offset += z_index * weights_stride_w;
+    uint z_index = gl_GlobalInvocationID.z;
+    TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);
 
     for(int d = 0; d < int(weights_depth); ++d)
     {
         for(int row = 0; row < 5; row++)
         {
-            w = load_weight(weights, row);
-            s = LOAD_SRC(src, row);
-            res[0] += CONVOLVE1x5(s, w)[0];
-            res[1] += CONVOLVE1x5(s, w)[1];
+            w = LOAD_WEIGHT_AT_ROW(row);
+            s = LOAD_SRC_AT_ROW(row);
+            res += CONVOLVE1x5(s, w);
         }
 
-        src.current_offset += src_stride_z;
-        weights.current_offset += weights_stride_z;
+        TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
+        TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
     }
 
 #ifdef BIAS
-    uint  packed_b;
+    vec2  vec2_b;
     float b;
 
-    GC_LOAD1_1D_OFFSET(packed_b, bias, z_index);
-    b = (z_index % uint(2) == uint(0)) ? unpackHalf2x16(packed_b).x : unpackHalf2x16(packed_b).y;
-    res[0] += vec4(b);
-    res[1] += vec4(b);
+    vec2_b = LOAD_UNPACK2_HALF(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
+    b      = (z_index % uint(2) == uint(0)) ? vec2_b.x : vec2_b.y;
+    res += vec4(b);
 #endif /* BIAS */
 
-    packed_d.xy = uvec2(packHalf2x16(res[0].xy), packHalf2x16(res[0].zw));
-    packed_d.zw = uvec2(packHalf2x16(res[1].xy), packHalf2x16(res[1].zw));
-    GC_STORE1_3D_OFFSET(packed_d, dst, 0, 0, 0);
+    STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, res);
 }
 
-#else /* defined(PROCESS_4X_1Y_1Z) */
-
-#endif /* defined(PROCESS_4X_1Y_1Z) */
-
-#else /* DATA_TYPE_FP16 */
+#endif /* PROCESS_nX_nY_nZ */
+#else  /* DATA_TYPE_FP32 */
 #error Data type not supported
-#endif /* DATA_TYPE_FP16 */
+#endif /* DATA_TYPE_FP32 */
diff --git a/src/core/GLES_COMPUTE/cs_shaders/helpers_cs.h b/src/core/GLES_COMPUTE/cs_shaders/helpers_cs.h
index fffc87d90d..dd9e1a3864 100755
--- a/src/core/GLES_COMPUTE/cs_shaders/helpers_cs.h
+++ b/src/core/GLES_COMPUTE/cs_shaders/helpers_cs.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017, 2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -326,6 +326,23 @@ uint tensor3D_offset_in_bytes(Tensor3DIterator tensor_iter, int x, int y, int z)
 #define VLOAD4_CURRENT_ITEM(return_type, tensor_ptr, tensor_iter) VLOAD4(return_type, tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
 #define VSTORE4_CURRENT_ITEM(tensor_ptr, tensor_iter, data) VSTORE4(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter), data)
 
+#define VLOAD5(return_type, tensor_ptr, offset)       \
+    return_type(LOAD(tensor_ptr, offset),             \
+                LOAD(tensor_ptr, (offset) + uint(1)), \
+                LOAD(tensor_ptr, (offset) + uint(2)), \
+                LOAD(tensor_ptr, (offset) + uint(3)), \
+                LOAD(tensor_ptr, (offset) + uint(4)))
+
+#define VSTORE5(tensor_ptr, offset, data)           \
+    STORE(tensor_ptr, offset, data[0]);             \
+    STORE(tensor_ptr, (offset) + uint(1), data[1]); \
+    STORE(tensor_ptr, (offset) + uint(2), data[2]); \
+    STORE(tensor_ptr, (offset) + uint(3), data[3]); \
+    STORE(tensor_ptr, (offset) + uint(4), data[4])
+
+#define VLOAD5_CURRENT_ITEM(return_type, tensor_ptr, tensor_iter) VLOAD5(return_type, tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
+#define VSTORE5_CURRENT_ITEM(tensor_ptr, tensor_iter, data) VSTORE5(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter), data)
+
 /** Converting the vec4 object to 4 half-precision (16-bits) floating point values and packing into a uvec2 object
  *
  * @param[in] data The vec4 object to be packed
@@ -348,6 +365,19 @@ mediump vec4 unpack4_half(highp uvec2 packed_data)
     return vec4(unpackHalf2x16(packed_data.x), unpackHalf2x16(packed_data.y));
 }
 
+/** Unpacking the uvec3 object to 6 half-precision (16-bits) floating point values and converting to a vec2[3] object
+ *
+ * @param[in] packed_data The uvec3 object to be unpacked
+ *
+ * @return The unpacked vec2[3] object
+ */
+mediump vec2[3] unpack6_half(highp uvec3 packed_data)
+{
+    return vec2[3](unpackHalf2x16(packed_data[0]),
+                   unpackHalf2x16(packed_data[1]),
+                   unpackHalf2x16(packed_data[2]));
+}
+
 /** Converting the vec4[2] object to 8 half-precision (16-bits) floating point values and packing into a uvec4 object
  *
  * @param[in] data The vec4[2] object to be packed
@@ -396,6 +426,9 @@ mediump vec4[3] unpack12_half(highp uvec2[3] packed_data)
 #define VLOAD2_UNPACK4_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter) VLOAD2_UNPACK4_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
 #define VSTORE2_PACK4_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter, data) VSTORE2_PACK4_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter), data)
 
+#define VLOAD3_UNPACK6_HALF(tensor_ptr, offset) unpack6_half(VLOAD3(uvec3, tensor_ptr, offset))
+#define VLOAD3_UNPACK6_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter) VLOAD3_UNPACK6_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
+
 #define VLOAD4_UNPACK8_HALF(tensor_ptr, offset) unpack8_half(VLOAD4(uvec4, tensor_ptr, offset))
 #define VSTORE4_PACK8_HALF(tensor_ptr, offset, data) VSTORE4(tensor_ptr, offset, pack8_half(data))
 #define VLOAD4_UNPACK8_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter) VLOAD4_UNPACK8_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
-- 
cgit v1.2.1